698 files changed, 19677 insertions, 12076 deletions
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 4d8e1ad020f0..d38da077905a 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -454,3 +454,19 @@ Description: disables it. Reads from the file return the current value. The default is "1" if the build-time "SUSPEND_SKIP_SYNC" config flag is unset, or "0" otherwise. + +What: /sys/power/hibernate_compression_threads +Date: October 2025 +Contact: <luoxueqin@kylinos.cn> +Description: + Controls the number of threads used for compression + and decompression of hibernation images. + + The value can be adjusted at runtime to balance + performance and CPU utilization. + + The change takes effect on the next hibernation or + resume operation. + + Minimum value: 1 + Default value: 3 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8c5636a120ee..2b465eab41a1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1907,6 +1907,16 @@ /sys/power/pm_test). Only available when CONFIG_PM_DEBUG is set. Default value is 5. + hibernate_compression_threads= + [HIBERNATION] + Set the number of threads used for compressing or decompressing + hibernation images. + + Format: <integer> + Default: 3 + Minimum: 1 + Example: hibernate_compression_threads=4 + highmem=nn[KMG] [KNL,BOOT,EARLY] forces the highmem zone to have an exact size of <nn>. This works even on boxes that have no highmem otherwise. This also works to reduce highmem diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 0c090b076224..be4c1120e3f0 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -580,6 +580,15 @@ the given CPU as the upper limit for the exit latency of the idle states that they are allowed to select for that CPU. They should never select any idle states with exit latency beyond that limit. +While the above CPU QoS constraints apply to CPU idle time management, user +space may also request a CPU system wakeup latency QoS limit via the +`cpu_wakeup_latency` file. This QoS constraint is taken into account both when +selecting a suitable idle state for the CPUs while entering the system-wide +suspend-to-idle sleep state and in regular CPU idle time management. + +Note that, from the user space point of view, the `cpu_wakeup_latency` file is +managed in the same way as the `cpu_dma_latency` file, and the unit is also +microseconds. Idle States Control Via Kernel Command Line =========================================== diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 26e702c7016e..fde967b0c2e0 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -48,8 +48,9 @@ only way to pass early-configuration-time parameters to it is via the kernel command line. However, its configuration can be adjusted via ``sysfs`` to a great extent. In some configurations it even is possible to unregister it via ``sysfs`` which allows another ``CPUFreq`` scaling driver to be loaded and -registered (see `below <status_attr_>`_). +registered (see :ref:`below <status_attr>`). +.. _operation_modes: Operation Modes =============== @@ -62,6 +63,8 @@ a certain performance scaling algorithm.
Which of them will be in effect depends on what kernel command line options are used and on the capabilities of the processor. +.. _active_mode: + Active Mode ----------- @@ -94,6 +97,8 @@ Which of the P-state selection algorithms is used by default depends on the Namely, if that option is set, the ``performance`` algorithm will be used by default, and the other one will be used by default if it is not set. +.. _active_mode_hwp: + Active Mode With HWP ~~~~~~~~~~~~~~~~~~~~ @@ -123,7 +128,7 @@ Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's internal P-state selection logic is expected to focus entirely on performance. This will override the EPP/EPB setting coming from the ``sysfs`` interface -(see `Energy vs Performance Hints`_ below). Moreover, any attempts to change +(see :ref:`energy_performance_hints` below). Moreover, any attempts to change the EPP/EPB to a value different from 0 ("performance") via ``sysfs`` in this configuration will be rejected. @@ -192,6 +197,8 @@ This is the default P-state selection algorithm if the :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option is not set. +.. _passive_mode: + Passive Mode ------------ @@ -289,12 +296,12 @@ Unlike ``_PSS`` objects in the ACPI tables, ``intel_pstate`` always exposes the entire range of available P-states, including the whole turbo range, to the ``CPUFreq`` core and (in the passive mode) to generic scaling governors. This generally causes turbo P-states to be set more often when ``intel_pstate`` is -used relative to ACPI-based CPU performance scaling (see `below <acpi-cpufreq_>`_ -for more information). +used relative to ACPI-based CPU performance scaling (see +:ref:`below <acpi-cpufreq>` for more information). Moreover, since ``intel_pstate`` always knows what the real turbo threshold is (even if the Configurable TDP feature is enabled in the processor), its -``no_turbo`` attribute in ``sysfs`` (described `below <no_turbo_attr_>`_) should +``no_turbo`` attribute in ``sysfs`` (described :ref:`below <no_turbo_attr>`) should work as expected in all cases (that is, if set to disable turbo P-states, it always should prevent ``intel_pstate`` from using them). @@ -307,12 +314,12 @@ pieces of information on it to be known, including: * The minimum supported P-state. - * The maximum supported `non-turbo P-state <turbo_>`_. + * The maximum supported :ref:`non-turbo P-state <turbo>`. * Whether or not turbo P-states are supported at all. - * The maximum supported `one-core turbo P-state <turbo_>`_ (if turbo P-states - are supported). + * The maximum supported :ref:`one-core turbo P-state <turbo>` (if turbo + P-states are supported). * The scaling formula to translate the driver's internal representation of P-states into frequencies and the other way around. @@ -400,10 +407,10 @@ Energy-Aware Scheduling Support If ``CONFIG_ENERGY_MODEL`` has been set during kernel configuration and ``intel_pstate`` runs on a hybrid processor without SMT, in addition to enabling -`CAS <CAS_>`_ it registers an Energy Model for the processor. This allows the +:ref:`CAS` it registers an Energy Model for the processor. This allows the Energy-Aware Scheduling (EAS) support to be enabled in the CPU scheduler if ``schedutil`` is used as the ``CPUFreq`` governor which requires ``intel_pstate`` -to operate in the `passive mode <Passive Mode_>`_. +to operate in the :ref:`passive mode <passive_mode>`. 
The Energy Model registered by ``intel_pstate`` is artificial (that is, it is based on abstract cost values and it does not include any real power numbers) @@ -432,6 +439,8 @@ the ``energy_model`` directory in ``debugfs`` (typically mounted on User Space Interface in ``sysfs`` ================================= +.. _global_attributes: + Global Attributes ----------------- @@ -444,8 +453,8 @@ argument is passed to the kernel in the command line. ``max_perf_pct`` Maximum P-state the driver is allowed to set in percent of the - maximum supported performance level (the highest supported `turbo - P-state <turbo_>`_). + maximum supported performance level (the highest supported :ref:`turbo + P-state <turbo>`). This attribute will not be exposed if the ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel @@ -453,8 +462,8 @@ argument is passed to the kernel in the command line. ``min_perf_pct`` Minimum P-state the driver is allowed to set in percent of the - maximum supported performance level (the highest supported `turbo - P-state <turbo_>`_). + maximum supported performance level (the highest supported :ref:`turbo + P-state <turbo>`). This attribute will not be exposed if the ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel @@ -463,18 +472,18 @@ argument is passed to the kernel in the command line. ``num_pstates`` Number of P-states supported by the processor (between 0 and 255 inclusive) including both turbo and non-turbo P-states (see - `Turbo P-states Support`_). + :ref:`turbo`). This attribute is present only if the value exposed by it is the same for all of the CPUs in the system. The value of this attribute is not affected by the ``no_turbo`` - setting described `below <no_turbo_attr_>`_. + setting described :ref:`below <no_turbo_attr>`. This attribute is read-only. ``turbo_pct`` - Ratio of the `turbo range <turbo_>`_ size to the size of the entire + Ratio of the :ref:`turbo range <turbo>` size to the size of the entire range of supported P-states, in percent. This attribute is present only if the value exposed by it is the same @@ -486,7 +495,7 @@ argument is passed to the kernel in the command line. ``no_turbo`` If set (equal to 1), the driver is not allowed to set any turbo P-states - (see `Turbo P-states Support`_). If unset (equal to 0, which is the + (see :ref:`turbo`). If unset (equal to 0, which is the default), turbo P-states can be set by the driver. [Note that ``intel_pstate`` does not support the general ``boost`` attribute (supported by some other scaling drivers) which is replaced @@ -495,11 +504,11 @@ argument is passed to the kernel in the command line. This attribute does not affect the maximum supported frequency value supplied to the ``CPUFreq`` core and exposed via the policy interface, but it affects the maximum possible value of per-policy P-state limits - (see `Interpretation of Policy Attributes`_ below for details). + (see :ref:`policy_attributes_interpretation` below for details). ``hwp_dynamic_boost`` This attribute is only present if ``intel_pstate`` works in the - `active mode with the HWP feature enabled <Active Mode With HWP_>`_ in + :ref:`active mode with the HWP feature enabled <active_mode_hwp>` in the processor. If set (equal to 1), it causes the minimum P-state limit to be increased dynamically for a short time whenever a task previously waiting on I/O is selected to run on a given logical CPU (the purpose @@ -514,12 +523,12 @@ argument is passed to the kernel in the command line.
Operation mode of the driver: "active", "passive" or "off". "active" - The driver is functional and in the `active mode - <Active Mode_>`_. + The driver is functional and in the :ref:`active mode + <active_mode>`. "passive" - The driver is functional and in the `passive mode - <Passive Mode_>`_. + The driver is functional and in the :ref:`passive mode + <passive_mode>`. "off" The driver is not functional (it is not registered as a scaling @@ -547,13 +556,15 @@ argument is passed to the kernel in the command line. attribute to "1" enables the energy-efficiency optimizations and setting to "0" disables them. +.. _policy_attributes_interpretation: + Interpretation of Policy Attributes ----------------------------------- The interpretation of some ``CPUFreq`` policy attributes described in Documentation/admin-guide/pm/cpufreq.rst is special with ``intel_pstate`` as the current scaling driver and it generally depends on the driver's -`operation mode <Operation Modes_>`_. +:ref:`operation mode <operation_modes>`. First of all, the values of the ``cpuinfo_max_freq``, ``cpuinfo_min_freq`` and ``scaling_cur_freq`` attributes are produced by applying a processor-specific @@ -562,9 +573,10 @@ Also, the values of the ``scaling_max_freq`` and ``scaling_min_freq`` attributes are capped by the frequency corresponding to the maximum P-state that the driver is allowed to set. -If the ``no_turbo`` `global attribute <no_turbo_attr_>`_ is set, the driver is -not allowed to use turbo P-states, so the maximum value of ``scaling_max_freq`` -and ``scaling_min_freq`` is limited to the maximum non-turbo P-state frequency. +If the ``no_turbo`` :ref:`global attribute <no_turbo_attr>` is set, the driver +is not allowed to use turbo P-states, so the maximum value of +``scaling_max_freq`` and ``scaling_min_freq`` is limited to the maximum +non-turbo P-state frequency. Accordingly, setting ``no_turbo`` causes ``scaling_max_freq`` and ``scaling_min_freq`` to go down to that value if they were above it before. However, the old values of ``scaling_max_freq`` and ``scaling_min_freq`` will be @@ -576,7 +588,7 @@ and ``scaling_min_freq`` corresponds to the maximum supported turbo P-state, which also is the value of ``cpuinfo_max_freq`` in either case. Next, the following policy attributes have special meaning if -``intel_pstate`` works in the `active mode <Active Mode_>`_: +``intel_pstate`` works in the :ref:`active mode <active_mode>`: ``scaling_available_governors`` List of P-state selection algorithms provided by ``intel_pstate``. @@ -597,20 +609,22 @@ processor: Shows the base frequency of the CPU. Any frequency above this will be in the turbo frequency range. -The meaning of these attributes in the `passive mode <Passive Mode_>`_ is the +The meaning of these attributes in the :ref:`passive mode <passive_mode>` is the same as for other scaling drivers. Additionally, the value of the ``scaling_driver`` attribute for ``intel_pstate`` depends on the operation mode of the driver. Namely, it is either -"intel_pstate" (in the `active mode <Active Mode_>`_) or "intel_cpufreq" (in the -`passive mode <Passive Mode_>`_). +"intel_pstate" (in the :ref:`active mode <active_mode>`) or "intel_cpufreq" +(in the :ref:`passive mode <passive_mode>`). + +.. 
_pstate_limits_coordination: Coordination of P-State Limits ------------------------------ ``intel_pstate`` allows P-state limits to be set in two ways: with the help of -the ``max_perf_pct`` and ``min_perf_pct`` `global attributes -<Global Attributes_>`_ or via the ``scaling_max_freq`` and ``scaling_min_freq`` +the ``max_perf_pct`` and ``min_perf_pct`` :ref:`global attributes +<global_attributes>` or via the ``scaling_max_freq`` and ``scaling_min_freq`` ``CPUFreq`` policy attributes. The coordination between those limits is based on the following rules, regardless of the current operation mode of the driver: @@ -632,17 +646,18 @@ on the following rules, regardless of the current operation mode of the driver: 3. The global and per-policy limits can be set independently. -In the `active mode with the HWP feature enabled <Active Mode With HWP_>`_, the +In the :ref:`active mode with the HWP feature enabled <active_mode_hwp>`, the resulting effective values are written into hardware registers whenever the limits change in order to request its internal P-state selection logic to always set P-states within these limits. Otherwise, the limits are taken into account -by scaling governors (in the `passive mode <Passive Mode_>`_) and by the driver -every time before setting a new P-state for a CPU. +by scaling governors (in the :ref:`passive mode <passive_mode>`) and by the +driver every time before setting a new P-state for a CPU. Additionally, if the ``intel_pstate=per_cpu_perf_limits`` command line argument is passed to the kernel, ``max_perf_pct`` and ``min_perf_pct`` are not exposed at all and the only way to set the limits is by using the policy attributes. +.. _energy_performance_hints: Energy vs Performance Hints --------------------------- @@ -702,9 +717,9 @@ output. On those systems each ``_PSS`` object returns a list of P-states supported by the corresponding CPU which basically is a subset of the P-states range that can be used by ``intel_pstate`` on the same system, with one exception: the whole -`turbo range <turbo_>`_ is represented by one item in it (the topmost one). By -convention, the frequency returned by ``_PSS`` for that item is greater by 1 MHz -than the frequency of the highest non-turbo P-state listed by it, but the +:ref:`turbo range <turbo>` is represented by one item in it (the topmost one). +By convention, the frequency returned by ``_PSS`` for that item is greater by +1 MHz than the frequency of the highest non-turbo P-state listed by it, but the corresponding P-state representation (following the hardware specification) returned for it matches the maximum supported turbo P-state (or is the special value 255 meaning essentially "go as high as you can get"). @@ -730,18 +745,18 @@ benefit from running at turbo frequencies will be given non-turbo P-states instead. One more issue related to that may appear on systems supporting the -`Configurable TDP feature <turbo_>`_ allowing the platform firmware to set the -turbo threshold. Namely, if that is not coordinated with the lists of P-states -returned by ``_PSS`` properly, there may be more than one item corresponding to -a turbo P-state in those lists and there may be a problem with avoiding the -turbo range (if desirable or necessary). Usually, to avoid using turbo -P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state listed -by ``_PSS``, but that is not sufficient when there are other turbo P-states in -the list returned by it. 
+:ref:`Configurable TDP feature <turbo>` allowing the platform firmware to set +the turbo threshold. Namely, if that is not coordinated with the lists of +P-states returned by ``_PSS`` properly, there may be more than one item +corresponding to a turbo P-state in those lists and there may be a problem with +avoiding the turbo range (if desirable or necessary). Usually, to avoid using +turbo P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state +listed by ``_PSS``, but that is not sufficient when there are other turbo +P-states in the list returned by it. Apart from the above, ``acpi-cpufreq`` works like ``intel_pstate`` in the -`passive mode <Passive Mode_>`_, except that the number of P-states it can set -is limited to the ones listed by the ACPI ``_PSS`` objects. +:ref:`passive mode <passive_mode>`, except that the number of P-states it can +set is limited to the ones listed by the ACPI ``_PSS`` objects. Kernel Command Line Options for ``intel_pstate`` @@ -756,11 +771,11 @@ of them have to be prepended with the ``intel_pstate=`` prefix. processor is supported by it. ``active`` - Register ``intel_pstate`` in the `active mode <Active Mode_>`_ to start - with. + Register ``intel_pstate`` in the :ref:`active mode <active_mode>` to + start with. ``passive`` - Register ``intel_pstate`` in the `passive mode <Passive Mode_>`_ to + Register ``intel_pstate`` in the :ref:`passive mode <passive_mode>` to start with. ``force`` @@ -793,12 +808,12 @@ of them have to be prepended with the ``intel_pstate=`` prefix. and this option has no effect. ``per_cpu_perf_limits`` - Use per-logical-CPU P-State limits (see `Coordination of P-state - Limits`_ for details). + Use per-logical-CPU P-State limits (see + :ref:`pstate_limits_coordination` for details). ``no_cas`` - Do not enable `capacity-aware scheduling <CAS_>`_ which is enabled by - default on hybrid systems without SMT. + Do not enable :ref:`capacity-aware scheduling <CAS>` which is enabled + by default on hybrid systems without SMT. Diagnostics and Tuning ====================== @@ -810,7 +825,7 @@ There are two static trace events that can be used for ``intel_pstate`` diagnostics. One of them is the ``cpu_frequency`` trace event generally used by ``CPUFreq``, and the other one is the ``pstate_sample`` trace event specific to ``intel_pstate``. Both of them are triggered by ``intel_pstate`` only if -it works in the `active mode <Active Mode_>`_. +it works in the :ref:`active mode <active_mode>`. The following sequence of shell commands can be used to enable them and see their output (if the kernel is generally configured to support event tracing):: @@ -822,7 +837,7 @@ their output (if the kernel is generally configured to support event tracing):: gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107 scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 freq=2474476 cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2 -If ``intel_pstate`` works in the `passive mode <Passive Mode_>`_, the +If ``intel_pstate`` works in the :ref:`passive mode <passive_mode>`, the ``cpu_frequency`` trace event will be triggered either by the ``schedutil`` scaling governor (for the policies it is attached to), or by the ``CPUFreq`` core (for the policies with other scaling governors). 
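As an illustration of the ``sysfs`` interface documented above, consider a minimal user-space sketch that queries the driver's operation mode and then disables turbo P-states. It is illustrative only and not part of the patch; it assumes ``intel_pstate`` is the registered scaling driver and that the global attributes live under /sys/devices/system/cpu/intel_pstate/::

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char mode[16] = "";
        int fd;

        /* Reads back "active", "passive" or "off", as described above. */
        fd = open("/sys/devices/system/cpu/intel_pstate/status", O_RDONLY);
        if (fd < 0)
            return 1;
        read(fd, mode, sizeof(mode) - 1);
        close(fd);
        printf("intel_pstate status: %s", mode);

        /* Writing "1" forbids turbo P-states; "0" (the default) allows them. */
        fd = open("/sys/devices/system/cpu/intel_pstate/no_turbo", O_WRONLY);
        if (fd < 0)
            return 1;
        if (write(fd, "1", 1) != 1)
            perror("no_turbo");
        close(fd);
        return 0;
    }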
diff --git a/Documentation/admin-guide/thermal/index.rst b/Documentation/admin-guide/thermal/index.rst index 193b7b01a87d..e48bc0a1951b 100644 --- a/Documentation/admin-guide/thermal/index.rst +++ b/Documentation/admin-guide/thermal/index.rst @@ -6,3 +6,4 @@ Thermal Subsystem :maxdepth: 1 intel_powerclamp + intel_thermal_throttle diff --git a/Documentation/admin-guide/thermal/intel_thermal_throttle.rst b/Documentation/admin-guide/thermal/intel_thermal_throttle.rst new file mode 100644 index 000000000000..f4fbf9d5a4ec --- /dev/null +++ b/Documentation/admin-guide/thermal/intel_thermal_throttle.rst @@ -0,0 +1,91 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: <isonum.txt> + +======================================= +Intel thermal throttle events reporting +======================================= + +:Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> + +Introduction +------------ + +Intel processors have built-in automatic and adaptive thermal monitoring +mechanisms that force the processor to reduce its power consumption in order +to operate within predetermined temperature limits. + +Refer to section "THERMAL MONITORING AND PROTECTION" in the "Intel® 64 and +IA-32 Architectures Software Developer’s Manual Volume 3 (3A, 3B, 3C, & 3D): +System Programming Guide" for more details. + +In general, there are two mechanisms to control the core temperature of the +processor. They are called "Thermal Monitor 1" (TM1) and "Thermal Monitor 2" (TM2). + +The status of the temperature sensor that triggers the thermal monitor (TM1/TM2) +is indicated through the "thermal status flag" and "thermal status log flag" in +MSR_IA32_THERM_STATUS for core level and MSR_IA32_PACKAGE_THERM_STATUS for +package level. + +Thermal Status flag, bit 0 — When set, indicates that the processor core +temperature is currently at the trip temperature of the thermal monitor and that +the processor power consumption is being reduced via either TM1 or TM2, depending +on which is enabled. When clear, the flag indicates that the core temperature is +below the thermal monitor trip temperature. This flag is read only. + +Thermal Status Log flag, bit 1 — When set, indicates that the thermal sensor has +tripped since the last power-up or reset or since the last time that software +cleared this flag. This flag is a sticky bit; once set it remains set until +cleared by software or until a power-up or reset of the processor. The default +state is clear. + +It is possible that when a user reads MSR_IA32_THERM_STATUS or +MSR_IA32_PACKAGE_THERM_STATUS, TM1/TM2 is not active. In this case, +"Thermal Status flag" will read "0" and the "Thermal Status Log flag" will be set +to show any previous "TM1/TM2" activation. But since it needs to be cleared by +the software, it can't show the number of occurrences of "TM1/TM2" activations. + +Hence, Linux provides counters of how many times the "Thermal Status flag" was +set. It also reports how long the "Thermal Status flag" was active in milliseconds. +Using these counters, users can check if the performance was limited because of +thermal events. It is recommended to read from sysfs instead of directly reading +MSRs as the "Thermal Status Log flag" is reset by the driver to implement rate +control. + +Sysfs Interface +--------------- + +Thermal throttling events are presented for each CPU under +"/sys/devices/system/cpu/cpuX/thermal_throttle/", where "X" is the CPU number. + +All these counters are read-only. They can't be reset to 0. So, they can potentially +overflow after reaching the maximum 64-bit unsigned integer value. + +``core_throttle_count`` + Shows the number of times "Thermal Status flag" changed from 0 to 1 for this + CPU since OS boot and thermal vector initialization. This is a 64-bit counter. + +``package_throttle_count`` + Shows the number of times "Thermal Status flag" changed from 0 to 1 for the + package containing this CPU since OS boot and thermal vector initialization. + Package status is broadcast to all CPUs; all CPUs in the package increment + this count. This is a 64-bit counter. + +``core_throttle_max_time_ms`` + Shows the maximum amount of time for which "Thermal Status flag" has been + set to 1 for this CPU at the core level since OS boot and thermal vector + initialization. + +``package_throttle_max_time_ms`` + Shows the maximum amount of time for which "Thermal Status flag" has been + set to 1 for the package containing this CPU since OS boot and thermal + vector initialization. + +``core_throttle_total_time_ms`` + Shows the cumulative time for which "Thermal Status flag" has been + set to 1 for this CPU at the core level since OS boot and thermal vector + initialization. + +``package_throttle_total_time_ms`` + Shows the cumulative time for which "Thermal Status flag" has been set + to 1 for the package containing this CPU since OS boot and thermal vector + initialization. diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index e4f953839f71..26efca09aef3 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -391,13 +391,13 @@ Before jumping into the kernel, the following conditions must be met: - SMCR_EL2.LEN must be initialised to the same value for all CPUs the kernel will execute on. - - HWFGRTR_EL2.nTPIDR2_EL0 (bit 55) must be initialised to 0b01. + - HFGRTR_EL2.nTPIDR2_EL0 (bit 55) must be initialised to 0b01. - - HWFGWTR_EL2.nTPIDR2_EL0 (bit 55) must be initialised to 0b01. + - HFGWTR_EL2.nTPIDR2_EL0 (bit 55) must be initialised to 0b01. - - HWFGRTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b01. + - HFGRTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b01. - - HWFGWTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b01. + - HFGWTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b01. For CPUs with the Scalable Matrix Extension FA64 feature (FEAT_SME_FA64): diff --git a/Documentation/arch/arm64/sve.rst b/Documentation/arch/arm64/sve.rst index 28152492c29c..a61c9d0efe4d 100644 --- a/Documentation/arch/arm64/sve.rst +++ b/Documentation/arch/arm64/sve.rst @@ -402,6 +402,11 @@ The regset data starts with struct user_sve_header, containing: streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode if the target was not in streaming mode. +* On systems that do not support SVE it is permitted to use SETREGSET to + write SVE_PT_REGS_FPSIMD formatted data via NT_ARM_SVE; in this case the + vector length should be specified as 0. This allows streaming mode to be + disabled on systems with SME but not SVE. + * If any register data is provided along with SVE_PT_VL_ONEXEC then the registers data will be interpreted with the current vector length, not the vector length configured for use on exec.
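As a companion to the ``thermal_throttle`` interface documented in intel_thermal_throttle.rst above, a minimal user-space sketch that prints the per-core throttle counters could look as follows; the sysfs path comes from that document, while the CPU loop bound and error handling are illustrative only::

    #include <stdio.h>

    int main(void)
    {
        for (int cpu = 0; cpu < 4; cpu++) {
            char path[128];
            unsigned long long count;
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/cpu/cpu%d/thermal_throttle/core_throttle_count",
                     cpu);
            f = fopen(path, "r");
            if (!f)
                continue; /* CPU absent or counters not exposed */
            if (fscanf(f, "%llu", &count) == 1)
                printf("cpu%d: core throttled %llu times\n", cpu, count);
            fclose(f);
        }
        return 0;
    }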
diff --git a/Documentation/arch/s390/s390dbf.rst b/Documentation/arch/s390/s390dbf.rst index af8bdc3629e7..aad6d88974fe 100644 --- a/Documentation/arch/s390/s390dbf.rst +++ b/Documentation/arch/s390/s390dbf.rst @@ -243,9 +243,8 @@ Examples: Changing the size of debug areas ------------------------------------ -It is possible the change the size of debug areas through piping -the number of pages to the debugfs file "pages". The resize request will -also flush the debug areas. +To resize a debug area, write the desired page count to the "pages" file. +Existing data is preserved if it fits; otherwise, the oldest entries are dropped. Example: diff --git a/Documentation/crypto/index.rst b/Documentation/crypto/index.rst index 100b47d049c0..4ee667c446f9 100644 --- a/Documentation/crypto/index.rst +++ b/Documentation/crypto/index.rst @@ -27,3 +27,4 @@ for cryptographic use cases, as well as programming examples. descore-readme device_drivers/index krb5 + sha3 diff --git a/Documentation/crypto/sha3.rst b/Documentation/crypto/sha3.rst new file mode 100644 index 000000000000..37640f295118 --- /dev/null +++ b/Documentation/crypto/sha3.rst @@ -0,0 +1,130 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +========================== +SHA-3 Algorithm Collection +========================== + +.. contents:: + +Overview +======== + +The SHA-3 family of algorithms, as specified in NIST FIPS-202 [1]_, contains six +algorithms based on the Keccak sponge function. The differences between them +are: the "rate" (how much of the state buffer gets updated with new data between +invocations of the Keccak function, which is analogous to the "block size"), what +domain separation suffix gets appended to the input data, and how much output +data is extracted at the end. The Keccak sponge function is designed such that +arbitrary amounts of output can be obtained for certain algorithms. + +Four digest algorithms are provided: + + - SHA3-224 + - SHA3-256 + - SHA3-384 + - SHA3-512 + +Additionally, two Extendable-Output Functions (XOFs) are provided: + + - SHAKE128 + - SHAKE256 + +The SHA-3 library API supports all six of these algorithms. The four digest +algorithms are also supported by the crypto_shash and crypto_ahash APIs. + +This document describes the SHA-3 library API. + + +Digests +======= + +The following functions compute SHA-3 digests:: + + void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]); + void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]); + void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]); + void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]); + +For users that need to pass in data incrementally, an incremental API is also +provided. The incremental API uses the following struct:: + + struct sha3_ctx { ... }; + +Initialization is done with one of:: + + void sha3_224_init(struct sha3_ctx *ctx); + void sha3_256_init(struct sha3_ctx *ctx); + void sha3_384_init(struct sha3_ctx *ctx); + void sha3_512_init(struct sha3_ctx *ctx); + +Input data is then added with any number of calls to:: + + void sha3_update(struct sha3_ctx *ctx, const u8 *in, size_t in_len); + +Finally, the digest is generated using:: + + void sha3_final(struct sha3_ctx *ctx, u8 *out); + +which also zeroizes the context. The length of the digest is determined by the +initialization function that was called.
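Before the XOFs are described below, here is a minimal in-kernel sketch of the incremental digest API documented in sha3.rst above. The wrapper function and its buffer names are hypothetical; it assumes the SHA-3 library is available to the calling code::

    #include <crypto/sha3.h>

    /* Hash a header and a body into a single SHA3-256 digest. */
    static void demo_sha3_256(const u8 *hdr, size_t hdr_len,
                              const u8 *body, size_t body_len,
                              u8 digest[SHA3_256_DIGEST_SIZE])
    {
        struct sha3_ctx ctx;

        sha3_256_init(&ctx);
        sha3_update(&ctx, hdr, hdr_len);   /* input may arrive in pieces */
        sha3_update(&ctx, body, body_len);
        sha3_final(&ctx, digest);          /* also zeroizes the context */
    }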
+ + +Extendable-Output Functions +=========================== + +The following functions compute the SHA-3 extendable-output functions (XOFs):: + + void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len); + void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len); + +For users that need to provide the input data incrementally and/or receive the +output data incrementally, an incremental API is also provided. The incremental +API uses the following struct:: + + struct shake_ctx { ... }; + +Initialization is done with one of:: + + void shake128_init(struct shake_ctx *ctx); + void shake256_init(struct shake_ctx *ctx); + +Input data is then added with any number of calls to:: + + void shake_update(struct shake_ctx *ctx, const u8 *in, size_t in_len); + +Finally, the output data is extracted with any number of calls to:: + + void shake_squeeze(struct shake_ctx *ctx, u8 *out, size_t out_len); + +with each call specifying how much data should be extracted. Note that performing multiple +squeezes, with the output laid consecutively in a buffer, gets exactly the same +output as doing a single squeeze for the combined amount over the same buffer. + +More input data cannot be added after squeezing has started. + +Once all the desired output has been extracted, zeroize the context:: + + void shake_zeroize_ctx(struct shake_ctx *ctx); + + +Testing +======= + +To test the SHA-3 code, use sha3_kunit (CONFIG_CRYPTO_LIB_SHA3_KUNIT_TEST). + +Since the SHA-3 algorithms are FIPS-approved, when the kernel is booted in FIPS +mode the SHA-3 library also performs a simple self-test. This is purely to meet +a FIPS requirement. Normal testing done by kernel developers and integrators +should use the much more comprehensive KUnit test suite instead. + + +References +========== + +.. [1] https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf + + +API Function Reference +====================== + +..
kernel-doc:: include/crypto/sha3.h diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml index d2e578d6b83b..103e4aec2439 100644 --- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml @@ -14,6 +14,7 @@ properties: oneOf: - enum: - fsl,imx8-ddr-pmu + - fsl,imx8dxl-db-pmu - fsl,imx8m-ddr-pmu - fsl,imx8mq-ddr-pmu - fsl,imx8mm-ddr-pmu @@ -28,7 +29,10 @@ properties: - fsl,imx8mp-ddr-pmu - const: fsl,imx8m-ddr-pmu - items: - - const: fsl,imx8dxl-ddr-pmu + - enum: + - fsl,imx8dxl-ddr-pmu + - fsl,imx8qm-ddr-pmu + - fsl,imx8qxp-ddr-pmu - const: fsl,imx8-ddr-pmu - items: - enum: @@ -43,6 +47,14 @@ properties: interrupts: maxItems: 1 + clocks: + maxItems: 2 + + clock-names: + items: + - const: ipg + - const: cnt + required: - compatible - reg @@ -50,6 +62,21 @@ required: additionalProperties: false +allOf: + - if: + properties: + compatible: + contains: + const: fsl,imx8dxl-db-pmu + then: + required: + - clocks + - clock-names + else: + properties: + clocks: false + clock-names: false + examples: - | #include <dt-bindings/interrupt-controller/arm-gic.h> diff --git a/Documentation/devicetree/bindings/thermal/fsl,imx91-tmu.yaml b/Documentation/devicetree/bindings/thermal/fsl,imx91-tmu.yaml new file mode 100644 index 000000000000..7fd1a86d7287 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/fsl,imx91-tmu.yaml @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/fsl,imx91-tmu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP i.MX91 Thermal + +maintainers: + - Pengfei Li <pengfei.li_1@nxp.com> + +description: + i.MX91 features a new temperature sensor. It includes programmable + temperature threshold comparators for both normal and privileged + accesses and allows a programmable measurement frequency for the + Periodic One-Shot Measurement mode. Additionally, it provides + status registers for indicating the end of measurement and threshold + violation events. + +properties: + compatible: + items: + - const: fsl,imx91-tmu + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + interrupts: + items: + - description: Comparator 1 irq + - description: Comparator 2 irq + - description: Data ready irq + + interrupt-names: + items: + - const: thr1 + - const: thr2 + - const: ready + + nvmem-cells: + items: + - description: Phandle to the trim control 1 provided by ocotp + - description: Phandle to the trim control 2 provided by ocotp + + nvmem-cell-names: + items: + - const: trim1 + - const: trim2 + + "#thermal-sensor-cells": + const: 0 + +required: + - compatible + - reg + - clocks + - interrupts + - interrupt-names + +allOf: + - $ref: thermal-sensor.yaml + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/arm-gic.h> + #include <dt-bindings/clock/imx93-clock.h> + + thermal-sensor@44482000 { + compatible = "fsl,imx91-tmu"; + reg = <0x44482000 0x1000>; + #thermal-sensor-cells = <0>; + clocks = <&clk IMX93_CLK_TMC_GATE>; + interrupt-parent = <&gic>; + interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 85 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "thr1", "thr2", "ready"; + nvmem-cells = <&tmu_trim1>, <&tmu_trim2>; + nvmem-cell-names = "trim1", "trim2"; + }; + +... 
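Returning to the SHA-3 library documented in sha3.rst above, a minimal sketch of the incremental SHAKE interface is shown here; the helper and its 32-byte key sizes are hypothetical, and it again assumes the library is available to the caller::

    #include <crypto/sha3.h>

    /* Expand one seed into two independent sub-keys using SHAKE256. */
    static void demo_derive_keys(const u8 *seed, size_t seed_len,
                                 u8 enc_key[32], u8 mac_key[32])
    {
        struct shake_ctx ctx;

        shake256_init(&ctx);
        shake_update(&ctx, seed, seed_len);
        /* No more input may be added once squeezing has started. */
        shake_squeeze(&ctx, enc_key, 32);
        shake_squeeze(&ctx, mac_key, 32);
        shake_zeroize_ctx(&ctx);
    }

Because consecutive squeezes are equivalent to a single squeeze of the combined length, the two keys are simply successive 32-byte slices of the same XOF output stream.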
diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index 78e2f6573b96..921b6172d6f0 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -36,10 +36,15 @@ properties: - qcom,msm8974-tsens - const: qcom,tsens-v0_1 + - description: + v1 of TSENS without RPM, which needs to be explicitly reset + and enabled in the driver. + enum: + - qcom,ipq5018-tsens + - description: v1 of TSENS items: - enum: - - qcom,ipq5018-tsens - qcom,msm8937-tsens - qcom,msm8956-tsens - qcom,msm8976-tsens @@ -50,11 +55,13 @@ properties: items: - enum: - qcom,glymur-tsens + - qcom,kaanapali-tsens - qcom,milos-tsens - qcom,msm8953-tsens - qcom,msm8996-tsens - qcom,msm8998-tsens - qcom,qcm2290-tsens + - qcom,qcs8300-tsens - qcom,qcs615-tsens - qcom,sa8255p-tsens - qcom,sa8775p-tsens diff --git a/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml b/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml index 8d3f3c24f0f2..befdc8b7a082 100644 --- a/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml +++ b/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml @@ -16,7 +16,11 @@ description: properties: compatible: - const: renesas,r9a09g047-tsu + oneOf: + - const: renesas,r9a09g047-tsu # RZ/G3E + - items: + - const: renesas,r9a09g057-tsu # RZ/V2H + - const: renesas,r9a09g047-tsu # RZ/G3E reg: maxItems: 1 diff --git a/Documentation/driver-api/thermal/intel_dptf.rst b/Documentation/driver-api/thermal/intel_dptf.rst index c51ac793dc06..916bf0f36a03 100644 --- a/Documentation/driver-api/thermal/intel_dptf.rst +++ b/Documentation/driver-api/thermal/intel_dptf.rst @@ -409,3 +409,26 @@ based on the processor generation. Limit 1 from being exhausted. 4 – Unknown: Can't classify. + + On processors starting from Panther Lake, additional hints are provided. + The hardware analyzes workload residencies over an extended period to + determine whether the workload classification tends toward idle/battery + life states or sustained/performance states. Based on this long-term + analysis, it classifies: + + Power Classification: If the workload exhibits more idle or battery life + residencies, it is classified as "power". + + Performance Classification: If the workload exhibits more sustained or + performance residencies, it is classified as "performance". + + This approach enables applications to ignore short-term workload + fluctuations and instead respond to longer-term power vs. performance + trends. + + Residency thresholds for this classification are CPU generation-specific. + Classification is reported via bit 4 of the workload_type_index: + + Bit 4 = 1: Power classification + + Bit 4 = 0: Performance classification diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 696a5844bfa3..70af896822e1 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -450,9 +450,7 @@ API, but the filenames mode still does.
- CONFIG_CRYPTO_HCTR2 - Recommended: - arm64: CONFIG_CRYPTO_AES_ARM64_CE_BLK - - arm64: CONFIG_CRYPTO_POLYVAL_ARM64_CE - x86: CONFIG_CRYPTO_AES_NI_INTEL - - x86: CONFIG_CRYPTO_POLYVAL_CLMUL_NI - Adiantum - Mandatory: diff --git a/Documentation/netlink/specs/em.yaml b/Documentation/netlink/specs/em.yaml new file mode 100644 index 000000000000..9905ca482325 --- /dev/null +++ b/Documentation/netlink/specs/em.yaml @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) + +name: em + +doc: | + Energy model netlink interface to notify its changes. + +protocol: genetlink + +uapi-header: linux/energy_model.h + +attribute-sets: + - + name: pds + attributes: + - + name: pd + type: nest + nested-attributes: pd + multi-attr: true + - + name: pd + attributes: + - + name: pad + type: pad + - + name: pd-id + type: u32 + - + name: flags + type: u64 + - + name: cpus + type: string + - + name: pd-table + attributes: + - + name: pd-id + type: u32 + - + name: ps + type: nest + nested-attributes: ps + multi-attr: true + - + name: ps + attributes: + - + name: pad + type: pad + - + name: performance + type: u64 + - + name: frequency + type: u64 + - + name: power + type: u64 + - + name: cost + type: u64 + - + name: flags + type: u64 + +operations: + list: + - + name: get-pds + attribute-set: pds + doc: Get the list of information for all performance domains. + do: + reply: + attributes: + - pd + - + name: get-pd-table + attribute-set: pd-table + doc: Get the energy model table of a performance domain. + do: + request: + attributes: + - pd-id + reply: + attributes: + - pd-id + - ps + - + name: pd-created + doc: A performance domain is created. + notify: get-pd-table + mcgrp: event + - + name: pd-updated + doc: A performance domain is updated. + notify: get-pd-table + mcgrp: event + - + name: pd-deleted + doc: A performance domain is deleted. + attribute-set: pd-table + event: + attributes: + - pd-id + mcgrp: event + +mcast-groups: + list: + - + name: event diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst index a0f5244fb427..ea70633d9ce6 100644 --- a/Documentation/power/index.rst +++ b/Documentation/power/index.rst @@ -19,6 +19,7 @@ Power Management power_supply_class runtime_pm s2ram + shutdown-debugging suspend-and-cpuhotplug suspend-and-interrupts swsusp-and-swap-files diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst index 5019c79c7710..4c008e2202f0 100644 --- a/Documentation/power/pm_qos_interface.rst +++ b/Documentation/power/pm_qos_interface.rst @@ -55,7 +55,8 @@ int cpu_latency_qos_request_active(handle): From user space: -The infrastructure exposes one device node, /dev/cpu_dma_latency, for the CPU +The infrastructure exposes two separate device nodes, /dev/cpu_dma_latency for +the CPU latency QoS and /dev/cpu_wakeup_latency for the CPU system wakeup latency QoS. Only processes can register a PM QoS request. To provide for automatic @@ -63,15 +64,15 @@ cleanup of a process, the interface requires the process to register its parameter requests as follows. To register the default PM QoS target for the CPU latency QoS, the process must -open /dev/cpu_dma_latency. +open /dev/cpu_dma_latency. To register a CPU system wakeup QoS limit, the +process must open /dev/cpu_wakeup_latency. As long as the device node is held open that process has a registered request on the parameter. To change the requested target value, the process needs to write an s32 value to the open device node. 
Alternatively, it can write a hex string for the value -using the 10 char long format e.g. "0x12345678". This translates to a -cpu_latency_qos_update_request() call. +using the 10 char long format e.g. "0x12345678". To remove the user mode request for a target value simply close the device node. diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index c8dbdb8595e5..8246df3cecd7 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -480,16 +480,6 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: `bool pm_runtime_status_suspended(struct device *dev);` - return true if the device's runtime PM status is 'suspended' - `void pm_runtime_allow(struct device *dev);` - - set the power.runtime_auto flag for the device and decrease its usage - counter (used by the /sys/devices/.../power/control interface to - effectively allow the device to be power managed at run time) - - `void pm_runtime_forbid(struct device *dev);` - - unset the power.runtime_auto flag for the device and increase its usage - counter (used by the /sys/devices/.../power/control interface to - effectively prevent the device from being power managed at run time) - `void pm_runtime_no_callbacks(struct device *dev);` - set the power.no_callbacks flag for the device and remove the runtime PM attributes from /sys/devices/.../power (or prevent them from being diff --git a/Documentation/power/shutdown-debugging.rst b/Documentation/power/shutdown-debugging.rst new file mode 100644 index 000000000000..c510122e0bbc --- /dev/null +++ b/Documentation/power/shutdown-debugging.rst @@ -0,0 +1,53 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Debugging Kernel Shutdown Hangs with pstore ++++++++++++++++++++++++++++++++++++++++++++ + +Overview +======== +If the system hangs while shutting down, the kernel logs may need to be +retrieved to debug the issue. + +On systems that have a UART available, it is best to configure the kernel to use +this UART for kernel console output. + +If a UART isn't available, the ``pstore`` subsystem provides a mechanism to +persist this data across a system reset, allowing it to be retrieved on the next +boot. + +Kernel Configuration +==================== +To enable ``pstore`` and the saving of kernel ring buffer logs, set the +following kernel configuration options: + +* ``CONFIG_PSTORE=y`` +* ``CONFIG_PSTORE_CONSOLE=y`` + +Additionally, enable a backend to store the data. Depending upon your platform, +some potential options include: + +* ``CONFIG_EFI_VARS_PSTORE=y`` +* ``CONFIG_PSTORE_RAM=y`` +* ``CONFIG_CHROMEOS_PSTORE=y`` +* ``CONFIG_PSTORE_BLK=y`` + +Kernel Command-line Parameters +============================== +Add these parameters to your kernel command line: + +* ``printk.always_kmsg_dump=Y`` + * Forces the kernel to dump the entire message buffer to pstore during + shutdown +* ``efi_pstore.pstore_disable=N`` + * For EFI-based systems, ensures the EFI backend is active + +Userspace Interaction and Log Retrieval +======================================= +On the next boot after a hang, pstore logs will be available in the pstore +filesystem (``/sys/fs/pstore``) and can be retrieved by userspace. + +On systemd systems, the ``systemd-pstore`` service will do the following: + +#. Locate pstore data in ``/sys/fs/pstore`` +#. Read and save it to ``/var/lib/systemd/pstore`` +#.
Clear pstore data for the next event diff --git a/MAINTAINERS b/MAINTAINERS index e8ad048bf35d..472dc58de40d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9188,6 +9188,9 @@ S: Maintained F: kernel/power/energy_model.c F: include/linux/energy_model.h F: Documentation/power/energy-model.rst +F: Documentation/netlink/specs/em.yaml +F: include/uapi/linux/energy_model.h +F: kernel/power/em_netlink*.* EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER M: Laurentiu Tudor <laurentiu.tudor@nxp.com> @@ -17470,6 +17473,16 @@ S: Maintained F: Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml F: drivers/video/backlight/mp3309c.c +MPAM DRIVER +M: James Morse <james.morse@arm.com> +M: Ben Horgan <ben.horgan@arm.com> +R: Reinette Chatre <reinette.chatre@intel.com> +R: Fenghua Yu <fenghuay@nvidia.com> +S: Maintained +F: drivers/resctrl/mpam_* +F: drivers/resctrl/test_mpam_* +F: include/linux/arm_mpam.h + MPS MP2869 DRIVER M: Wensheng Wang <wenswang@yeah.net> L: linux-hwmon@vger.kernel.org diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index c436eec22d86..f30d743df264 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -33,22 +33,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm using: - NEON (Advanced SIMD) extensions -config CRYPTO_BLAKE2B_NEON - tristate "Hash functions: BLAKE2b (NEON)" - depends on KERNEL_MODE_NEON - select CRYPTO_BLAKE2B - help - BLAKE2b cryptographic hash function (RFC 7693) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - - BLAKE2b digest algorithm optimized with ARM NEON instructions. - On ARM processors that have NEON support but not the ARMv8 - Crypto Extensions, typically this BLAKE2b implementation is - much faster than the SHA-2 family and slightly faster than - SHA-1. - config CRYPTO_AES_ARM tristate "Ciphers: AES" select CRYPTO_ALGAPI diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 6346a73effc0..86dd43313dbf 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o -obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o @@ -13,7 +12,6 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o aes-arm-y := aes-cipher-core.o aes-cipher-glue.o aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o -blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c deleted file mode 100644 index 2ff443a91724..000000000000 --- a/arch/arm/crypto/blake2b-neon-glue.c +++ /dev/null @@ -1,104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * BLAKE2b digest algorithm, NEON accelerated - * - * Copyright 2020 Google LLC - */ - -#include <crypto/internal/blake2b.h> -#include <crypto/internal/hash.h> - -#include <linux/module.h> -#include <linux/sizes.h> - -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void blake2b_compress_neon(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc); - -static void blake2b_compress_arch(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc) -{ - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2B_BLOCK_SIZE); - - kernel_neon_begin(); - 
blake2b_compress_neon(state, block, blocks, inc); - kernel_neon_end(); - - nblocks -= blocks; - block += blocks * BLAKE2B_BLOCK_SIZE; - } while (nblocks); -} - -static int crypto_blake2b_update_neon(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch); -} - -static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out) -{ - return crypto_blake2b_finup(desc, in, inlen, out, - blake2b_compress_arch); -} - -#define BLAKE2B_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 200, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY | \ - CRYPTO_AHASH_ALG_BLOCK_ONLY | \ - CRYPTO_AHASH_ALG_FINAL_NONZERO, \ - .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2b_setkey, \ - .init = crypto_blake2b_init, \ - .update = crypto_blake2b_update_neon, \ - .finup = crypto_blake2b_finup_neon, \ - .descsize = sizeof(struct blake2b_state), \ - .statesize = BLAKE2B_STATE_SIZE, \ - } - -static struct shash_alg blake2b_neon_algs[] = { - BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE), - BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE), - BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE), - BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE), -}; - -static int __init blake2b_neon_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - - return crypto_register_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); -} - -static void __exit blake2b_neon_mod_exit(void) -{ - crypto_unregister_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); -} - -module_init(blake2b_neon_mod_init); -module_exit(blake2b_neon_mod_exit); - -MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated"); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); -MODULE_ALIAS_CRYPTO("blake2b-160"); -MODULE_ALIAS_CRYPTO("blake2b-160-neon"); -MODULE_ALIAS_CRYPTO("blake2b-256"); -MODULE_ALIAS_CRYPTO("blake2b-256-neon"); -MODULE_ALIAS_CRYPTO("blake2b-384"); -MODULE_ALIAS_CRYPTO("blake2b-384-neon"); -MODULE_ALIAS_CRYPTO("blake2b-512"); -MODULE_ALIAS_CRYPTO("blake2b-512-neon"); diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h index be08a8da046f..8549fa8b7253 100644 --- a/arch/arm/include/asm/simd.h +++ b/arch/arm/include/asm/simd.h @@ -2,14 +2,21 @@ #ifndef _ASM_SIMD_H #define _ASM_SIMD_H +#include <linux/cleanup.h> #include <linux/compiler_attributes.h> #include <linux/preempt.h> #include <linux/types.h> +#include <asm/neon.h> + static __must_check inline bool may_use_simd(void) { return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq() && !irqs_disabled(); } +DEFINE_LOCK_GUARD_0(ksimd, kernel_neon_begin(), kernel_neon_end()) + +#define scoped_ksimd() scoped_guard(ksimd) + #endif /* _ASM_SIMD_H */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6663ffd23f25..65db12f66b8f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -47,7 +47,6 @@ config ARM64 select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY - select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_STACKWALK select ARCH_HAS_STRICT_KERNEL_RWX @@ -2023,6 +2022,31 @@ config ARM64_TLB_RANGE ARMv8.4-TLBI provides 
TLBI invalidation instruction that apply to a range of input addresses. +config ARM64_MPAM + bool "Enable support for MPAM" + select ARM64_MPAM_DRIVER if EXPERT # does nothing yet + select ACPI_MPAM if ACPI + help + Memory System Resource Partitioning and Monitoring (MPAM) is an + optional extension to the Arm architecture that allows each + transaction issued to the memory system to be labelled with a + Partition identifier (PARTID) and Performance Monitoring Group + identifier (PMG). + + Memory system components, such as the caches, can be configured with + policies to control how much of various physical resources (such as + memory bandwidth or cache memory) the transactions labelled with each + PARTID can consume. Depending on the capabilities of the hardware, + the PARTID and PMG can also be used as filtering criteria to measure + the memory system resource consumption of different parts of a + workload. + + Use of this extension requires CPU support, support in the + Memory System Components (MSC), and a description from firmware + of where the MSCs are in the address space. + + MPAM is exposed to user-space via the resctrl pseudo filesystem. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 1a48faad2473..997fa7cd9de5 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1783,10 +1783,10 @@ CONFIG_CRYPTO_CHACHA20=m CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_GHASH_ARM64_CE=y -CONFIG_CRYPTO_SHA3_ARM64=m CONFIG_CRYPTO_SM3_ARM64_CE=m CONFIG_CRYPTO_AES_ARM64_CE_BLK=y CONFIG_CRYPTO_AES_ARM64_BS=m diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 91f3093eee6a..bdd276a6e540 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -25,17 +25,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm64 using: - NEON (Advanced SIMD) extensions -config CRYPTO_SHA3_ARM64 - tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_SHA3 - help - SHA-3 secure hash algorithms (FIPS 202) - - Architecture: arm64 using: - - ARMv8.2 Crypto Extensions - config CRYPTO_SM3_NEON tristate "Hash functions: SM3 (NEON)" depends on KERNEL_MODE_NEON @@ -58,16 +47,6 @@ config CRYPTO_SM3_ARM64_CE Architecture: arm64 using: - ARMv8.2 Crypto Extensions -config CRYPTO_POLYVAL_ARM64_CE - tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_POLYVAL - help - POLYVAL hash function for HCTR2 - - Architecture: arm64 using: - - ARMv8 Crypto Extensions - config CRYPTO_AES_ARM64 tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS" select CRYPTO_AES diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index a8b2cdbe202c..1e330aa08d3f 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -5,9 +5,6 @@ # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> # -obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o -sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o - obj-$(CONFIG_CRYPTO_SM3_NEON) += sm3-neon.o sm3-neon-y := sm3-neon-glue.o sm3-neon-core.o @@ -32,9 +29,6 @@ sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o -obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o -polyval-ce-y := 
polyval-ce-glue.o polyval-ce-core.o - obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 2d791d51891b..c4fd648471f1 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -8,7 +8,6 @@ * Author: Ard Biesheuvel <ardb@kernel.org> */ -#include <asm/neon.h> #include <linux/unaligned.h> #include <crypto/aes.h> #include <crypto/scatterwalk.h> @@ -16,6 +15,8 @@ #include <crypto/internal/skcipher.h> #include <linux/module.h> +#include <asm/simd.h> + #include "aes-ce-setkey.h" MODULE_IMPORT_NS("CRYPTO_INTERNAL"); @@ -114,11 +115,8 @@ static u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, in += adv; abytes -= adv; - if (unlikely(rem)) { - kernel_neon_end(); - kernel_neon_begin(); + if (unlikely(rem)) macp = 0; - } } else { u32 l = min(AES_BLOCK_SIZE - macp, abytes); @@ -187,40 +185,38 @@ static int ccm_encrypt(struct aead_request *req) if (unlikely(err)) return err; - kernel_neon_begin(); + scoped_ksimd() { + if (req->assoclen) + ccm_calculate_auth_mac(req, mac); - if (req->assoclen) - ccm_calculate_auth_mac(req, mac); + do { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + u8 *final_iv = NULL; - do { - u32 tail = walk.nbytes % AES_BLOCK_SIZE; - const u8 *src = walk.src.virt.addr; - u8 *dst = walk.dst.virt.addr; - u8 buf[AES_BLOCK_SIZE]; - u8 *final_iv = NULL; - - if (walk.nbytes == walk.total) { - tail = 0; - final_iv = orig_iv; - } - - if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) - src = dst = memcpy(&buf[sizeof(buf) - walk.nbytes], - src, walk.nbytes); + if (walk.nbytes == walk.total) { + tail = 0; + final_iv = orig_iv; + } - ce_aes_ccm_encrypt(dst, src, walk.nbytes - tail, - ctx->key_enc, num_rounds(ctx), - mac, walk.iv, final_iv); + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(&buf[sizeof(buf) - walk.nbytes], + src, walk.nbytes); - if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) - memcpy(walk.dst.virt.addr, dst, walk.nbytes); + ce_aes_ccm_encrypt(dst, src, walk.nbytes - tail, + ctx->key_enc, num_rounds(ctx), + mac, walk.iv, final_iv); - if (walk.nbytes) { - err = skcipher_walk_done(&walk, tail); - } - } while (walk.nbytes); + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, dst, walk.nbytes); - kernel_neon_end(); + if (walk.nbytes) { + err = skcipher_walk_done(&walk, tail); + } + } while (walk.nbytes); + } if (unlikely(err)) return err; @@ -254,40 +250,38 @@ static int ccm_decrypt(struct aead_request *req) if (unlikely(err)) return err; - kernel_neon_begin(); - - if (req->assoclen) - ccm_calculate_auth_mac(req, mac); + scoped_ksimd() { + if (req->assoclen) + ccm_calculate_auth_mac(req, mac); - do { - u32 tail = walk.nbytes % AES_BLOCK_SIZE; - const u8 *src = walk.src.virt.addr; - u8 *dst = walk.dst.virt.addr; - u8 buf[AES_BLOCK_SIZE]; - u8 *final_iv = NULL; - - if (walk.nbytes == walk.total) { - tail = 0; - final_iv = orig_iv; - } + do { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + u8 *final_iv = NULL; - if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) - src = dst = memcpy(&buf[sizeof(buf) - walk.nbytes], - src, walk.nbytes); + if (walk.nbytes == walk.total) { + tail = 0; + final_iv = orig_iv; + } - ce_aes_ccm_decrypt(dst, src, walk.nbytes - tail, - ctx->key_enc, num_rounds(ctx), - mac, 
walk.iv, final_iv); + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(&buf[sizeof(buf) - walk.nbytes], + src, walk.nbytes); - if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) - memcpy(walk.dst.virt.addr, dst, walk.nbytes); + ce_aes_ccm_decrypt(dst, src, walk.nbytes - tail, + ctx->key_enc, num_rounds(ctx), + mac, walk.iv, final_iv); - if (walk.nbytes) { - err = skcipher_walk_done(&walk, tail); - } - } while (walk.nbytes); + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, dst, walk.nbytes); - kernel_neon_end(); + if (walk.nbytes) { + err = skcipher_walk_done(&walk, tail); + } + } while (walk.nbytes); + } if (unlikely(err)) return err; diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c index 00b8749013c5..a4dad370991d 100644 --- a/arch/arm64/crypto/aes-ce-glue.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -52,9 +52,8 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) return; } - kernel_neon_begin(); - __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); - kernel_neon_end(); + scoped_ksimd() + __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); } static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) @@ -66,9 +65,8 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) return; } - kernel_neon_begin(); - __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); - kernel_neon_end(); + scoped_ksimd() + __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); } int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -94,47 +92,48 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, for (i = 0; i < kwords; i++) ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32)); - kernel_neon_begin(); - for (i = 0; i < sizeof(rcon); i++) { - u32 *rki = ctx->key_enc + (i * kwords); - u32 *rko = rki + kwords; - - rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; - rko[1] = rko[0] ^ rki[1]; - rko[2] = rko[1] ^ rki[2]; - rko[3] = rko[2] ^ rki[3]; - - if (key_len == AES_KEYSIZE_192) { - if (i >= 7) - break; - rko[4] = rko[3] ^ rki[4]; - rko[5] = rko[4] ^ rki[5]; - } else if (key_len == AES_KEYSIZE_256) { - if (i >= 6) - break; - rko[4] = __aes_ce_sub(rko[3]) ^ rki[4]; - rko[5] = rko[4] ^ rki[5]; - rko[6] = rko[5] ^ rki[6]; - rko[7] = rko[6] ^ rki[7]; + scoped_ksimd() { + for (i = 0; i < sizeof(rcon); i++) { + u32 *rki = ctx->key_enc + (i * kwords); + u32 *rko = rki + kwords; + + rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ + rcon[i] ^ rki[0]; + rko[1] = rko[0] ^ rki[1]; + rko[2] = rko[1] ^ rki[2]; + rko[3] = rko[2] ^ rki[3]; + + if (key_len == AES_KEYSIZE_192) { + if (i >= 7) + break; + rko[4] = rko[3] ^ rki[4]; + rko[5] = rko[4] ^ rki[5]; + } else if (key_len == AES_KEYSIZE_256) { + if (i >= 6) + break; + rko[4] = __aes_ce_sub(rko[3]) ^ rki[4]; + rko[5] = rko[4] ^ rki[5]; + rko[6] = rko[5] ^ rki[6]; + rko[7] = rko[6] ^ rki[7]; + } } - } - /* - * Generate the decryption keys for the Equivalent Inverse Cipher. - * This involves reversing the order of the round keys, and applying - * the Inverse Mix Columns transformation on all but the first and - * the last one. - */ - key_enc = (struct aes_block *)ctx->key_enc; - key_dec = (struct aes_block *)ctx->key_dec; - j = num_rounds(ctx); - - key_dec[0] = key_enc[j]; - for (i = 1, j--; j > 0; i++, j--) - __aes_ce_invert(key_dec + i, key_enc + j); - key_dec[i] = key_enc[0]; + /* + * Generate the decryption keys for the Equivalent Inverse + * Cipher. 
This involves reversing the order of the round + * keys, and applying the Inverse Mix Columns transformation on + * all but the first and the last one. + */ + key_enc = (struct aes_block *)ctx->key_enc; + key_dec = (struct aes_block *)ctx->key_dec; + j = num_rounds(ctx); + + key_dec[0] = key_enc[j]; + for (i = 1, j--; j > 0; i++, j--) + __aes_ce_invert(key_dec + i, key_enc + j); + key_dec[i] = key_enc[0]; + } - kernel_neon_end(); return 0; } EXPORT_SYMBOL(ce_aes_expandkey); diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 5e207ff34482..b087b900d279 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -5,8 +5,6 @@ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> */ -#include <asm/hwcap.h> -#include <asm/neon.h> #include <crypto/aes.h> #include <crypto/ctr.h> #include <crypto/internal/hash.h> @@ -20,6 +18,9 @@ #include <linux/module.h> #include <linux/string.h> +#include <asm/hwcap.h> +#include <asm/simd.h> + #include "aes-ce-setkey.h" #ifdef USE_V8_CRYPTO_EXTENSIONS @@ -186,10 +187,9 @@ static int __maybe_unused ecb_encrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { - kernel_neon_begin(); - aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_enc, rounds, blocks); - kernel_neon_end(); + scoped_ksimd() + aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, rounds, blocks); err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } return err; @@ -206,10 +206,9 @@ static int __maybe_unused ecb_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { - kernel_neon_begin(); - aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_dec, rounds, blocks); - kernel_neon_end(); + scoped_ksimd() + aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, rounds, blocks); err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } return err; @@ -224,10 +223,9 @@ static int cbc_encrypt_walk(struct skcipher_request *req, unsigned int blocks; while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) { - kernel_neon_begin(); - aes_cbc_encrypt(walk->dst.virt.addr, walk->src.virt.addr, - ctx->key_enc, rounds, blocks, walk->iv); - kernel_neon_end(); + scoped_ksimd() + aes_cbc_encrypt(walk->dst.virt.addr, walk->src.virt.addr, + ctx->key_enc, rounds, blocks, walk->iv); err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE); } return err; @@ -253,10 +251,9 @@ static int cbc_decrypt_walk(struct skcipher_request *req, unsigned int blocks; while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) { - kernel_neon_begin(); - aes_cbc_decrypt(walk->dst.virt.addr, walk->src.virt.addr, - ctx->key_dec, rounds, blocks, walk->iv); - kernel_neon_end(); + scoped_ksimd() + aes_cbc_decrypt(walk->dst.virt.addr, walk->src.virt.addr, + ctx->key_dec, rounds, blocks, walk->iv); err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE); } return err; @@ -322,10 +319,9 @@ static int cts_cbc_encrypt(struct skcipher_request *req) if (err) return err; - kernel_neon_begin(); - aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_enc, rounds, walk.nbytes, walk.iv); - kernel_neon_end(); + scoped_ksimd() + aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, rounds, walk.nbytes, walk.iv); return skcipher_walk_done(&walk, 0); } @@ -379,10 +375,9 @@ static int cts_cbc_decrypt(struct skcipher_request 
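/*
 * (Worked example of the key reversal in ce_aes_expandkey() above,
 * derived from the loop itself; InvMixColumns is the FIPS-197 name for
 * the transform that __aes_ce_invert() applies.) For AES-128,
 * num_rounds() is 10, so the loop produces:
 *
 *	key_dec[0]  = key_enc[10];
 *	key_dec[i]  = InvMixColumns(key_enc[10 - i]);	// i = 1 .. 9
 *	key_dec[10] = key_enc[0];
 *
 * The first and last round keys are copied unchanged because the
 * initial AddRoundKey and the final round involve no MixColumns step.
 */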
*req) if (err) return err; - kernel_neon_begin(); - aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_dec, rounds, walk.nbytes, walk.iv); - kernel_neon_end(); + scoped_ksimd() + aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, rounds, walk.nbytes, walk.iv); return skcipher_walk_done(&walk, 0); } @@ -399,11 +394,11 @@ static int __maybe_unused essiv_cbc_encrypt(struct skcipher_request *req) blocks = walk.nbytes / AES_BLOCK_SIZE; if (blocks) { - kernel_neon_begin(); - aes_essiv_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key1.key_enc, rounds, blocks, - req->iv, ctx->key2.key_enc); - kernel_neon_end(); + scoped_ksimd() + aes_essiv_cbc_encrypt(walk.dst.virt.addr, + walk.src.virt.addr, + ctx->key1.key_enc, rounds, blocks, + req->iv, ctx->key2.key_enc); err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } return err ?: cbc_encrypt_walk(req, &walk); @@ -421,11 +416,11 @@ static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req) blocks = walk.nbytes / AES_BLOCK_SIZE; if (blocks) { - kernel_neon_begin(); - aes_essiv_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key1.key_dec, rounds, blocks, - req->iv, ctx->key2.key_enc); - kernel_neon_end(); + scoped_ksimd() + aes_essiv_cbc_decrypt(walk.dst.virt.addr, + walk.src.virt.addr, + ctx->key1.key_dec, rounds, blocks, + req->iv, ctx->key2.key_enc); err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } return err ?: cbc_decrypt_walk(req, &walk); @@ -461,10 +456,9 @@ static int __maybe_unused xctr_encrypt(struct skcipher_request *req) else if (nbytes < walk.total) nbytes &= ~(AES_BLOCK_SIZE - 1); - kernel_neon_begin(); - aes_xctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, - walk.iv, byte_ctr); - kernel_neon_end(); + scoped_ksimd() + aes_xctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, + walk.iv, byte_ctr); if (unlikely(nbytes < AES_BLOCK_SIZE)) memcpy(walk.dst.virt.addr, @@ -506,10 +500,9 @@ static int __maybe_unused ctr_encrypt(struct skcipher_request *req) else if (nbytes < walk.total) nbytes &= ~(AES_BLOCK_SIZE - 1); - kernel_neon_begin(); - aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, - walk.iv); - kernel_neon_end(); + scoped_ksimd() + aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, + walk.iv); if (unlikely(nbytes < AES_BLOCK_SIZE)) memcpy(walk.dst.virt.addr, @@ -562,11 +555,10 @@ static int __maybe_unused xts_encrypt(struct skcipher_request *req) if (walk.nbytes < walk.total) nbytes &= ~(AES_BLOCK_SIZE - 1); - kernel_neon_begin(); - aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key1.key_enc, rounds, nbytes, - ctx->key2.key_enc, walk.iv, first); - kernel_neon_end(); + scoped_ksimd() + aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_enc, rounds, nbytes, + ctx->key2.key_enc, walk.iv, first); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } @@ -584,11 +576,10 @@ static int __maybe_unused xts_encrypt(struct skcipher_request *req) if (err) return err; - kernel_neon_begin(); - aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key1.key_enc, rounds, walk.nbytes, - ctx->key2.key_enc, walk.iv, first); - kernel_neon_end(); + scoped_ksimd() + aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_enc, rounds, walk.nbytes, + ctx->key2.key_enc, walk.iv, first); return skcipher_walk_done(&walk, 0); } @@ -634,11 +625,10 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req) if (walk.nbytes < walk.total) nbytes &= ~(AES_BLOCK_SIZE - 1); - 
kernel_neon_begin(); - aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key1.key_dec, rounds, nbytes, - ctx->key2.key_enc, walk.iv, first); - kernel_neon_end(); + scoped_ksimd() + aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_dec, rounds, nbytes, + ctx->key2.key_enc, walk.iv, first); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } @@ -657,11 +647,10 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req) return err; - kernel_neon_begin(); - aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key1.key_dec, rounds, walk.nbytes, - ctx->key2.key_enc, walk.iv, first); - kernel_neon_end(); + scoped_ksimd() + aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_dec, rounds, walk.nbytes, + ctx->key2.key_enc, walk.iv, first); return skcipher_walk_done(&walk, 0); } @@ -808,10 +797,9 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key, return err; /* encrypt the zero vector */ - kernel_neon_begin(); - aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, ctx->key.key_enc, - rounds, 1); - kernel_neon_end(); + scoped_ksimd() + aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, + ctx->key.key_enc, rounds, 1); cmac_gf128_mul_by_x(consts, consts); cmac_gf128_mul_by_x(consts + 1, consts); @@ -837,10 +825,10 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key, if (err) return err; - kernel_neon_begin(); - aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1); - aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2); - kernel_neon_end(); + scoped_ksimd() { + aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1); + aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2); + } return cbcmac_setkey(tfm, key, sizeof(key)); } @@ -860,10 +848,9 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, int rem; do { - kernel_neon_begin(); - rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, - dg, enc_before, !enc_before); - kernel_neon_end(); + scoped_ksimd() + rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, + dg, enc_before, !enc_before); in += (blocks - rem) * AES_BLOCK_SIZE; blocks = rem; } while (blocks); diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index c4a623e86593..d496effb0a5b 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -85,9 +85,8 @@ static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key, ctx->rounds = 6 + key_len / 4; - kernel_neon_begin(); - aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds); - kernel_neon_end(); + scoped_ksimd() + aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds); return 0; } @@ -110,10 +109,9 @@ static int __ecb_crypt(struct skcipher_request *req, blocks = round_down(blocks, walk.stride / AES_BLOCK_SIZE); - kernel_neon_begin(); - fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk, - ctx->rounds, blocks); - kernel_neon_end(); + scoped_ksimd() + fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk, + ctx->rounds, blocks); err = skcipher_walk_done(&walk, walk.nbytes - blocks * AES_BLOCK_SIZE); } @@ -146,9 +144,8 @@ static int aesbs_cbc_ctr_setkey(struct crypto_skcipher *tfm, const u8 *in_key, memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc)); - kernel_neon_begin(); - aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds); - kernel_neon_end(); + scoped_ksimd() + aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds); memzero_explicit(&rk, sizeof(rk)); return 0; @@ -167,11 +164,11 @@ static 
int cbc_encrypt(struct skcipher_request *req) unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; /* fall back to the non-bitsliced NEON implementation */ - kernel_neon_begin(); - neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->enc, ctx->key.rounds, blocks, - walk.iv); - kernel_neon_end(); + scoped_ksimd() + neon_aes_cbc_encrypt(walk.dst.virt.addr, + walk.src.virt.addr, + ctx->enc, ctx->key.rounds, blocks, + walk.iv); err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } return err; @@ -193,11 +190,10 @@ static int cbc_decrypt(struct skcipher_request *req) blocks = round_down(blocks, walk.stride / AES_BLOCK_SIZE); - kernel_neon_begin(); - aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key.rk, ctx->key.rounds, blocks, - walk.iv); - kernel_neon_end(); + scoped_ksimd() + aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key.rk, ctx->key.rounds, blocks, + walk.iv); err = skcipher_walk_done(&walk, walk.nbytes - blocks * AES_BLOCK_SIZE); } @@ -220,30 +216,32 @@ static int ctr_encrypt(struct skcipher_request *req) const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - kernel_neon_begin(); - if (blocks >= 8) { - aesbs_ctr_encrypt(dst, src, ctx->key.rk, ctx->key.rounds, - blocks, walk.iv); - dst += blocks * AES_BLOCK_SIZE; - src += blocks * AES_BLOCK_SIZE; - } - if (nbytes && walk.nbytes == walk.total) { - u8 buf[AES_BLOCK_SIZE]; - u8 *d = dst; - - if (unlikely(nbytes < AES_BLOCK_SIZE)) - src = dst = memcpy(buf + sizeof(buf) - nbytes, - src, nbytes); - - neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds, - nbytes, walk.iv); + scoped_ksimd() { + if (blocks >= 8) { + aesbs_ctr_encrypt(dst, src, ctx->key.rk, + ctx->key.rounds, blocks, + walk.iv); + dst += blocks * AES_BLOCK_SIZE; + src += blocks * AES_BLOCK_SIZE; + } + if (nbytes && walk.nbytes == walk.total) { + u8 buf[AES_BLOCK_SIZE]; + u8 *d = dst; + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(buf + sizeof(buf) - + nbytes, src, nbytes); + + neon_aes_ctr_encrypt(dst, src, ctx->enc, + ctx->key.rounds, nbytes, + walk.iv); - if (unlikely(nbytes < AES_BLOCK_SIZE)) - memcpy(d, dst, nbytes); + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(d, dst, nbytes); - nbytes = 0; + nbytes = 0; + } } - kernel_neon_end(); err = skcipher_walk_done(&walk, nbytes); } return err; @@ -320,33 +318,33 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, in = walk.src.virt.addr; nbytes = walk.nbytes; - kernel_neon_begin(); - if (blocks >= 8) { - if (first == 1) - neon_aes_ecb_encrypt(walk.iv, walk.iv, - ctx->twkey, - ctx->key.rounds, 1); - first = 2; - - fn(out, in, ctx->key.rk, ctx->key.rounds, blocks, - walk.iv); - - out += blocks * AES_BLOCK_SIZE; - in += blocks * AES_BLOCK_SIZE; - nbytes -= blocks * AES_BLOCK_SIZE; + scoped_ksimd() { + if (blocks >= 8) { + if (first == 1) + neon_aes_ecb_encrypt(walk.iv, walk.iv, + ctx->twkey, + ctx->key.rounds, 1); + first = 2; + + fn(out, in, ctx->key.rk, ctx->key.rounds, blocks, + walk.iv); + + out += blocks * AES_BLOCK_SIZE; + in += blocks * AES_BLOCK_SIZE; + nbytes -= blocks * AES_BLOCK_SIZE; + } + if (walk.nbytes == walk.total && nbytes > 0) { + if (encrypt) + neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, + ctx->key.rounds, nbytes, + ctx->twkey, walk.iv, first); + else + neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, + ctx->key.rounds, nbytes, + ctx->twkey, walk.iv, first); + nbytes = first = 0; + } } - if (walk.nbytes == walk.total && nbytes > 0) { - if (encrypt) - neon_aes_xts_encrypt(out, in, 
ctx->cts.key_enc, - ctx->key.rounds, nbytes, - ctx->twkey, walk.iv, first); - else - neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, - ctx->key.rounds, nbytes, - ctx->twkey, walk.iv, first); - nbytes = first = 0; - } - kernel_neon_end(); err = skcipher_walk_done(&walk, nbytes); } @@ -369,14 +367,16 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, in = walk.src.virt.addr; nbytes = walk.nbytes; - kernel_neon_begin(); - if (encrypt) - neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds, - nbytes, ctx->twkey, walk.iv, first); - else - neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds, - nbytes, ctx->twkey, walk.iv, first); - kernel_neon_end(); + scoped_ksimd() { + if (encrypt) + neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, + ctx->key.rounds, nbytes, ctx->twkey, + walk.iv, first); + else + neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, + ctx->key.rounds, nbytes, ctx->twkey, + walk.iv, first); + } return skcipher_walk_done(&walk, 0); } diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 4995b6e22335..7951557a285a 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -5,7 +5,6 @@ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> */ -#include <asm/neon.h> #include <crypto/aes.h> #include <crypto/b128ops.h> #include <crypto/gcm.h> @@ -22,6 +21,8 @@ #include <linux/string.h> #include <linux/unaligned.h> +#include <asm/simd.h> + MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); @@ -74,9 +75,8 @@ void ghash_do_simd_update(int blocks, u64 dg[], const char *src, u64 const h[][2], const char *head)) { - kernel_neon_begin(); - simd_update(blocks, dg, src, key->h, head); - kernel_neon_end(); + scoped_ksimd() + simd_update(blocks, dg, src, key->h, head); } /* avoid hogging the CPU for too long */ @@ -329,11 +329,10 @@ static int gcm_encrypt(struct aead_request *req, char *iv, int assoclen) tag = NULL; } - kernel_neon_begin(); - pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h, - dg, iv, ctx->aes_key.key_enc, nrounds, - tag); - kernel_neon_end(); + scoped_ksimd() + pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h, + dg, iv, ctx->aes_key.key_enc, nrounds, + tag); if (unlikely(!nbytes)) break; @@ -399,11 +398,11 @@ static int gcm_decrypt(struct aead_request *req, char *iv, int assoclen) tag = NULL; } - kernel_neon_begin(); - ret = pmull_gcm_decrypt(nbytes, dst, src, ctx->ghash_key.h, - dg, iv, ctx->aes_key.key_enc, - nrounds, tag, otag, authsize); - kernel_neon_end(); + scoped_ksimd() + ret = pmull_gcm_decrypt(nbytes, dst, src, + ctx->ghash_key.h, + dg, iv, ctx->aes_key.key_enc, + nrounds, tag, otag, authsize); if (unlikely(!nbytes)) break; diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c index e4a0b463f080..013de6ac569a 100644 --- a/arch/arm64/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -25,9 +25,8 @@ static int nhpoly1305_neon_update(struct shash_desc *desc, do { unsigned int n = min_t(unsigned int, srclen, SZ_4K); - kernel_neon_begin(); - crypto_nhpoly1305_update_helper(desc, src, n, nh_neon); - kernel_neon_end(); + scoped_ksimd() + crypto_nhpoly1305_update_helper(desc, src, n, nh_neon); src += n; srclen -= n; } while (srclen); diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c deleted file mode 100644 index c4e653688ea0..000000000000 
--- a/arch/arm64/crypto/polyval-ce-glue.c +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Glue code for POLYVAL using ARMv8 Crypto Extensions - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * Copyright 2021 Google LLC - */ - -/* - * Glue code based on ghash-clmulni-intel_glue.c. - * - * This implementation of POLYVAL uses montgomery multiplication accelerated by - * ARMv8 Crypto Extensions instructions to implement the finite field operations. - */ - -#include <asm/neon.h> -#include <crypto/internal/hash.h> -#include <crypto/polyval.h> -#include <crypto/utils.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#define NUM_KEY_POWERS 8 - -struct polyval_tfm_ctx { - /* - * These powers must be in the order h^8, ..., h^1. - */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; -}; - -struct polyval_desc_ctx { - u8 buffer[POLYVAL_BLOCK_SIZE]; -}; - -asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator); -asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); - -static void internal_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator) -{ - kernel_neon_begin(); - pmull_polyval_update(keys, in, nblocks, accumulator); - kernel_neon_end(); -} - -static void internal_polyval_mul(u8 *op1, const u8 *op2) -{ - kernel_neon_begin(); - pmull_polyval_mul(op1, op2); - kernel_neon_end(); -} - -static int polyval_arm64_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); - int i; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); - - for (i = NUM_KEY_POWERS-2; i >= 0; i--) { - memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); - internal_polyval_mul(tctx->key_powers[i], - tctx->key_powers[i+1]); - } - - return 0; -} - -static int polyval_arm64_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_arm64_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - unsigned int nblocks; - - do { - /* allow rescheduling every 4K bytes */ - nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; - internal_polyval_update(tctx, src, nblocks, dctx->buffer); - srclen -= nblocks * POLYVAL_BLOCK_SIZE; - src += nblocks * POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_arm64_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - - if (len) { - crypto_xor(dctx->buffer, src, len); - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); - - return 0; -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_arm64_init, - .update = polyval_arm64_update, - .finup = polyval_arm64_finup, - .setkey = polyval_arm64_setkey, - .descsize = sizeof(struct 
polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct polyval_tfm_ctx), - .cra_module = THIS_MODULE, - }, -}; - -static int __init polyval_ce_mod_init(void) -{ - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_ce_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_cpu_feature_match(PMULL, polyval_ce_mod_init) -module_exit(polyval_ce_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-ce"); diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c deleted file mode 100644 index b4f1001046c9..000000000000 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/unaligned.h> - -MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-384"); -MODULE_ALIAS_CRYPTO("sha3-512"); - -asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks, - int md_len); - -static int sha3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - struct crypto_shash *tfm = desc->tfm; - unsigned int bs, ds; - int blocks; - - ds = crypto_shash_digestsize(tfm); - bs = crypto_shash_blocksize(tfm); - blocks = len / bs; - len -= blocks * bs; - do { - int rem; - - kernel_neon_begin(); - rem = sha3_ce_transform(sctx->st, data, blocks, ds); - kernel_neon_end(); - data += (blocks - rem) * bs; - blocks = rem; - } while (blocks); - return len; -} - -static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - struct crypto_shash *tfm = desc->tfm; - __le64 *digest = (__le64 *)out; - u8 block[SHA3_224_BLOCK_SIZE]; - unsigned int bs, ds; - int i; - - ds = crypto_shash_digestsize(tfm); - bs = crypto_shash_blocksize(tfm); - memcpy(block, src, len); - - block[len++] = 0x06; - memset(block + len, 0, bs - len); - block[bs - 1] |= 0x80; - - kernel_neon_begin(); - sha3_ce_transform(sctx->st, block, 1, ds); - kernel_neon_end(); - memzero_explicit(block , sizeof(block)); - - for (i = 0; i < ds / 8; i++) - put_unaligned_le64(sctx->st[i], digest++); - - if (ds & 4) - put_unaligned_le32(sctx->st[i], (__le32 *)digest); - - return 0; -} - -static struct shash_alg algs[] = { { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-224", - .base.cra_driver_name = 
"sha3-224-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_256_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-256", - .base.cra_driver_name = "sha3-256-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-384", - .base.cra_driver_name = "sha3-384-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_384_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-512", - .base.cra_driver_name = "sha3-512-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -} }; - -static int __init sha3_neon_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha3_neon_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA3, sha3_neon_mod_init); -module_exit(sha3_neon_mod_fini); diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index eac6f5fa0abe..24c1fcfae072 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -5,7 +5,6 @@ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> */ -#include <asm/neon.h> #include <crypto/internal/hash.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> @@ -13,6 +12,8 @@ #include <linux/kernel.h> #include <linux/module.h> +#include <asm/simd.h> + MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); @@ -25,18 +26,18 @@ static int sm3_ce_update(struct shash_desc *desc, const u8 *data, { int remain; - kernel_neon_begin(); - remain = sm3_base_do_update_blocks(desc, data, len, sm3_ce_transform); - kernel_neon_end(); + scoped_ksimd() { + remain = sm3_base_do_update_blocks(desc, data, len, sm3_ce_transform); + } return remain; } static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - kernel_neon_begin(); - sm3_base_do_finup(desc, data, len, sm3_ce_transform); - kernel_neon_end(); + scoped_ksimd() { + sm3_base_do_finup(desc, data, len, sm3_ce_transform); + } return sm3_base_finish(desc, out); } diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c index 6c4611a503a3..15f30cc24f32 100644 --- a/arch/arm64/crypto/sm3-neon-glue.c +++ b/arch/arm64/crypto/sm3-neon-glue.c @@ -5,7 +5,7 @@ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ -#include <asm/neon.h> +#include <asm/simd.h> #include <crypto/internal/hash.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> @@ -20,20 +20,16 @@ asmlinkage void sm3_neon_transform(struct sm3_state *sst, u8 const *src, static int sm3_neon_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - int remain; - - kernel_neon_begin(); - remain = 
sm3_base_do_update_blocks(desc, data, len, sm3_neon_transform); - kernel_neon_end(); - return remain; + scoped_ksimd() + return sm3_base_do_update_blocks(desc, data, len, + sm3_neon_transform); } static int sm3_neon_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - kernel_neon_begin(); - sm3_base_do_finup(desc, data, len, sm3_neon_transform); - kernel_neon_end(); + scoped_ksimd() + sm3_base_do_finup(desc, data, len, sm3_neon_transform); return sm3_base_finish(desc, out); } diff --git a/arch/arm64/crypto/sm4-ce-ccm-glue.c b/arch/arm64/crypto/sm4-ce-ccm-glue.c index e9cc1c1364ec..332f02167a96 100644 --- a/arch/arm64/crypto/sm4-ce-ccm-glue.c +++ b/arch/arm64/crypto/sm4-ce-ccm-glue.c @@ -11,7 +11,7 @@ #include <linux/crypto.h> #include <linux/kernel.h> #include <linux/cpufeature.h> -#include <asm/neon.h> +#include <asm/simd.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> #include <crypto/internal/skcipher.h> @@ -35,10 +35,9 @@ static int ccm_setkey(struct crypto_aead *tfm, const u8 *key, if (key_len != SM4_KEY_SIZE) return -EINVAL; - kernel_neon_begin(); - sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec, - crypto_sm4_fk, crypto_sm4_ck); - kernel_neon_end(); + scoped_ksimd() + sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); return 0; } @@ -167,39 +166,23 @@ static int ccm_crypt(struct aead_request *req, struct skcipher_walk *walk, memcpy(ctr0, walk->iv, SM4_BLOCK_SIZE); crypto_inc(walk->iv, SM4_BLOCK_SIZE); - kernel_neon_begin(); + scoped_ksimd() { + if (req->assoclen) + ccm_calculate_auth_mac(req, mac); - if (req->assoclen) - ccm_calculate_auth_mac(req, mac); - - while (walk->nbytes && walk->nbytes != walk->total) { - unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE; - - sm4_ce_ccm_crypt(rkey_enc, walk->dst.virt.addr, - walk->src.virt.addr, walk->iv, - walk->nbytes - tail, mac); - - kernel_neon_end(); - - err = skcipher_walk_done(walk, tail); - - kernel_neon_begin(); - } - - if (walk->nbytes) { - sm4_ce_ccm_crypt(rkey_enc, walk->dst.virt.addr, - walk->src.virt.addr, walk->iv, - walk->nbytes, mac); + while (walk->nbytes) { + unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE; - sm4_ce_ccm_final(rkey_enc, ctr0, mac); + if (walk->nbytes == walk->total) + tail = 0; - kernel_neon_end(); + sm4_ce_ccm_crypt(rkey_enc, walk->dst.virt.addr, + walk->src.virt.addr, walk->iv, + walk->nbytes - tail, mac); - err = skcipher_walk_done(walk, 0); - } else { + err = skcipher_walk_done(walk, tail); + } sm4_ce_ccm_final(rkey_enc, ctr0, mac); - - kernel_neon_end(); } return err; diff --git a/arch/arm64/crypto/sm4-ce-cipher-glue.c b/arch/arm64/crypto/sm4-ce-cipher-glue.c index c31d76fb5a17..bceec833ef4e 100644 --- a/arch/arm64/crypto/sm4-ce-cipher-glue.c +++ b/arch/arm64/crypto/sm4-ce-cipher-glue.c @@ -32,9 +32,8 @@ static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) if (!crypto_simd_usable()) { sm4_crypt_block(ctx->rkey_enc, out, in); } else { - kernel_neon_begin(); - sm4_ce_do_crypt(ctx->rkey_enc, out, in); - kernel_neon_end(); + scoped_ksimd() + sm4_ce_do_crypt(ctx->rkey_enc, out, in); } } @@ -45,9 +44,8 @@ static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) if (!crypto_simd_usable()) { sm4_crypt_block(ctx->rkey_dec, out, in); } else { - kernel_neon_begin(); - sm4_ce_do_crypt(ctx->rkey_dec, out, in); - kernel_neon_end(); + scoped_ksimd() + sm4_ce_do_crypt(ctx->rkey_dec, out, in); } } diff --git a/arch/arm64/crypto/sm4-ce-gcm-glue.c b/arch/arm64/crypto/sm4-ce-gcm-glue.c index 
c2ea3d5f690b..ef06f4f768a1 100644 --- a/arch/arm64/crypto/sm4-ce-gcm-glue.c +++ b/arch/arm64/crypto/sm4-ce-gcm-glue.c @@ -11,7 +11,7 @@ #include <linux/crypto.h> #include <linux/kernel.h> #include <linux/cpufeature.h> -#include <asm/neon.h> +#include <asm/simd.h> #include <crypto/b128ops.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> @@ -48,13 +48,11 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *key, if (key_len != SM4_KEY_SIZE) return -EINVAL; - kernel_neon_begin(); - - sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, - crypto_sm4_fk, crypto_sm4_ck); - sm4_ce_pmull_ghash_setup(ctx->key.rkey_enc, ctx->ghash_table); - - kernel_neon_end(); + scoped_ksimd() { + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + sm4_ce_pmull_ghash_setup(ctx->key.rkey_enc, ctx->ghash_table); + } return 0; } @@ -149,44 +147,28 @@ static int gcm_crypt(struct aead_request *req, struct skcipher_walk *walk, memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(2, iv + GCM_IV_SIZE); - kernel_neon_begin(); + scoped_ksimd() { + if (req->assoclen) + gcm_calculate_auth_mac(req, ghash); - if (req->assoclen) - gcm_calculate_auth_mac(req, ghash); + do { + unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE; + const u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + const u8 *l = NULL; - while (walk->nbytes) { - unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE; - const u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; + if (walk->nbytes == walk->total) { + l = (const u8 *)&lengths; + tail = 0; + } - if (walk->nbytes == walk->total) { sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv, - walk->nbytes, ghash, - ctx->ghash_table, - (const u8 *)&lengths); - - kernel_neon_end(); - - return skcipher_walk_done(walk, 0); - } + walk->nbytes - tail, ghash, + ctx->ghash_table, l); - sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv, - walk->nbytes - tail, ghash, - ctx->ghash_table, NULL); - - kernel_neon_end(); - - err = skcipher_walk_done(walk, tail); - - kernel_neon_begin(); + err = skcipher_walk_done(walk, tail); + } while (walk->nbytes); } - - sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, NULL, NULL, iv, - walk->nbytes, ghash, ctx->ghash_table, - (const u8 *)&lengths); - - kernel_neon_end(); - return err; } diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 7a60e7b559dc..5569cece5a0b 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -8,7 +8,7 @@ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ -#include <asm/neon.h> +#include <asm/simd.h> #include <crypto/b128ops.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> @@ -74,10 +74,9 @@ static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, if (key_len != SM4_KEY_SIZE) return -EINVAL; - kernel_neon_begin(); - sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec, - crypto_sm4_fk, crypto_sm4_ck); - kernel_neon_end(); + scoped_ksimd() + sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); return 0; } @@ -94,12 +93,12 @@ static int sm4_xts_setkey(struct crypto_skcipher *tfm, const u8 *key, if (ret) return ret; - kernel_neon_begin(); - sm4_ce_expand_key(key, ctx->key1.rkey_enc, - ctx->key1.rkey_dec, crypto_sm4_fk, crypto_sm4_ck); - sm4_ce_expand_key(&key[SM4_KEY_SIZE], ctx->key2.rkey_enc, - ctx->key2.rkey_dec, crypto_sm4_fk, crypto_sm4_ck); - kernel_neon_end(); + scoped_ksimd() { + sm4_ce_expand_key(key, 
ctx->key1.rkey_enc, + ctx->key1.rkey_dec, crypto_sm4_fk, crypto_sm4_ck); + sm4_ce_expand_key(&key[SM4_KEY_SIZE], ctx->key2.rkey_enc, + ctx->key2.rkey_dec, crypto_sm4_fk, crypto_sm4_ck); + } return 0; } @@ -117,16 +116,14 @@ static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) u8 *dst = walk.dst.virt.addr; unsigned int nblks; - kernel_neon_begin(); - - nblks = BYTES2BLKS(nbytes); - if (nblks) { - sm4_ce_crypt(rkey, dst, src, nblks); - nbytes -= nblks * SM4_BLOCK_SIZE; + scoped_ksimd() { + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_crypt(rkey, dst, src, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } } - kernel_neon_end(); - err = skcipher_walk_done(&walk, nbytes); } @@ -167,16 +164,14 @@ static int sm4_cbc_crypt(struct skcipher_request *req, nblocks = nbytes / SM4_BLOCK_SIZE; if (nblocks) { - kernel_neon_begin(); - - if (encrypt) - sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, - walk.iv, nblocks); - else - sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, - walk.iv, nblocks); - - kernel_neon_end(); + scoped_ksimd() { + if (encrypt) + sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, + walk.iv, nblocks); + else + sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, + walk.iv, nblocks); + } } err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE); @@ -249,16 +244,14 @@ static int sm4_cbc_cts_crypt(struct skcipher_request *req, bool encrypt) if (err) return err; - kernel_neon_begin(); - - if (encrypt) - sm4_ce_cbc_cts_enc(ctx->rkey_enc, walk.dst.virt.addr, - walk.src.virt.addr, walk.iv, walk.nbytes); - else - sm4_ce_cbc_cts_dec(ctx->rkey_dec, walk.dst.virt.addr, - walk.src.virt.addr, walk.iv, walk.nbytes); - - kernel_neon_end(); + scoped_ksimd() { + if (encrypt) + sm4_ce_cbc_cts_enc(ctx->rkey_enc, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, walk.nbytes); + else + sm4_ce_cbc_cts_dec(ctx->rkey_dec, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, walk.nbytes); + } return skcipher_walk_done(&walk, 0); } @@ -288,28 +281,26 @@ static int sm4_ctr_crypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr; unsigned int nblks; - kernel_neon_begin(); - - nblks = BYTES2BLKS(nbytes); - if (nblks) { - sm4_ce_ctr_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); - dst += nblks * SM4_BLOCK_SIZE; - src += nblks * SM4_BLOCK_SIZE; - nbytes -= nblks * SM4_BLOCK_SIZE; - } - - /* tail */ - if (walk.nbytes == walk.total && nbytes > 0) { - u8 keystream[SM4_BLOCK_SIZE]; - - sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv); - crypto_inc(walk.iv, SM4_BLOCK_SIZE); - crypto_xor_cpy(dst, src, keystream, nbytes); - nbytes = 0; + scoped_ksimd() { + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_ctr_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } } - kernel_neon_end(); - err = skcipher_walk_done(&walk, nbytes); } @@ -359,18 +350,16 @@ static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt) if (nbytes < walk.total) nbytes &= ~(SM4_BLOCK_SIZE - 1); - kernel_neon_begin(); - - if (encrypt) - sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, - walk.src.virt.addr, walk.iv, nbytes, - rkey2_enc); - else - sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr, - walk.src.virt.addr, walk.iv, nbytes, - rkey2_enc); - - kernel_neon_end(); + 
scoped_ksimd() { + if (encrypt) + sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, nbytes, + rkey2_enc); + else + sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, nbytes, + rkey2_enc); + } rkey2_enc = NULL; @@ -395,18 +384,16 @@ static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt) if (err) return err; - kernel_neon_begin(); - - if (encrypt) - sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, - walk.src.virt.addr, walk.iv, walk.nbytes, - rkey2_enc); - else - sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr, - walk.src.virt.addr, walk.iv, walk.nbytes, - rkey2_enc); - - kernel_neon_end(); + scoped_ksimd() { + if (encrypt) + sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, walk.nbytes, + rkey2_enc); + else + sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, walk.nbytes, + rkey2_enc); + } return skcipher_walk_done(&walk, 0); } @@ -510,11 +497,9 @@ static int sm4_cbcmac_setkey(struct crypto_shash *tfm, const u8 *key, if (key_len != SM4_KEY_SIZE) return -EINVAL; - kernel_neon_begin(); - sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, - crypto_sm4_fk, crypto_sm4_ck); - kernel_neon_end(); - + scoped_ksimd() + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); return 0; } @@ -530,15 +515,13 @@ static int sm4_cmac_setkey(struct crypto_shash *tfm, const u8 *key, memset(consts, 0, SM4_BLOCK_SIZE); - kernel_neon_begin(); - - sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, - crypto_sm4_fk, crypto_sm4_ck); + scoped_ksimd() { + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); - /* encrypt the zero block */ - sm4_ce_crypt_block(ctx->key.rkey_enc, (u8 *)consts, (const u8 *)consts); - - kernel_neon_end(); + /* encrypt the zero block */ + sm4_ce_crypt_block(ctx->key.rkey_enc, (u8 *)consts, (const u8 *)consts); + } /* gf(2^128) multiply zero-ciphertext with u and u^2 */ a = be64_to_cpu(consts[0].a); @@ -568,18 +551,16 @@ static int sm4_xcbc_setkey(struct crypto_shash *tfm, const u8 *key, if (key_len != SM4_KEY_SIZE) return -EINVAL; - kernel_neon_begin(); - - sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, - crypto_sm4_fk, crypto_sm4_ck); + scoped_ksimd() { + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); - sm4_ce_crypt_block(ctx->key.rkey_enc, key2, ks[0]); - sm4_ce_crypt(ctx->key.rkey_enc, ctx->consts, ks[1], 2); + sm4_ce_crypt_block(ctx->key.rkey_enc, key2, ks[0]); + sm4_ce_crypt(ctx->key.rkey_enc, ctx->consts, ks[1], 2); - sm4_ce_expand_key(key2, ctx->key.rkey_enc, ctx->key.rkey_dec, - crypto_sm4_fk, crypto_sm4_ck); - - kernel_neon_end(); + sm4_ce_expand_key(key2, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + } return 0; } @@ -600,10 +581,9 @@ static int sm4_mac_update(struct shash_desc *desc, const u8 *p, unsigned int nblocks = len / SM4_BLOCK_SIZE; len %= SM4_BLOCK_SIZE; - kernel_neon_begin(); - sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p, - nblocks, false, true); - kernel_neon_end(); + scoped_ksimd() + sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p, + nblocks, false, true); return len; } @@ -619,10 +599,9 @@ static int sm4_cmac_finup(struct shash_desc *desc, const u8 *src, ctx->digest[len] ^= 0x80; consts += SM4_BLOCK_SIZE; } - kernel_neon_begin(); - sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, consts, 1, - false, 
true); - kernel_neon_end(); + scoped_ksimd() + sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, consts, 1, + false, true); memcpy(out, ctx->digest, SM4_BLOCK_SIZE); return 0; } @@ -635,10 +614,9 @@ static int sm4_cbcmac_finup(struct shash_desc *desc, const u8 *src, if (len) { crypto_xor(ctx->digest, src, len); - kernel_neon_begin(); - sm4_ce_crypt_block(tctx->key.rkey_enc, ctx->digest, - ctx->digest); - kernel_neon_end(); + scoped_ksimd() + sm4_ce_crypt_block(tctx->key.rkey_enc, ctx->digest, + ctx->digest); } memcpy(out, ctx->digest, SM4_BLOCK_SIZE); return 0; diff --git a/arch/arm64/crypto/sm4-neon-glue.c b/arch/arm64/crypto/sm4-neon-glue.c index e3500aca2d18..e944c2a2efb0 100644 --- a/arch/arm64/crypto/sm4-neon-glue.c +++ b/arch/arm64/crypto/sm4-neon-glue.c @@ -48,11 +48,8 @@ static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) nblocks = nbytes / SM4_BLOCK_SIZE; if (nblocks) { - kernel_neon_begin(); - - sm4_neon_crypt(rkey, dst, src, nblocks); - - kernel_neon_end(); + scoped_ksimd() + sm4_neon_crypt(rkey, dst, src, nblocks); } err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE); @@ -126,12 +123,9 @@ static int sm4_cbc_decrypt(struct skcipher_request *req) nblocks = nbytes / SM4_BLOCK_SIZE; if (nblocks) { - kernel_neon_begin(); - - sm4_neon_cbc_dec(ctx->rkey_dec, dst, src, - walk.iv, nblocks); - - kernel_neon_end(); + scoped_ksimd() + sm4_neon_cbc_dec(ctx->rkey_dec, dst, src, + walk.iv, nblocks); } err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE); @@ -157,12 +151,9 @@ static int sm4_ctr_crypt(struct skcipher_request *req) nblocks = nbytes / SM4_BLOCK_SIZE; if (nblocks) { - kernel_neon_begin(); - - sm4_neon_ctr_crypt(ctx->rkey_enc, dst, src, - walk.iv, nblocks); - - kernel_neon_end(); + scoped_ksimd() + sm4_neon_ctr_crypt(ctx->rkey_enc, dst, src, + walk.iv, nblocks); dst += nblocks * SM4_BLOCK_SIZE; src += nblocks * SM4_BLOCK_SIZE; diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index c8c77f9e36d6..862416624852 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -19,7 +19,7 @@ #error "cpucaps have overflown ARM64_CB_BIT" #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/stringify.h> @@ -207,7 +207,7 @@ alternative_endif #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Usage: asm(ALTERNATIVE(oldinstr, newinstr, cpucap)); @@ -219,7 +219,7 @@ alternative_endif #define ALTERNATIVE(oldinstr, newinstr, ...) 
\ _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> @@ -263,6 +263,6 @@ l_yes: return true; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ALTERNATIVE_MACROS_H */ diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 51746005239b..621aa8550174 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -4,7 +4,7 @@ #include <asm/alternative-macros.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/init.h> #include <linux/types.h> @@ -37,5 +37,5 @@ static inline int apply_alternatives_module(void *start, size_t length) void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ALTERNATIVE_H */ diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 9e96f024b2f1..d20b03931a8d 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -9,7 +9,7 @@ #include <asm/sysreg.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/irqchip/arm-gic-common.h> #include <linux/stringify.h> @@ -188,5 +188,5 @@ static inline bool gic_has_relaxed_pmr_sync(void) return cpus_have_cap(ARM64_HAS_GIC_PRIO_RELAXED_SYNC); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ARCH_GICV3_H */ diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h index 292f2687a12e..d67e2fdd1aee 100644 --- a/arch/arm64/include/asm/asm-extable.h +++ b/arch/arm64/include/asm/asm-extable.h @@ -27,7 +27,7 @@ /* Data fields for EX_TYPE_UACCESS_CPY */ #define EX_DATA_UACCESS_WRITE BIT(0) -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_EXTABLE_RAW(insn, fixup, type, data) \ .pushsection __ex_table, "a"; \ @@ -77,7 +77,7 @@ __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_UACCESS_CPY, \uaccess_is_write) .endm -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #include <linux/stringify.h> @@ -132,6 +132,6 @@ EX_DATA_REG(ADDR, addr) \ ")") -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ASM_EXTABLE_H */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 23be85d93348..f0ca7196f6fa 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -5,7 +5,7 @@ * Copyright (C) 1996-2000 Russell King * Copyright (C) 2012 ARM Ltd. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #error "Only include this from assembly code" #endif @@ -325,14 +325,14 @@ alternative_cb_end * tcr_set_t0sz - update TCR.T0SZ so that we can load the ID map */ .macro tcr_set_t0sz, valreg, t0sz - bfi \valreg, \t0sz, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH + bfi \valreg, \t0sz, #TCR_EL1_T0SZ_SHIFT, #TCR_EL1_T0SZ_WIDTH .endm /* * tcr_set_t1sz - update TCR.T1SZ */ .macro tcr_set_t1sz, valreg, t1sz - bfi \valreg, \t1sz, #TCR_T1SZ_OFFSET, #TCR_TxSZ_WIDTH + bfi \valreg, \t1sz, #TCR_EL1_T1SZ_SHIFT, #TCR_EL1_T1SZ_WIDTH .endm /* @@ -371,7 +371,7 @@ alternative_endif * [start, end) with dcache line size explicitly provided. 
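/*
 * (Aside on the guard rename running through these headers, stated as
 * background rather than taken from the diff: __ASSEMBLER__ is
 * predefined by gcc and clang themselves when preprocessing assembly
 * sources, whereas __ASSEMBLY__ had to be supplied by the build system.
 * The guard shape itself is unchanged:)
 *
 *	#ifndef __ASSEMBLER__
 *	// C-only declarations, e.g. #include <linux/types.h>
 *	#endif	// __ASSEMBLER__
 */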
* * op: operation passed to dc instruction - * domain: domain used in dsb instruciton + * domain: domain used in dsb instruction * start: starting virtual address of the region * end: end virtual address of the region * linesz: dcache line size @@ -412,7 +412,7 @@ alternative_endif * [start, end) * * op: operation passed to dc instruction - * domain: domain used in dsb instruciton + * domain: domain used in dsb instruction * start: starting virtual address of the region * end: end virtual address of the region * fixup: optional label to branch to on user fault @@ -589,7 +589,7 @@ alternative_endif .macro offset_ttbr1, ttbr, tmp #if defined(CONFIG_ARM64_VA_BITS_52) && !defined(CONFIG_ARM64_LPA2) mrs \tmp, tcr_el1 - and \tmp, \tmp, #TCR_T1SZ_MASK + and \tmp, \tmp, #TCR_EL1_T1SZ_MASK cmp \tmp, #TCR_T1SZ(VA_BITS_MIN) orr \tmp, \ttbr, #TTBR1_BADDR_4852_OFFSET csel \ttbr, \tmp, \ttbr, eq diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index 87f568a94e55..afad1849c4cf 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -103,17 +103,17 @@ static __always_inline void __lse_atomic_and(int i, atomic_t *v) return __lse_atomic_andnot(~i, v); } -#define ATOMIC_FETCH_OP_AND(name, mb, cl...) \ +#define ATOMIC_FETCH_OP_AND(name) \ static __always_inline int \ __lse_atomic_fetch_and##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_andnot##name(~i, v); \ } -ATOMIC_FETCH_OP_AND(_relaxed, ) -ATOMIC_FETCH_OP_AND(_acquire, a, "memory") -ATOMIC_FETCH_OP_AND(_release, l, "memory") -ATOMIC_FETCH_OP_AND( , al, "memory") +ATOMIC_FETCH_OP_AND(_relaxed) +ATOMIC_FETCH_OP_AND(_acquire) +ATOMIC_FETCH_OP_AND(_release) +ATOMIC_FETCH_OP_AND( ) #undef ATOMIC_FETCH_OP_AND @@ -210,17 +210,17 @@ static __always_inline void __lse_atomic64_and(s64 i, atomic64_t *v) return __lse_atomic64_andnot(~i, v); } -#define ATOMIC64_FETCH_OP_AND(name, mb, cl...) 
\ +#define ATOMIC64_FETCH_OP_AND(name) \ static __always_inline long \ __lse_atomic64_fetch_and##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_andnot##name(~i, v); \ } -ATOMIC64_FETCH_OP_AND(_relaxed, ) -ATOMIC64_FETCH_OP_AND(_acquire, a, "memory") -ATOMIC64_FETCH_OP_AND(_release, l, "memory") -ATOMIC64_FETCH_OP_AND( , al, "memory") +ATOMIC64_FETCH_OP_AND(_relaxed) +ATOMIC64_FETCH_OP_AND(_acquire) +ATOMIC64_FETCH_OP_AND(_release) +ATOMIC64_FETCH_OP_AND( ) #undef ATOMIC64_FETCH_OP_AND diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index f5801b0ba9e9..9495c4441a46 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -7,7 +7,7 @@ #ifndef __ASM_BARRIER_H #define __ASM_BARRIER_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/kasan-checks.h> @@ -221,6 +221,6 @@ do { \ #include <asm-generic/barrier.h> -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_BARRIER_H */ diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 09963004ceea..dd2c8586a725 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -35,7 +35,7 @@ #define ARCH_DMA_MINALIGN (128) #define ARCH_KMALLOC_MINALIGN (8) -#if !defined(__ASSEMBLY__) && !defined(BUILD_VDSO) +#if !defined(__ASSEMBLER__) && !defined(BUILD_VDSO) #include <linux/bitops.h> #include <linux/kasan-enabled.h> @@ -135,6 +135,6 @@ static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void) return ctr; } -#endif /* !defined(__ASSEMBLY__) && !defined(BUILD_VDSO) */ +#endif /* !defined(__ASSEMBLER__) && !defined(BUILD_VDSO) */ #endif diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 9d769291a306..2c8029472ad4 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -5,7 +5,7 @@ #include <asm/cpucap-defs.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> /* * Check whether a cpucap is possible at compiletime. @@ -77,6 +77,6 @@ cpucap_is_possible(const unsigned int cap) return true; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index e223cbf350e4..4de51f8d92cb 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -19,7 +19,7 @@ #define ARM64_SW_FEATURE_OVERRIDE_HVHE 4 #define ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF 8 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bug.h> #include <linux/jump_label.h> @@ -199,7 +199,7 @@ extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0; * registers (e.g, SCTLR, TCR etc.) or patching the kernel via * alternatives. The kernel patching is batched and performed at later * point. The actions are always initiated only after the capability - * is finalised. This is usally denoted by "enabling" the capability. + * is finalised. This is usually denoted by "enabling" the capability. * The actions are initiated as follows : * a) Action is triggered on all online CPUs, after the capability is * finalised, invoked within the stop_machine() context from @@ -251,7 +251,7 @@ extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0; #define ARM64_CPUCAP_SCOPE_LOCAL_CPU ((u16)BIT(0)) #define ARM64_CPUCAP_SCOPE_SYSTEM ((u16)BIT(1)) /* - * The capabilitiy is detected on the Boot CPU and is used by kernel + * The capability is detected on the Boot CPU and is used by kernel * during early boot. 
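/*
 * (For reference, expanding the simplified macro above:
 * ATOMIC64_FETCH_OP_AND(_acquire) now generates
 *
 *	static __always_inline long
 *	__lse_atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
 *	{
 *		return __lse_atomic64_fetch_andnot_acquire(~i, v);
 *	}
 *
 * The dropped mb/cl macro parameters were never referenced in the body,
 * since the fetch-and ops are implemented entirely in terms of their
 * andnot counterparts.)
 */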
i.e., the capability should be "detected" and * "enabled" as early as possible on all booting CPUs. */ @@ -1078,6 +1078,6 @@ static inline bool cpu_has_lpa2(void) #endif } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 9b00b75acbf2..08860d482e60 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -247,9 +247,9 @@ /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX #define MIDR_FUJITSU_ERRATUM_010001_MASK (~MIDR_CPU_VAR_REV(1, 0)) -#define TCR_CLEAR_FUJITSU_ERRATUM_010001 (TCR_NFD1 | TCR_NFD0) +#define TCR_CLEAR_FUJITSU_ERRATUM_010001 (TCR_EL1_NFD1 | TCR_EL1_NFD0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/sysreg.h> @@ -328,6 +328,6 @@ static inline u32 __attribute_const__ read_cpuid_cachetype(void) { return read_cpuid(CTR_EL0); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h index 54ceae0874c7..c92912eaf186 100644 --- a/arch/arm64/include/asm/current.h +++ b/arch/arm64/include/asm/current.h @@ -4,7 +4,7 @@ #include <linux/compiler.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; @@ -23,7 +23,7 @@ static __always_inline struct task_struct *get_current(void) #define current get_current() -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_CURRENT_H */ diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index f5e3ed2420ce..8d5f92418838 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -48,7 +48,7 @@ #define AARCH32_BREAK_THUMB2_LO 0xf7f0 #define AARCH32_BREAK_THUMB2_HI 0xa000 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; #define DBG_ARCH_ID_RESERVED 0 /* In case of ptrace ABI updates. */ @@ -88,5 +88,5 @@ static inline bool try_step_suspended_breakpoints(struct pt_regs *regs) bool try_handle_aarch32_break(struct pt_regs *regs); -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_DEBUG_MONITORS_H */ diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index bcd5622aa096..aa91165ca140 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -126,21 +126,14 @@ static inline void efi_set_pgd(struct mm_struct *mm) if (mm != current->active_mm) { /* * Update the current thread's saved ttbr0 since it is - * restored as part of a return from exception. Enable - * access to the valid TTBR0_EL1 and invoke the errata - * workaround directly since there is no return from - * exception when invoking the EFI run-time services. + * restored as part of a return from exception. */ update_saved_ttbr0(current, mm); - uaccess_ttbr0_enable(); - post_ttbr_update_workaround(); } else { /* - * Defer the switch to the current thread's TTBR0_EL1 - * until uaccess_enable().
Restore the current - * thread's saved ttbr0 corresponding to its active_mm + * Restore the current thread's saved ttbr0 + * corresponding to its active_mm */ - uaccess_ttbr0_disable(); update_saved_ttbr0(current, current->active_mm); } } diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 99a7c0235e6d..cacd20df1786 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -7,7 +7,7 @@ #ifndef __ARM_KVM_INIT_H__ #define __ARM_KVM_INIT_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #error Assembly-only header #endif @@ -24,7 +24,7 @@ * ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it * can reset into an UNKNOWN state and might not read as 1 until it has * been initialized explicitly. - * Initalize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H + * Initialize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H * indicating whether the CPU is running in E2H mode. */ mrs_s x1, SYS_ID_AA64MMFR4_EL1 diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 3f93f4eef953..d2779d604c7b 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -133,7 +133,7 @@ #define ELF_ET_DYN_BASE (2 * DEFAULT_MAP_WINDOW_64 / 3) #endif /* CONFIG_ARM64_FORCE_52BIT */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <uapi/linux/elf.h> #include <linux/bug.h> @@ -293,6 +293,6 @@ static inline int arch_check_elf(void *ehdr, bool has_interp, return 0; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index e1deed824464..4975a92cbd17 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -431,7 +431,7 @@ #define ESR_ELx_IT_GCSPOPCX 6 #define ESR_ELx_IT_GCSPOPX 7 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/types.h> static inline unsigned long esr_brk_comment(unsigned long esr) @@ -534,6 +534,6 @@ static inline bool esr_iss_is_eretab(unsigned long esr) } const char *esr_get_class_string(unsigned long esr); -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ESR_H */ diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 635a43c4ec85..65555284446e 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -15,7 +15,7 @@ #ifndef _ASM_ARM64_FIXMAP_H #define _ASM_ARM64_FIXMAP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/kernel.h> #include <linux/math.h> #include <linux/sizes.h> @@ -117,5 +117,5 @@ extern void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t pr #include <asm-generic/fixmap.h> -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_ARM64_FIXMAP_H */ diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index b8cf0ea43cc0..1d2e33559bd5 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -12,7 +12,7 @@ #include <asm/sigcontext.h> #include <asm/sysreg.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bitmap.h> #include <linux/build_bug.h> diff --git a/arch/arm64/include/asm/fpu.h b/arch/arm64/include/asm/fpu.h index 2ae50bdce59b..751e88a96734 100644 --- a/arch/arm64/include/asm/fpu.h +++ b/arch/arm64/include/asm/fpu.h @@ -6,10 +6,22 @@ #ifndef __ASM_FPU_H #define __ASM_FPU_H +#include <linux/preempt.h> #include <asm/neon.h> #define kernel_fpu_available() cpu_has_neon() -#define kernel_fpu_begin() kernel_neon_begin() -#define 
kernel_fpu_end() kernel_neon_end() + +static inline void kernel_fpu_begin(void) +{ + BUG_ON(!in_task()); + preempt_disable(); + kernel_neon_begin(NULL); +} + +static inline void kernel_fpu_end(void) +{ + kernel_neon_end(NULL); + preempt_enable(); +} #endif /* ! __ASM_FPU_H */ diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index ba7cf7fec5e9..1621c84f44b3 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -37,7 +37,7 @@ */ #define ARCH_FTRACE_SHIFT_STACK_TRACER 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/compat.h> extern void _mcount(unsigned long); @@ -217,9 +217,9 @@ static inline bool arch_syscall_match_sym_name(const char *sym, */ return !strcmp(sym + 8, name); } -#endif /* ifndef __ASSEMBLY__ */ +#endif /* ifndef __ASSEMBLER__ */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_FUNCTION_GRAPH_TRACER void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, diff --git a/arch/arm64/include/asm/gpr-num.h b/arch/arm64/include/asm/gpr-num.h index 05da4a7c5788..a114e4f8209b 100644 --- a/arch/arm64/include/asm/gpr-num.h +++ b/arch/arm64/include/asm/gpr-num.h @@ -2,7 +2,7 @@ #ifndef __ASM_GPR_NUM_H #define __ASM_GPR_NUM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 .equ .L__gpr_num_x\num, \num @@ -11,7 +11,7 @@ .equ .L__gpr_num_xzr, 31 .equ .L__gpr_num_wzr, 31 -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __DEFINE_ASM_GPR_NUMS \ " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \ @@ -21,6 +21,6 @@ " .equ .L__gpr_num_xzr, 31\n" \ " .equ .L__gpr_num_wzr, 31\n" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_GPR_NUM_H */ diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 6d567265467c..1f63814ae6c4 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -46,7 +46,7 @@ #define COMPAT_HWCAP2_SB (1 << 5) #define COMPAT_HWCAP2_SSBS (1 << 6) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/log2.h> /* diff --git a/arch/arm64/include/asm/image.h b/arch/arm64/include/asm/image.h index c09cf942dc92..9ba85173f857 100644 --- a/arch/arm64/include/asm/image.h +++ b/arch/arm64/include/asm/image.h @@ -20,7 +20,7 @@ #define ARM64_IMAGE_FLAG_PAGE_SIZE_64K 3 #define ARM64_IMAGE_FLAG_PHYS_BASE 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define arm64_image_flag_field(flags, field) \ (((flags) >> field##_SHIFT) & field##_MASK) @@ -54,6 +54,6 @@ struct arm64_image_header { __le32 res5; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_IMAGE_H */ diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 18c7811774d3..e1d30ba99d01 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -12,7 +12,7 @@ #include <asm/insn-def.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ enum aarch64_insn_hint_cr_op { AARCH64_INSN_HINT_NOP = 0x0 << 5, @@ -730,6 +730,6 @@ u32 aarch32_insn_mcr_extract_crm(u32 insn); typedef bool (pstate_check_t)(unsigned long); extern pstate_check_t * const aarch32_opcode_cond_checks[16]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_INSN_H */ diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index 424ed421cd97..0cb211d3607d 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ 
-8,7 +8,7 @@ #ifndef __ASM_JUMP_LABEL_H #define __ASM_JUMP_LABEL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <asm/insn.h> @@ -58,5 +58,5 @@ l_yes: return true; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_JUMP_LABEL_H */ diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index e1b57c13f8a4..b167e9d3da91 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -2,7 +2,7 @@ #ifndef __ASM_KASAN_H #define __ASM_KASAN_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/linkage.h> #include <asm/memory.h> diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 4d9cc7a76d9c..892e5bebda95 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -25,7 +25,7 @@ #define KEXEC_ARCH KEXEC_ARCH_AARCH64 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /** * crash_setup_regs() - save registers for the panic kernel @@ -130,6 +130,6 @@ extern int load_other_segments(struct kimage *image, char *cmdline); #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h index 3184f5d1e3ae..67ef1c5532ae 100644 --- a/arch/arm64/include/asm/kgdb.h +++ b/arch/arm64/include/asm/kgdb.h @@ -14,7 +14,7 @@ #include <linux/ptrace.h> #include <asm/debug-monitors.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline void arch_kgdb_breakpoint(void) { @@ -36,7 +36,7 @@ static inline int kgdb_single_step_handler(struct pt_regs *regs, } #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * gdb remote protocol (well most versions of it) expects the following diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9da54d4ee49e..4b34f7b7ed2f 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -46,7 +46,7 @@ #define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/mm.h> @@ -303,7 +303,7 @@ void kvm_compute_final_ctr_el0(struct alt_instr *alt, void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, u64 far, u64 hpfar); -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ .macro get_host_ctxt reg, tmp adr_this_cpu \reg, kvm_host_data, \tmp diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e4069f2ce642..2dc5e6e742bb 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -49,7 +49,7 @@ * mappings, and none of this applies in that case.
*/ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include <asm/alternative.h> @@ -396,5 +396,5 @@ void kvm_s2_ptdump_create_debugfs(struct kvm *kvm); static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {} #endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/kvm_mte.h b/arch/arm64/include/asm/kvm_mte.h index de002636eb1f..3171963ad25c 100644 --- a/arch/arm64/include/asm/kvm_mte.h +++ b/arch/arm64/include/asm/kvm_mte.h @@ -5,7 +5,7 @@ #ifndef __ASM_KVM_MTE_H #define __ASM_KVM_MTE_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include <asm/sysreg.h> @@ -62,5 +62,5 @@ alternative_else_nop_endif .endm #endif /* CONFIG_ARM64_MTE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_KVM_MTE_H */ diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h index 6199c9f7ec6e..e50987b32483 100644 --- a/arch/arm64/include/asm/kvm_ptrauth.h +++ b/arch/arm64/include/asm/kvm_ptrauth.h @@ -8,7 +8,7 @@ #ifndef __ASM_KVM_PTRAUTH_H #define __ASM_KVM_PTRAUTH_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include <asm/sysreg.h> @@ -100,7 +100,7 @@ alternative_else_nop_endif .endm #endif /* CONFIG_ARM64_PTR_AUTH */ -#else /* !__ASSEMBLY */ +#else /* !__ASSEMBLER__ */ #define __ptrauth_save_key(ctxt, key) \ do { \ @@ -120,5 +120,5 @@ alternative_else_nop_endif __ptrauth_save_key(ctxt, APGA); \ } while(0) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_KVM_PTRAUTH_H */ diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index d3acd9c87509..40bd17add539 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -1,7 +1,7 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include <asm/assembler.h> #endif diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index f1505c4acb38..9d54b2ea49d6 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -207,7 +207,7 @@ */ #define TRAMP_SWAPPER_OFFSET (2 * PAGE_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bitops.h> #include <linux/compiler.h> @@ -392,7 +392,6 @@ static inline unsigned long virt_to_pfn(const void *kaddr) * virt_to_page(x) convert a _valid_ virtual address to struct page * * virt_addr_valid(x) indicates whether a virtual address is valid */ -#define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) #if defined(CONFIG_DEBUG_VIRTUAL) #define page_to_virt(x) ({ \ @@ -422,7 +421,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) }) void dump_mem_limit(void); -#endif /* !ASSEMBLY */ +#endif /* !__ASSEMBLER__ */ /* * Given that the GIC architecture permits ITS implementations that can only be diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 78a4dbf75e60..137a173df1ff 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -12,7 +12,7 @@ #define USER_ASID_FLAG (UL(1) << USER_ASID_BIT) #define TTBR_ASID_MASK (UL(0xffff) << 48) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/refcount.h> #include <asm/cpufeature.h> @@ -112,5 +112,5 @@ void kpti_install_ng_mappings(void); static inline void kpti_install_ng_mappings(void) {} #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 0dbe3b29049b..cc80af59c69e 100644 --- 
a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -8,7 +8,7 @@ #ifndef __ASM_MMU_CONTEXT_H #define __ASM_MMU_CONTEXT_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/compiler.h> #include <linux/sched.h> @@ -62,29 +62,21 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) } /* - * TCR.T0SZ value to use when the ID map is active. - */ -#define idmap_t0sz TCR_T0SZ(IDMAP_VA_BITS) - -/* * Ensure TCR.T0SZ is set to the provided value. */ static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { unsigned long tcr = read_sysreg(tcr_el1); - if ((tcr & TCR_T0SZ_MASK) == t0sz) + if ((tcr & TCR_EL1_T0SZ_MASK) == t0sz) return; - tcr &= ~TCR_T0SZ_MASK; + tcr &= ~TCR_EL1_T0SZ_MASK; tcr |= t0sz; write_sysreg(tcr, tcr_el1); isb(); } -#define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(vabits_actual)) -#define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz) - /* * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. * @@ -103,7 +95,7 @@ static inline void cpu_uninstall_idmap(void) cpu_set_reserved_ttbr0(); local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + __cpu_set_tcr_t0sz(TCR_T0SZ(vabits_actual)); if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); @@ -113,7 +105,7 @@ static inline void cpu_install_idmap(void) { cpu_set_reserved_ttbr0(); local_flush_tlb_all(); - cpu_set_idmap_tcr_t0sz(); + __cpu_set_tcr_t0sz(TCR_T0SZ(IDMAP_VA_BITS)); cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm); } @@ -330,6 +322,6 @@ static inline void deactivate_mm(struct task_struct *tsk, #include <asm-generic/mmu_context.h> -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* !__ASM_MMU_CONTEXT_H */ diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index 0f9b08e8fb8d..352139271918 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -9,7 +9,7 @@ #include <asm/cputype.h> #include <asm/mte-def.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> @@ -259,6 +259,6 @@ static inline int mte_enable_kernel_store_only(void) #endif /* CONFIG_ARM64_MTE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_MTE_KASAN_H */ diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 3b5069f4683d..6d4a78b9dc3e 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -8,7 +8,7 @@ #include <asm/compiler.h> #include <asm/mte-def.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bitfield.h> #include <linux/kasan-enabled.h> @@ -282,5 +282,5 @@ static inline void mte_check_tfsr_exit(void) } #endif /* CONFIG_KASAN_HW_TAGS */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_MTE_H */ diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index d4b1d172a79b..acebee4605b5 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -13,7 +13,7 @@ #define cpu_has_neon() system_supports_fpsimd() -void kernel_neon_begin(void); -void kernel_neon_end(void); +void kernel_neon_begin(struct user_fpsimd_state *); +void kernel_neon_end(struct user_fpsimd_state *); #endif /* ! 
__ASM_NEON_H */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 258cca4b4873..00f117ff4f7a 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -10,7 +10,7 @@ #include <asm/page-def.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/personality.h> /* for READ_IMPLIES_EXEC */ #include <linux/types.h> /* for gfp_t */ @@ -45,7 +45,7 @@ int pfn_is_map_memory(unsigned long pfn); #include <asm/memory.h> -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define VM_DATA_DEFAULT_FLAGS (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index f3b77deedfa2..d49180bb7cb3 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -228,102 +228,53 @@ /* * TCR flags. */ -#define TCR_T0SZ_OFFSET 0 -#define TCR_T1SZ_OFFSET 16 -#define TCR_T0SZ(x) ((UL(64) - (x)) << TCR_T0SZ_OFFSET) -#define TCR_T1SZ(x) ((UL(64) - (x)) << TCR_T1SZ_OFFSET) -#define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x)) -#define TCR_TxSZ_WIDTH 6 -#define TCR_T0SZ_MASK (((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T0SZ_OFFSET) -#define TCR_T1SZ_MASK (((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T1SZ_OFFSET) - -#define TCR_EPD0_SHIFT 7 -#define TCR_EPD0_MASK (UL(1) << TCR_EPD0_SHIFT) -#define TCR_IRGN0_SHIFT 8 -#define TCR_IRGN0_MASK (UL(3) << TCR_IRGN0_SHIFT) -#define TCR_IRGN0_NC (UL(0) << TCR_IRGN0_SHIFT) -#define TCR_IRGN0_WBWA (UL(1) << TCR_IRGN0_SHIFT) -#define TCR_IRGN0_WT (UL(2) << TCR_IRGN0_SHIFT) -#define TCR_IRGN0_WBnWA (UL(3) << TCR_IRGN0_SHIFT) - -#define TCR_EPD1_SHIFT 23 -#define TCR_EPD1_MASK (UL(1) << TCR_EPD1_SHIFT) -#define TCR_IRGN1_SHIFT 24 -#define TCR_IRGN1_MASK (UL(3) << TCR_IRGN1_SHIFT) -#define TCR_IRGN1_NC (UL(0) << TCR_IRGN1_SHIFT) -#define TCR_IRGN1_WBWA (UL(1) << TCR_IRGN1_SHIFT) -#define TCR_IRGN1_WT (UL(2) << TCR_IRGN1_SHIFT) -#define TCR_IRGN1_WBnWA (UL(3) << TCR_IRGN1_SHIFT) - -#define TCR_IRGN_NC (TCR_IRGN0_NC | TCR_IRGN1_NC) -#define TCR_IRGN_WBWA (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA) -#define TCR_IRGN_WT (TCR_IRGN0_WT | TCR_IRGN1_WT) -#define TCR_IRGN_WBnWA (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA) -#define TCR_IRGN_MASK (TCR_IRGN0_MASK | TCR_IRGN1_MASK) - - -#define TCR_ORGN0_SHIFT 10 -#define TCR_ORGN0_MASK (UL(3) << TCR_ORGN0_SHIFT) -#define TCR_ORGN0_NC (UL(0) << TCR_ORGN0_SHIFT) -#define TCR_ORGN0_WBWA (UL(1) << TCR_ORGN0_SHIFT) -#define TCR_ORGN0_WT (UL(2) << TCR_ORGN0_SHIFT) -#define TCR_ORGN0_WBnWA (UL(3) << TCR_ORGN0_SHIFT) - -#define TCR_ORGN1_SHIFT 26 -#define TCR_ORGN1_MASK (UL(3) << TCR_ORGN1_SHIFT) -#define TCR_ORGN1_NC (UL(0) << TCR_ORGN1_SHIFT) -#define TCR_ORGN1_WBWA (UL(1) << TCR_ORGN1_SHIFT) -#define TCR_ORGN1_WT (UL(2) << TCR_ORGN1_SHIFT) -#define TCR_ORGN1_WBnWA (UL(3) << TCR_ORGN1_SHIFT) - -#define TCR_ORGN_NC (TCR_ORGN0_NC | TCR_ORGN1_NC) -#define TCR_ORGN_WBWA (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA) -#define TCR_ORGN_WT (TCR_ORGN0_WT | TCR_ORGN1_WT) -#define TCR_ORGN_WBnWA (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA) -#define TCR_ORGN_MASK (TCR_ORGN0_MASK | TCR_ORGN1_MASK) - -#define TCR_SH0_SHIFT 12 -#define TCR_SH0_MASK (UL(3) << TCR_SH0_SHIFT) -#define TCR_SH0_INNER (UL(3) << TCR_SH0_SHIFT) - -#define TCR_SH1_SHIFT 28 -#define TCR_SH1_MASK (UL(3) << TCR_SH1_SHIFT) -#define TCR_SH1_INNER (UL(3) << TCR_SH1_SHIFT) -#define TCR_SHARED (TCR_SH0_INNER | TCR_SH1_INNER) - -#define TCR_TG0_SHIFT 14 -#define TCR_TG0_MASK (UL(3) << TCR_TG0_SHIFT) -#define TCR_TG0_4K (UL(0) << TCR_TG0_SHIFT) -#define TCR_TG0_64K (UL(1) << 
TCR_TG0_SHIFT) -#define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT) - -#define TCR_TG1_SHIFT 30 -#define TCR_TG1_MASK (UL(3) << TCR_TG1_SHIFT) -#define TCR_TG1_16K (UL(1) << TCR_TG1_SHIFT) -#define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) -#define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) - -#define TCR_IPS_SHIFT 32 -#define TCR_IPS_MASK (UL(7) << TCR_IPS_SHIFT) -#define TCR_A1 (UL(1) << 22) -#define TCR_ASID16 (UL(1) << 36) -#define TCR_TBI0 (UL(1) << 37) -#define TCR_TBI1 (UL(1) << 38) -#define TCR_HA (UL(1) << 39) -#define TCR_HD (UL(1) << 40) -#define TCR_HPD0_SHIFT 41 -#define TCR_HPD0 (UL(1) << TCR_HPD0_SHIFT) -#define TCR_HPD1_SHIFT 42 -#define TCR_HPD1 (UL(1) << TCR_HPD1_SHIFT) -#define TCR_TBID0 (UL(1) << 51) -#define TCR_TBID1 (UL(1) << 52) -#define TCR_NFD0 (UL(1) << 53) -#define TCR_NFD1 (UL(1) << 54) -#define TCR_E0PD0 (UL(1) << 55) -#define TCR_E0PD1 (UL(1) << 56) -#define TCR_TCMA0 (UL(1) << 57) -#define TCR_TCMA1 (UL(1) << 58) -#define TCR_DS (UL(1) << 59) +#define TCR_T0SZ(x) ((UL(64) - (x)) << TCR_EL1_T0SZ_SHIFT) +#define TCR_T1SZ(x) ((UL(64) - (x)) << TCR_EL1_T1SZ_SHIFT) + +#define TCR_T0SZ_MASK TCR_EL1_T0SZ_MASK +#define TCR_T1SZ_MASK TCR_EL1_T1SZ_MASK + +#define TCR_EPD0_MASK TCR_EL1_EPD0_MASK +#define TCR_EPD1_MASK TCR_EL1_EPD1_MASK + +#define TCR_IRGN0_MASK TCR_EL1_IRGN0_MASK +#define TCR_IRGN0_WBWA (TCR_EL1_IRGN0_WBWA << TCR_EL1_IRGN0_SHIFT) + +#define TCR_ORGN0_MASK TCR_EL1_ORGN0_MASK +#define TCR_ORGN0_WBWA (TCR_EL1_ORGN0_WBWA << TCR_EL1_ORGN0_SHIFT) + +#define TCR_SH0_MASK TCR_EL1_SH0_MASK +#define TCR_SH0_INNER (TCR_EL1_SH0_INNER << TCR_EL1_SH0_SHIFT) + +#define TCR_SH1_MASK TCR_EL1_SH1_MASK + +#define TCR_TG0_SHIFT TCR_EL1_TG0_SHIFT +#define TCR_TG0_MASK TCR_EL1_TG0_MASK +#define TCR_TG0_4K (TCR_EL1_TG0_4K << TCR_EL1_TG0_SHIFT) +#define TCR_TG0_64K (TCR_EL1_TG0_64K << TCR_EL1_TG0_SHIFT) +#define TCR_TG0_16K (TCR_EL1_TG0_16K << TCR_EL1_TG0_SHIFT) + +#define TCR_TG1_SHIFT TCR_EL1_TG1_SHIFT +#define TCR_TG1_MASK TCR_EL1_TG1_MASK +#define TCR_TG1_16K (TCR_EL1_TG1_16K << TCR_EL1_TG1_SHIFT) +#define TCR_TG1_4K (TCR_EL1_TG1_4K << TCR_EL1_TG1_SHIFT) +#define TCR_TG1_64K (TCR_EL1_TG1_64K << TCR_EL1_TG1_SHIFT) + +#define TCR_IPS_SHIFT TCR_EL1_IPS_SHIFT +#define TCR_IPS_MASK TCR_EL1_IPS_MASK +#define TCR_A1 TCR_EL1_A1 +#define TCR_ASID16 TCR_EL1_AS +#define TCR_TBI0 TCR_EL1_TBI0 +#define TCR_TBI1 TCR_EL1_TBI1 +#define TCR_HA TCR_EL1_HA +#define TCR_HD TCR_EL1_HD +#define TCR_HPD0 TCR_EL1_HPD0 +#define TCR_HPD1 TCR_EL1_HPD1 +#define TCR_TBID0 TCR_EL1_TBID0 +#define TCR_TBID1 TCR_EL1_TBID1 +#define TCR_E0PD0 TCR_EL1_E0PD0 +#define TCR_E0PD1 TCR_EL1_E0PD1 +#define TCR_DS TCR_EL1_DS /* * TTBR. diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 85dceb1c66f4..161e8660eddd 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -62,7 +62,7 @@ #define _PAGE_READONLY_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) #define _PAGE_EXECONLY (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/cpufeature.h> #include <asm/pgtable-types.h> @@ -84,7 +84,7 @@ extern unsigned long prot_ns_shared; #else static inline bool __pure lpa2_is_enabled(void) { - return read_tcr() & TCR_DS; + return read_tcr() & TCR_EL1_DS; } #define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 
0 : PTE_SHARED) @@ -127,7 +127,7 @@ static inline bool __pure lpa2_is_enabled(void) #define PAGE_READONLY_EXEC __pgprot(_PAGE_READONLY_EXEC) #define PAGE_EXECONLY __pgprot(_PAGE_EXECONLY) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define pte_pi_index(pte) ( \ ((pte & BIT(PTE_PI_IDX_3)) >> (PTE_PI_IDX_3 - 3)) | \ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0944e296dd4a..64d5f1d9cce9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -30,7 +30,7 @@ #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/cmpxchg.h> #include <asm/fixmap.h> @@ -130,12 +130,16 @@ static inline void arch_leave_lazy_mmu_mode(void) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Outside of a few very special situations (e.g. hibernation), we always - * use broadcast TLB invalidation instructions, therefore a spurious page - * fault on one CPU which has been handled concurrently by another CPU - * does not need to perform additional invalidation. + * We use local TLB invalidation instruction when reusing page in + * write protection fault handler to avoid TLBI broadcast in the hot + * path. This will cause spurious page faults if stale read-only TLB + * entries exist. */ -#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) +#define flush_tlb_fix_spurious_fault(vma, address, ptep) \ + local_flush_tlb_page_nonotify(vma, address) + +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \ + local_flush_tlb_page_nonotify(vma, address) /* * ZERO_PAGE is a global shared page that is always zero: used @@ -433,7 +437,7 @@ bool pgattr_change_is_safe(pteval_t old, pteval_t new); * 1 0 | 1 0 1 * 1 1 | 0 1 x * - * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via + * When hardware DBM is not present, the software PTE_DIRTY bit is updated via * the page fault mechanism. Checking the dirty status of a pte becomes: * * PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY) @@ -599,7 +603,7 @@ static inline int pte_protnone(pte_t pte) /* * pte_present_invalid() tells us that the pte is invalid from HW * perspective but present from SW perspective, so the fields are to be - * interpretted as per the HW layout. The second 2 checks are the unique + * interpreted as per the HW layout. The second 2 checks are the unique * encoding that we use for PROT_NONE. 
It is insufficient to only use * the first check because we share the same encoding scheme with pmds * which support pmd_mkinvalid(), so can be present-invalid without @@ -1949,6 +1953,6 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, #endif /* CONFIG_ARM64_CONTPTE */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/include/asm/proc-fns.h b/arch/arm64/include/asm/proc-fns.h index 0d5d1f0525eb..ab78a78821a2 100644 --- a/arch/arm64/include/asm/proc-fns.h +++ b/arch/arm64/include/asm/proc-fns.h @@ -9,7 +9,7 @@ #ifndef __ASM_PROCFNS_H #define __ASM_PROCFNS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/page.h> @@ -21,5 +21,5 @@ extern u64 cpu_do_resume(phys_addr_t ptr, u64 idmap_ttbr); #include <asm/memory.h> -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_PROCFNS_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 61d62bfd5a7b..e30c4c8e3a7a 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -25,7 +25,7 @@ #define MTE_CTRL_STORE_ONLY (1UL << 19) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/build_bug.h> #include <linux/cache.h> @@ -172,7 +172,12 @@ struct thread_struct { unsigned long fault_code; /* ESR_EL1 value */ struct debug_info debug; /* debugging */ - struct user_fpsimd_state kernel_fpsimd_state; + /* + * Set [cleared] by kernel_neon_begin() [kernel_neon_end()] to the + * address of a caller provided buffer that will be used to preserve a + * task's kernel mode FPSIMD state while it is scheduled out. + */ + struct user_fpsimd_state *kernel_fpsimd_state; unsigned int kernel_fpsimd_cpu; #ifdef CONFIG_ARM64_PTR_AUTH struct ptrauth_keys_user keys_user; @@ -437,5 +442,5 @@ int set_tsc_mode(unsigned int val); #define GET_TSC_CTL(adr) get_tsc_mode((adr)) #define SET_TSC_CTL(val) set_tsc_mode((val)) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 65b053a24d82..39582511ad72 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -94,7 +94,7 @@ */ #define NO_SYSCALL (-1) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bug.h> #include <linux/types.h> @@ -361,5 +361,5 @@ static inline void procedure_link_pointer_set(struct pt_regs *regs, extern unsigned long profile_pc(struct pt_regs *regs); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h index 6cb070eca9e9..e19253f96c94 100644 --- a/arch/arm64/include/asm/rsi_smc.h +++ b/arch/arm64/include/asm/rsi_smc.h @@ -122,7 +122,7 @@ */ #define SMC_RSI_ATTESTATION_TOKEN_CONTINUE SMC_RSI_FID(0x195) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct realm_config { union { @@ -142,7 +142,7 @@ struct realm_config { */ } __aligned(0x1000); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Read configuration for the current Realm. 
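The processor.h hunk above, together with the earlier neon.h signature change, reworks the kernel-mode FP/SIMD convention: kernel_neon_begin() and kernel_neon_end() now take a pointer to a caller-provided struct user_fpsimd_state, which the context-switch code uses to preserve the task's kernel-mode state. A minimal sketch of a direct caller under the new convention (the NEON worker and its scalar fallback are hypothetical names, not part of the series):

#include <linux/types.h>
#include <asm/neon.h>
#include <asm/simd.h>

static void do_neon_work(u8 *buf, int blocks)
{
	/* Caller-provided save area; must stay live until kernel_neon_end(). */
	struct user_fpsimd_state st;

	if (!may_use_simd()) {
		do_scalar_work(buf, blocks);	/* hypothetical fallback */
		return;
	}

	kernel_neon_begin(&st);
	do_neon_blocks(buf, blocks);		/* hypothetical NEON body */
	kernel_neon_end(&st);
}

Passing NULL instead of a buffer is only tolerated when the call can neither be preempted nor arrive in softirq context, which is why the fpu.h wrapper above checks in_task() and disables preemption before calling kernel_neon_begin(NULL).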
diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h index 97d9256d33c9..78beceec10cd 100644 --- a/arch/arm64/include/asm/rwonce.h +++ b/arch/arm64/include/asm/rwonce.h @@ -5,7 +5,7 @@ #ifndef __ASM_RWONCE_H #define __ASM_RWONCE_H -#if defined(CONFIG_LTO) && !defined(__ASSEMBLY__) +#if defined(CONFIG_LTO) && !defined(__ASSEMBLER__) #include <linux/compiler_types.h> #include <asm/alternative-macros.h> @@ -62,7 +62,7 @@ }) #endif /* !BUILD_VDSO */ -#endif /* CONFIG_LTO && !__ASSEMBLY__ */ +#endif /* CONFIG_LTO && !__ASSEMBLER__ */ #include <asm-generic/rwonce.h> diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index c59f6324f2bb..0fbc2e7867d3 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -2,7 +2,7 @@ #ifndef _ASM_SCS_H #define _ASM_SCS_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include <asm/asm-offsets.h> #include <asm/sysreg.h> @@ -55,6 +55,6 @@ enum { int __pi_scs_patch(const u8 eh_frame[], int size, bool skip_dry_run); -#endif /* __ASSEMBLY __ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_SCS_H */ diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h index 484cb6972e99..b2248bd3cb58 100644 --- a/arch/arm64/include/asm/sdei.h +++ b/arch/arm64/include/asm/sdei.h @@ -9,7 +9,7 @@ #define SDEI_STACK_SIZE IRQ_STACK_SIZE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/linkage.h> #include <linux/preempt.h> @@ -49,5 +49,5 @@ unsigned long do_sdei_event(struct pt_regs *regs, unsigned long sdei_arch_get_entry_point(int conduit); #define sdei_arch_get_entry_point(x) sdei_arch_get_entry_point(x) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SDEI_H */ diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h index 8e86c9e70e48..0941f6f58a14 100644 --- a/arch/arm64/include/asm/simd.h +++ b/arch/arm64/include/asm/simd.h @@ -6,12 +6,15 @@ #ifndef __ASM_SIMD_H #define __ASM_SIMD_H +#include <linux/cleanup.h> #include <linux/compiler.h> #include <linux/irqflags.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/types.h> +#include <asm/neon.h> + #ifdef CONFIG_KERNEL_MODE_NEON /* @@ -29,7 +32,7 @@ static __must_check inline bool may_use_simd(void) */ return !WARN_ON(!system_capabilities_finalized()) && system_supports_fpsimd() && - !in_hardirq() && !irqs_disabled() && !in_nmi(); + !in_hardirq() && !in_nmi(); } #else /* ! CONFIG_KERNEL_MODE_NEON */ @@ -40,4 +43,11 @@ static __must_check inline bool may_use_simd(void) { #endif /* ! 
CONFIG_KERNEL_MODE_NEON */ +DEFINE_LOCK_GUARD_1(ksimd, + struct user_fpsimd_state, + kernel_neon_begin(_T->lock), + kernel_neon_end(_T->lock)) + +#define scoped_ksimd() scoped_guard(ksimd, &(struct user_fpsimd_state){}) + #endif diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index d48ef6d5abcc..10ea4f543069 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -23,7 +23,7 @@ #define CPU_STUCK_REASON_52_BIT_VA (UL(1) << CPU_STUCK_REASON_SHIFT) #define CPU_STUCK_REASON_NO_GRAN (UL(2) << CPU_STUCK_REASON_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/threads.h> #include <linux/cpumask.h> @@ -155,6 +155,6 @@ bool cpus_are_stuck_in_kernel(void); extern void crash_smp_send_stop(void); extern bool smp_crash_stop_failed(void); -#endif /* ifndef __ASSEMBLY__ */ +#endif /* ifndef __ASSEMBLER__ */ #endif /* ifndef __ASM_SMP_H */ diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 900454aaa292..296ae3420bfd 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -12,7 +12,7 @@ #define BP_HARDEN_EL2_SLOTS 4 #define __BP_HARDEN_HYP_VECS_SZ ((BP_HARDEN_EL2_SLOTS - 1) * SZ_2K) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/smp.h> #include <asm/percpu.h> @@ -119,5 +119,5 @@ void spectre_bhb_patch_clearbhb(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void spectre_print_disabled_mitigations(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/include/asm/stacktrace/frame.h b/arch/arm64/include/asm/stacktrace/frame.h index 0ee0f6ba0fd8..796797b8db7e 100644 --- a/arch/arm64/include/asm/stacktrace/frame.h +++ b/arch/arm64/include/asm/stacktrace/frame.h @@ -25,7 +25,7 @@ #define FRAME_META_TYPE_FINAL 1 #define FRAME_META_TYPE_PT_REGS 2 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * A standard AAPCS64 frame record. */ @@ -43,6 +43,6 @@ struct frame_record_meta { struct frame_record record; u64 type; }; -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_STACKTRACE_FRAME_H */ diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 0cde2f473971..e65f33edf9d6 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -23,7 +23,7 @@ struct cpu_suspend_ctx { * __cpu_suspend_enter()'s caller, and populated by __cpu_suspend_enter(). * This data must survive until cpu_resume() is called. * - * This struct desribes the size and the layout of the saved cpu state. + * This struct describes the size and the layout of the saved cpu state. * The layout of the callee_saved_regs is defined by the implementation * of __cpu_suspend_enter(), and cpu_resume(). This struct must be passed * in by the caller as __cpu_suspend_enter()'s stack-frame is gone once it diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index c231d2a3e515..9df51accbb02 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -52,7 +52,7 @@ #ifndef CONFIG_BROKEN_GAS_INST -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ // The space separator is omitted so that __emit_inst(x) can be parsed as // either an assembler directive or an assembler macro argument. 
#define __emit_inst(x) .inst(x) @@ -71,11 +71,11 @@ (((x) >> 24) & 0x000000ff)) #endif /* CONFIG_CPU_BIG_ENDIAN */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __emit_inst(x) .long __INSTR_BSWAP(x) -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __emit_inst(x) ".long " __stringify(__INSTR_BSWAP(x)) "\n\t" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_BROKEN_GAS_INST */ @@ -1129,9 +1129,7 @@ #define gicr_insn(insn) read_sysreg_s(GICV5_OP_GICR_##insn) #define gic_insn(v, insn) write_sysreg_s(v, GICV5_OP_GIC_##insn) -#define ARM64_FEATURE_FIELD_BITS 4 - -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro mrs_s, rt, sreg __emit_inst(0xd5200000|(\sreg)|(.L__gpr_num_\rt)) diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index 344b1c1a4bbb..d316a804eb38 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -7,7 +7,7 @@ #ifndef __ASM_SYSTEM_MISC_H #define __ASM_SYSTEM_MISC_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/compiler.h> #include <linux/linkage.h> @@ -28,6 +28,6 @@ void arm64_notify_die(const char *str, struct pt_regs *regs, struct mm_struct; extern void __show_regs(struct pt_regs *); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SYSTEM_MISC_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index f241b8601ebd..a803b887b0b4 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -10,7 +10,7 @@ #include <linux/compiler.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 18a5dc0c9a54..a2d65d7d6aae 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -8,7 +8,7 @@ #ifndef __ASM_TLBFLUSH_H #define __ASM_TLBFLUSH_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bitfield.h> #include <linux/mm_types.h> @@ -249,6 +249,19 @@ static inline unsigned long get_trans_granule(void) * cannot be easily determined, the value TLBI_TTL_UNKNOWN will * perform a non-hinted invalidation. * + * local_flush_tlb_page(vma, addr) + * Local variant of flush_tlb_page(). Stale TLB entries may + * remain in remote CPUs. + * + * local_flush_tlb_page_nonotify(vma, addr) + * Same as local_flush_tlb_page() except MMU notifier will not be + * called. + * + * local_flush_tlb_contpte(vma, addr) + * Invalidate the virtual-address range + * '[addr, addr+CONT_PTE_SIZE)' mapped with contpte on local CPU + * for the user address space corresponding to 'vma->mm'. Stale + * TLB entries may remain in remote CPUs. 
* * Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented * on top of these routines, since that is our interface to the mmu_gather @@ -282,6 +295,33 @@ static inline void flush_tlb_mm(struct mm_struct *mm) mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } +static inline void __local_flush_tlb_page_nonotify_nosync(struct mm_struct *mm, + unsigned long uaddr) +{ + unsigned long addr; + + dsb(nshst); + addr = __TLBI_VADDR(uaddr, ASID(mm)); + __tlbi(vale1, addr); + __tlbi_user(vale1, addr); +} + +static inline void local_flush_tlb_page_nonotify(struct vm_area_struct *vma, + unsigned long uaddr) +{ + __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr); + dsb(nsh); +} + +static inline void local_flush_tlb_page(struct vm_area_struct *vma, + unsigned long uaddr) +{ + __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, uaddr & PAGE_MASK, + (uaddr & PAGE_MASK) + PAGE_SIZE); + dsb(nsh); +} + static inline void __flush_tlb_page_nosync(struct mm_struct *mm, unsigned long uaddr) { @@ -472,6 +512,22 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, dsb(ish); } +static inline void local_flush_tlb_contpte(struct vm_area_struct *vma, + unsigned long addr) +{ + unsigned long asid; + + addr = round_down(addr, CONT_PTE_SIZE); + + dsb(nshst); + asid = ASID(vma->vm_mm); + __flush_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid, + 3, true, lpa2_is_enabled()); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, addr, + addr + CONT_PTE_SIZE); + dsb(nsh); +} + static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { @@ -524,6 +580,33 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { __flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3); } + +static inline bool __pte_flags_need_flush(ptdesc_t oldval, ptdesc_t newval) +{ + ptdesc_t diff = oldval ^ newval; + + /* invalid to valid transition requires no flush */ + if (!(oldval & PTE_VALID)) + return false; + + /* Transition in the SW bits requires no flush */ + diff &= ~PTE_SWBITS_MASK; + + return diff; +} + +static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) +{ + return __pte_flags_need_flush(pte_val(oldpte), pte_val(newpte)); +} +#define pte_needs_flush pte_needs_flush + +static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) +{ + return __pte_flags_need_flush(pmd_val(oldpmd), pmd_val(newpmd)); +} +#define huge_pmd_needs_flush huge_pmd_needs_flush + #endif #endif diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h index 61679070f595..232b46969088 100644 --- a/arch/arm64/include/asm/vdso.h +++ b/arch/arm64/include/asm/vdso.h @@ -7,7 +7,7 @@ #define __VDSO_PAGES 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <generated/vdso-offsets.h> @@ -19,6 +19,6 @@ extern char vdso_start[], vdso_end[]; extern char vdso32_start[], vdso32_end[]; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_H */ diff --git a/arch/arm64/include/asm/vdso/compat_barrier.h b/arch/arm64/include/asm/vdso/compat_barrier.h index 6d75e03d3827..d7ebe7ceefa0 100644 --- a/arch/arm64/include/asm/vdso/compat_barrier.h +++ b/arch/arm64/include/asm/vdso/compat_barrier.h @@ -5,7 +5,7 @@ #ifndef __COMPAT_BARRIER_H #define __COMPAT_BARRIER_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Warning: This code is meant to be used from the compat vDSO only. 
*/ @@ -31,6 +31,6 @@ #define smp_rmb() aarch32_smp_rmb() #define smp_wmb() aarch32_smp_wmb() -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __COMPAT_BARRIER_H */ diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index 7d1a116549b1..0d513f924321 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -5,7 +5,7 @@ #ifndef __ASM_VDSO_COMPAT_GETTIMEOFDAY_H #define __ASM_VDSO_COMPAT_GETTIMEOFDAY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/barrier.h> #include <asm/unistd_compat_32.h> @@ -161,6 +161,6 @@ static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) } #define vdso_clocksource_ok vdso_clocksource_ok -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_COMPAT_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/getrandom.h b/arch/arm64/include/asm/vdso/getrandom.h index a2197da1951b..da1d58bbfabe 100644 --- a/arch/arm64/include/asm/vdso/getrandom.h +++ b/arch/arm64/include/asm/vdso/getrandom.h @@ -3,7 +3,7 @@ #ifndef __ASM_VDSO_GETRANDOM_H #define __ASM_VDSO_GETRANDOM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/unistd.h> #include <asm/vdso/vsyscall.h> @@ -33,6 +33,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns return ret; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index c59e84105b43..3658a757e255 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -7,7 +7,7 @@ #ifdef __aarch64__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/alternative.h> #include <asm/arch_timer.h> @@ -96,7 +96,7 @@ static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data( #define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data #endif /* IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #else /* !__aarch64__ */ diff --git a/arch/arm64/include/asm/vdso/processor.h b/arch/arm64/include/asm/vdso/processor.h index ff830b766ad2..7abb0cc81cd6 100644 --- a/arch/arm64/include/asm/vdso/processor.h +++ b/arch/arm64/include/asm/vdso/processor.h @@ -5,13 +5,13 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline void cpu_relax(void) { asm volatile("yield" ::: "memory"); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h index 417aae5763a8..3f3c8eb74e2e 100644 --- a/arch/arm64/include/asm/vdso/vsyscall.h +++ b/arch/arm64/include/asm/vdso/vsyscall.h @@ -2,7 +2,7 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <vdso/datapage.h> @@ -22,6 +22,6 @@ void __arch_update_vdso_clock(struct vdso_clock *vc) /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index aa280f356b96..530af9620fdb 100644 --- a/arch/arm64/include/asm/virt.h +++ 
b/arch/arm64/include/asm/virt.h @@ -56,7 +56,7 @@ */ #define BOOT_CPU_FLAG_E2H BIT_ULL(32) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/ptrace.h> #include <asm/sections.h> @@ -161,6 +161,6 @@ static inline bool is_hyp_nvhe(void) return is_hyp_mode_available() && !is_kernel_in_hyp_mode(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* ! __ASM__VIRT_H */ diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 20873099c035..75daee1a07e9 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -3,9 +3,7 @@ #ifndef __ASM_VMAP_STACK_H #define __ASM_VMAP_STACK_H -#include <linux/bug.h> #include <linux/gfp.h> -#include <linux/kconfig.h> #include <linux/vmalloc.h> #include <linux/pgtable.h> #include <asm/memory.h> @@ -19,8 +17,6 @@ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { void *p; - BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, __builtin_return_address(0)); return kasan_reset_tag(p); diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h index befcd8a7abc9..c38e3d017a79 100644 --- a/arch/arm64/include/asm/xor.h +++ b/arch/arm64/include/asm/xor.h @@ -9,7 +9,7 @@ #include <linux/hardirq.h> #include <asm-generic/xor.h> #include <asm/hwcap.h> -#include <asm/neon.h> +#include <asm/simd.h> #ifdef CONFIG_KERNEL_MODE_NEON @@ -19,9 +19,8 @@ static void xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2) { - kernel_neon_begin(); - xor_block_inner_neon.do_2(bytes, p1, p2); - kernel_neon_end(); + scoped_ksimd() + xor_block_inner_neon.do_2(bytes, p1, p2); } static void @@ -29,9 +28,8 @@ xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2, const unsigned long * __restrict p3) { - kernel_neon_begin(); - xor_block_inner_neon.do_3(bytes, p1, p2, p3); - kernel_neon_end(); + scoped_ksimd() + xor_block_inner_neon.do_3(bytes, p1, p2, p3); } static void @@ -40,9 +38,8 @@ xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p3, const unsigned long * __restrict p4) { - kernel_neon_begin(); - xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4); - kernel_neon_end(); + scoped_ksimd() + xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4); } static void @@ -52,9 +49,8 @@ xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p4, const unsigned long * __restrict p5) { - kernel_neon_begin(); - xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5); - kernel_neon_end(); + scoped_ksimd() + xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5); } static struct xor_block_template xor_block_arm64 = { diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index ed5f3892674c..a792a599b9d6 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -31,7 +31,7 @@ #define KVM_SPSR_FIQ 4 #define KVM_NR_SPSR 5 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/psci.h> #include <linux/types.h> #include <asm/ptrace.h> diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 0f39ba4f3efd..6fed93fb2536 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -80,7 +80,7 @@ #define PTRACE_PEEKMTETAGS 33 #define PTRACE_POKEMTETAGS 34 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * User structures for 
general purpose, floating point and debug registers. @@ -332,6 +332,6 @@ struct user_gcs { __u64 gcspr_el0; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index d42f7a92238b..e29bf3e2d0cc 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -17,7 +17,7 @@ #ifndef _UAPI__ASM_SIGCONTEXT_H #define _UAPI__ASM_SIGCONTEXT_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> @@ -192,7 +192,7 @@ struct gcs_context { __u64 reserved; }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #include <asm/sve_context.h> diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index f1cb2447afc9..af90128cfed5 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -133,7 +133,7 @@ static int __init acpi_fadt_sanity_check(void) /* * FADT is required on arm64; retrieve it to check its presence - * and carry out revision and ACPI HW reduced compliancy tests + * and carry out revision and ACPI HW reduced compliance tests */ status = acpi_get_table(ACPI_SIG_FADT, 0, &table); if (ACPI_FAILURE(status)) { @@ -423,7 +423,7 @@ int apei_claim_sea(struct pt_regs *regs) irq_work_run(); __irq_exit(); } else { - pr_warn_ratelimited("APEI work queued but not completed"); + pr_warn_ratelimited("APEI work queued but not completed\n"); err = -EINPROGRESS; } } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e25b0f84a22d..42b182cfa404 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1003,7 +1003,7 @@ static void __init sort_ftr_regs(void) /* * Initialise the CPU feature register from Boot CPU values. - * Also initiliases the strict_mask for the register. + * Also initialises the strict_mask for the register. * Any bits that are not covered by an arm64_ftr_bits entry are considered * RES0 for the system-wide value, and must strictly match. */ @@ -1970,7 +1970,7 @@ static struct cpumask dbm_cpus __read_mostly; static inline void __cpu_enable_hw_dbm(void) { - u64 tcr = read_sysreg(tcr_el1) | TCR_HD; + u64 tcr = read_sysreg(tcr_el1) | TCR_EL1_HD; write_sysreg(tcr, tcr_el1); isb(); @@ -2256,7 +2256,7 @@ static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap) { if (this_cpu_has_cap(ARM64_HAS_E0PD)) - sysreg_clear_set(tcr_el1, 0, TCR_E0PD1); + sysreg_clear_set(tcr_el1, 0, TCR_EL1_E0PD1); } #endif /* CONFIG_ARM64_E0PD */ diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 6c371b158b99..a81cb4aa4738 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -10,6 +10,7 @@ #include <linux/efi.h> #include <linux/init.h> #include <linux/kmemleak.h> +#include <linux/kthread.h> #include <linux/screen_info.h> #include <linux/vmalloc.h> @@ -165,20 +166,53 @@ asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f) return s; } -static DEFINE_RAW_SPINLOCK(efi_rt_lock); - void arch_efi_call_virt_setup(void) { - efi_virtmap_load(); - raw_spin_lock(&efi_rt_lock); + efi_runtime_assert_lock_held(); + + if (preemptible() && (current->flags & PF_KTHREAD)) { + /* + * Disable migration to ensure that a preempted EFI runtime + * service call will be resumed on the same CPU. 
This avoids + * potential issues with EFI runtime calls that are preempted + * while polling for an asynchronous completion of a secure + * firmware call, which may not permit the CPU to change. + */ + migrate_disable(); + kthread_use_mm(&efi_mm); + } else { + efi_virtmap_load(); + } + + /* + * Enable access to the valid TTBR0_EL1 and invoke the errata + * workaround directly since there is no return from exception when + * invoking the EFI run-time services. + */ + uaccess_ttbr0_enable(); + post_ttbr_update_workaround(); + __efi_fpsimd_begin(); } void arch_efi_call_virt_teardown(void) { __efi_fpsimd_end(); - raw_spin_unlock(&efi_rt_lock); - efi_virtmap_unload(); + + /* + * Defer the switch to the current thread's TTBR0_EL1 until + * uaccess_enable(). Do so before efi_virtmap_unload() updates the + * saved TTBR0 value, so the userland page tables are not activated + * inadvertently over the back of an exception. + */ + uaccess_ttbr0_disable(); + + if (preemptible() && (current->flags & PF_KTHREAD)) { + kthread_unuse_mm(&efi_mm); + migrate_enable(); + } else { + efi_virtmap_unload(); + } } asmlinkage u64 *efi_rt_stack_top __ro_after_init; diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 0a97e2621f60..3625797e9ee8 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -34,20 +34,12 @@ * Handle IRQ/context state management when entering from kernel mode. * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. - * - * This is intended to match the logic in irqentry_enter(), handling the kernel - * mode transitions only. */ -static __always_inline irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs) -{ - return irqentry_enter(regs); -} - static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) { irqentry_state_t state; - state = __enter_from_kernel_mode(regs); + state = irqentry_enter(regs); mte_check_tfsr_entry(); mte_disable_tco_entry(current); @@ -58,21 +50,12 @@ static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) * Handle IRQ/context state management when exiting to kernel mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. - * - * This is intended to match the logic in irqentry_exit(), handling the kernel - * mode transitions only, and with preemption handled elsewhere. */ -static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs, - irqentry_state_t state) -{ - irqentry_exit(regs, state); -} - static void noinstr exit_to_kernel_mode(struct pt_regs *regs, irqentry_state_t state) { mte_check_tfsr_exit(); - __exit_to_kernel_mode(regs, state); + irqentry_exit(regs, state); } /* @@ -80,17 +63,12 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs, * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static __always_inline void __enter_from_user_mode(struct pt_regs *regs) +static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); mte_disable_tco_entry(current); } -static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); -} - /* * Handle IRQ/context state management when exiting to user mode. 
* After this function returns it is not safe to call regular kernel code, diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 169ccf600066..025140caafe7 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -94,7 +94,7 @@ SYM_CODE_START(ftrace_caller) stp x29, x30, [sp, #FREGS_SIZE] add x29, sp, #FREGS_SIZE - /* Prepare arguments for the the tracer func */ + /* Prepare arguments for the tracer func */ sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn) mov x1, x9 // parent_ip (callsite's LR) mov x3, sp // regs diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index e3f8f51748bc..c154f72634e0 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -225,10 +225,21 @@ static void fpsimd_bind_task_to_cpu(void); */ static void get_cpu_fpsimd_context(void) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_bh_disable(); - else + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* + * The softirq subsystem lacks a true unmask/mask API, and + * re-enabling softirq processing using local_bh_enable() will + * not only unmask softirqs, it will also result in immediate + * delivery of any pending softirqs. + * This is undesirable when running with IRQs disabled, but in + * that case, there is no need to mask softirqs in the first + * place, so only bother doing so when IRQs are enabled. + */ + if (!irqs_disabled()) + local_bh_disable(); + } else { preempt_disable(); + } } /* @@ -240,10 +251,12 @@ static void get_cpu_fpsimd_context(void) */ static void put_cpu_fpsimd_context(void) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_bh_enable(); - else + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (!irqs_disabled()) + local_bh_enable(); + } else { preempt_enable(); + } } unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) @@ -1489,21 +1502,23 @@ static void fpsimd_load_kernel_state(struct task_struct *task) * Elide the load if this CPU holds the most recent kernel mode * FPSIMD context of the current task. */ - if (last->st == &task->thread.kernel_fpsimd_state && + if (last->st == task->thread.kernel_fpsimd_state && task->thread.kernel_fpsimd_cpu == smp_processor_id()) return; - fpsimd_load_state(&task->thread.kernel_fpsimd_state); + fpsimd_load_state(task->thread.kernel_fpsimd_state); } static void fpsimd_save_kernel_state(struct task_struct *task) { struct cpu_fp_state cpu_fp_state = { - .st = &task->thread.kernel_fpsimd_state, + .st = task->thread.kernel_fpsimd_state, .to_save = FP_STATE_FPSIMD, }; - fpsimd_save_state(&task->thread.kernel_fpsimd_state); + BUG_ON(!cpu_fp_state.st); + + fpsimd_save_state(task->thread.kernel_fpsimd_state); fpsimd_bind_state_to_cpu(&cpu_fp_state); task->thread.kernel_fpsimd_cpu = smp_processor_id(); @@ -1774,6 +1789,7 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) void fpsimd_flush_task_state(struct task_struct *t) { t->thread.fpsimd_cpu = NR_CPUS; + t->thread.kernel_fpsimd_state = NULL; /* * If we don't support fpsimd, bail out after we have * reset the fpsimd_cpu for this task and clear the @@ -1833,12 +1849,19 @@ void fpsimd_save_and_flush_cpu_state(void) * * The caller may freely use the FPSIMD registers until kernel_neon_end() is * called. 
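The comment added to get_cpu_fpsimd_context() above explains why softirqs are masked only when IRQs are enabled: local_bh_enable() would immediately deliver pending softirqs, which is illegal with IRQs off, and with IRQs off softirqs cannot preempt us anyway. The implied contract is that the IRQ-disabled state must not change between get and put, so both ends take the matching branch; a sketch, with the body a placeholder:

	static void fpsimd_section_example(void)
	{
		get_cpu_fpsimd_context();	/* bh masked only if IRQs are on */

		/*
		 * ... touch per-CPU FPSIMD state; must not sleep or
		 * toggle the IRQ-disabled state ...
		 */

		put_cpu_fpsimd_context();	/* takes the matching branch */
	}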
+ * + * Unless called from non-preemptible task context, @state must point to a + * caller provided buffer that will be used to preserve the task's kernel mode + * FPSIMD context when it is scheduled out, or if it is interrupted by kernel + * mode FPSIMD occurring in softirq context. May be %NULL otherwise. */ -void kernel_neon_begin(void) +void kernel_neon_begin(struct user_fpsimd_state *state) { if (WARN_ON(!system_supports_fpsimd())) return; + WARN_ON((preemptible() || in_serving_softirq()) && !state); + BUG_ON(!may_use_simd()); get_cpu_fpsimd_context(); @@ -1846,7 +1869,7 @@ void kernel_neon_begin(void) /* Save unsaved fpsimd state, if any: */ if (test_thread_flag(TIF_KERNEL_FPSTATE)) { BUG_ON(IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()); - fpsimd_save_kernel_state(current); + fpsimd_save_state(state); } else { fpsimd_save_user_state(); @@ -1867,8 +1890,16 @@ void kernel_neon_begin(void) * mode in task context. So in this case, setting the flag here * is always appropriate. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()) + if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()) { + /* + * Record the caller provided buffer as the kernel mode + * FP/SIMD buffer for this task, so that the state can + * be preserved and restored on a context switch. + */ + WARN_ON(current->thread.kernel_fpsimd_state != NULL); + current->thread.kernel_fpsimd_state = state; set_thread_flag(TIF_KERNEL_FPSTATE); + } } /* Invalidate any task state remaining in the fpsimd regs: */ @@ -1886,22 +1917,30 @@ EXPORT_SYMBOL_GPL(kernel_neon_begin); * * The caller must not use the FPSIMD registers after this function is called, * unless kernel_neon_begin() is called again in the meantime. + * + * The value of @state must match the value passed to the preceding call to + * kernel_neon_begin(). */ -void kernel_neon_end(void) +void kernel_neon_end(struct user_fpsimd_state *state) { if (!system_supports_fpsimd()) return; + if (!test_thread_flag(TIF_KERNEL_FPSTATE)) + return; + /* * If we are returning from a nested use of kernel mode FPSIMD, restore * the task context kernel mode FPSIMD state. This can only happen when * running in softirq context on non-PREEMPT_RT. 
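With the signature change above, kernel-mode NEON users now own the save area: in preemptible task context the buffer must stay live from kernel_neon_begin() to the matching kernel_neon_end(), and the same pointer must be passed to both. A hypothetical caller, keeping the buffer on its stack:

	static void neon_xor_example(void *dst, const void *src, size_t len)
	{
		struct user_fpsimd_state state;	/* preserved across preemption */

		kernel_neon_begin(&state);
		/* ... NEON-accelerated loop over dst/src ... */
		kernel_neon_end(&state);	/* must be the same buffer */
	}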
*/ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq() && - test_thread_flag(TIF_KERNEL_FPSTATE)) - fpsimd_load_kernel_state(current); - else + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq()) { + fpsimd_load_state(state); + } else { clear_thread_flag(TIF_KERNEL_FPSTATE); + WARN_ON(current->thread.kernel_fpsimd_state != state); + current->thread.kernel_fpsimd_state = NULL; + } } EXPORT_SYMBOL_GPL(kernel_neon_end); @@ -1934,11 +1973,11 @@ void __efi_fpsimd_begin(void) if (!system_supports_fpsimd()) return; - WARN_ON(preemptible()); - if (may_use_simd()) { - kernel_neon_begin(); + kernel_neon_begin(&efi_fpsimd_state); } else { + WARN_ON(preemptible()); + /* * If !efi_sve_state, SVE can't be in use yet and doesn't need * preserving: @@ -1986,7 +2025,7 @@ void __efi_fpsimd_end(void) return; if (!efi_fpsimd_state_used) { - kernel_neon_end(); + kernel_neon_end(&efi_fpsimd_state); } else { if (system_supports_sve() && efi_sve_state_used) { bool ffr = true; diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 5adad37ab4fa..5a1554a44162 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -492,7 +492,7 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ret; /* - * When using mcount, callsites in modules may have been initalized to + * When using mcount, callsites in modules may have been initialized to * call an arbitrary module PLT (which redirects to the _mcount stub) * rather than the ftrace PLT we'll use at runtime (which redirects to * the ftrace trampoline). We can ignore the old PLT when initializing diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index c0065a1d77cf..15dedb385b9e 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -62,7 +62,7 @@ static void __init init_irq_stacks(void) } } -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK static void ____do_softirq(struct pt_regs *regs) { __do_softirq(); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 6f121a0164a4..239c16e3d02f 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -251,7 +251,7 @@ void crash_post_resume(void) * marked as Reserved as memory was allocated via memblock_reserve(). * * In hibernation, the pages which are Reserved and yet "nosave" are excluded - * from the hibernation iamge. crash_is_nosave() does thich check for crash + * from the hibernation image. crash_is_nosave() does thich check for crash * dump kernel and will reduce the total size of hibernation image. 
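The corrected machine_kexec.c comment describes excluding the crash kernel's reserved-but-nosave pages from the hibernation image. A plausible shape of that per-pfn check, purely for illustration (the real crash_is_nosave() may differ; crashk_res is the resource conventionally describing the crash kernel region):

	static bool crash_is_nosave_example(unsigned long pfn)
	{
		phys_addr_t addr = PFN_PHYS(pfn);

		return addr >= crashk_res.start && addr <= crashk_res.end;
	}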
*/ diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 659297f87cfa..a852264958c3 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -141,13 +141,13 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr) { u64 sctlr = read_sysreg(sctlr_el1); - u64 tcr = read_sysreg(tcr_el1) | TCR_DS; + u64 tcr = read_sysreg(tcr_el1) | TCR_EL1_DS; u64 mmfr0 = read_sysreg(id_aa64mmfr0_el1); u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - tcr &= ~TCR_IPS_MASK; - tcr |= parange << TCR_IPS_SHIFT; + tcr &= ~TCR_EL1_IPS_MASK; + tcr |= parange << TCR_EL1_IPS_SHIFT; asm(" msr sctlr_el1, %0 ;" " isb ;" @@ -263,7 +263,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt) } if (va_bits > VA_BITS_MIN) - sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(va_bits)); + sysreg_clear_set(tcr_el1, TCR_EL1_T1SZ_MASK, TCR_T1SZ(va_bits)); /* * The virtual KASLR displacement modulo 2MiB is decided by the diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 2799bdb2fb82..941668800aea 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -131,7 +131,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; /* - * Task has received a fatal signal, so reset back to probbed + * Task has received a fatal signal, so reset back to probed * address. */ instruction_pointer_set(regs, utask->vaddr); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 4b001121c72d..b9bdd83fbbca 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -912,13 +912,39 @@ static int sve_set_common(struct task_struct *target, return -EINVAL; /* - * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by - * vec_set_vector_length(), which will also validate them for us: + * On systems without SVE we accept FPSIMD format writes with + * a VL of 0 to allow exiting streaming mode, otherwise a VL + * is required. */ - ret = vec_set_vector_length(target, type, header.vl, - ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); - if (ret) - return ret; + if (header.vl) { + /* + * If the system does not support SVE we can't + * configure a SVE VL. + */ + if (!system_supports_sve() && type == ARM64_VEC_SVE) + return -EINVAL; + + /* + * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are + * consumed by vec_set_vector_length(), which will + * also validate them for us: + */ + ret = vec_set_vector_length(target, type, header.vl, + ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); + if (ret) + return ret; + } else { + /* If the system supports SVE we require a VL. */ + if (system_supports_sve()) + return -EINVAL; + + /* + * Only FPSIMD formatted data with no flags set is + * supported. 
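The ptrace change above lets a debugger write an NT_ARM_SVE header with vl == 0 on systems without SVE, as a way of switching a tracee out of streaming mode; the data must then be FPSIMD-formatted with no other flags set. A userspace sketch (arm64 uapi headers assumed; a real writer would append the full regset payload after the header):

	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/uio.h>
	#include <linux/elf.h>		/* NT_ARM_SVE */
	#include <asm/ptrace.h>		/* user_sve_header, SVE_PT_REGS_FPSIMD */

	static long exit_streaming_mode(pid_t pid)
	{
		struct user_sve_header hdr = {
			.size	= sizeof(hdr),
			.vl	= 0,			/* no SVE VL available */
			.flags	= SVE_PT_REGS_FPSIMD,	/* FPSIMD data, no flags */
		};
		struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };

		return ptrace(PTRACE_SETREGSET, pid, NT_ARM_SVE, &iov);
	}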
+ */ + if (header.flags != SVE_PT_REGS_FPSIMD) + return -EINVAL; + } /* Allocate SME storage if necessary, preserving any existing ZA/ZT state */ if (type == ARM64_VEC_SME) { @@ -1016,7 +1042,7 @@ static int sve_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return -EINVAL; return sve_set_common(target, regset, pos, count, kbuf, ubuf, diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 95169f7b6531..778f2a1faac8 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -63,8 +63,6 @@ static void free_sdei_stacks(void) { int cpu; - BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - for_each_possible_cpu(cpu) { _free_sdei_stack(&sdei_stack_normal_ptr, cpu); _free_sdei_stack(&sdei_stack_critical_ptr, cpu); @@ -88,8 +86,6 @@ static int init_sdei_stacks(void) int cpu; int err = 0; - BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - for_each_possible_cpu(cpu) { err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu); if (err) @@ -202,7 +198,7 @@ out_err: /* * do_sdei_event() returns one of: * SDEI_EV_HANDLED - success, return to the interrupted context. - * SDEI_EV_FAILED - failure, return this error code to firmare. + * SDEI_EV_FAILED - failure, return this error code to firmware. * virtual-address - success, return to this address. */ unsigned long __kprobes do_sdei_event(struct pt_regs *regs, diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 6fb838eee2e7..1aa324104afb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -350,7 +350,7 @@ void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) /* * Now that the dying CPU is beyond the point of no return w.r.t. - * in-kernel synchronisation, try to get the firwmare to help us to + * in-kernel synchronisation, try to get the firmware to help us to * verify that it has really left the kernel before we consider * clobbering anything it might still be using. */ @@ -523,7 +523,7 @@ int arch_register_cpu(int cpu) /* * Availability of the acpi handle is sufficient to establish - * that _STA has aleady been checked. No need to recheck here. + * that _STA has already been checked. No need to recheck here. */ c->hotpluggable = arch_cpu_is_hotpluggable(cpu); diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index aba7ca6bca2d..c062badd1a56 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -96,7 +96,7 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, * (Similarly for HVC and SMC elsewhere.) */ - if (flags & _TIF_MTE_ASYNC_FAULT) { + if (unlikely(flags & _TIF_MTE_ASYNC_FAULT)) { /* * Process the asynchronous tag check fault before the actual * syscall. do_notify_resume() will send a signal to userspace diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 681939ef5d16..914282016069 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -922,7 +922,7 @@ void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigne __show_regs(regs); /* - * We use nmi_panic to limit the potential for recusive overflows, and + * We use nmi_panic to limit the potential for recursive overflows, and * to get a better stack trace. 
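The syscall.c hunk above wraps the rarely-taken MTE async-fault test in unlikely(), which is nothing more than a branch-layout hint; standalone, the macro and its effect look like this:

	#include <stdbool.h>

	#define unlikely(x)	__builtin_expect(!!(x), 0)

	/* The annotated branch becomes the cold, out-of-line path. */
	static bool has_rare_flag(unsigned long flags, unsigned long rare_mask)
	{
		if (unlikely(flags & rare_mask))
			return true;	/* e.g. an async tag-check fault */

		return false;
	}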
*/ nmi_panic(NULL, "kernel stack overflow"); diff --git a/arch/arm64/kernel/vmcore_info.c b/arch/arm64/kernel/vmcore_info.c index b19d5d6cb8b3..9619ece66b79 100644 --- a/arch/arm64/kernel/vmcore_info.c +++ b/arch/arm64/kernel/vmcore_info.c @@ -14,7 +14,7 @@ static inline u64 get_tcr_el1_t1sz(void); static inline u64 get_tcr_el1_t1sz(void) { - return (read_sysreg(tcr_el1) & TCR_T1SZ_MASK) >> TCR_T1SZ_OFFSET; + return (read_sysreg(tcr_el1) & TCR_EL1_T1SZ_MASK) >> TCR_EL1_T1SZ_SHIFT; } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 3f675875abea..99a07972068d 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -806,7 +806,7 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) tpt = tpc = true; /* - * For the poor sods that could not correctly substract one value + * For the poor sods that could not correctly subtract one value * from another, trap the full virtual timer and counter. */ if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer)) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 052bf0d4d0b0..12acc024f7a8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2448,7 +2448,7 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; - /* Propagate the FGT state to the the nVHE side */ + /* Propagate the FGT state to the nVHE side */ kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks; kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks; kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks; diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 58b7d0c477d7..f731cc4c3f28 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -115,7 +115,7 @@ static void ffa_set_retval(struct kvm_cpu_context *ctxt, * * FFA-1.3 introduces 64-bit variants of the CPU cycle management * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests - * complete with SMC32 direct reponses which *should* allow us use the + * complete with SMC32 direct responses which *should* allow us use the * function ID sent by the caller to determine whether to return x8-x17. * * Note that we also cannot rely on function IDs in the response. diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7cc964af8d30..9a6a80c3fbe5 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1755,7 +1755,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* * Check if this is non-struct page memory PFN, and cannot support - * CMOs. It could potentially be unsafe to access as cachable. + * CMOs. It could potentially be unsafe to access as cacheable. */ if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { if (is_vma_cacheable) { diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index f04cda40545b..be6bbd167770 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -85,7 +85,7 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) /* * Let's treat memory allocation failures as benign: If we fail to * allocate anything, return an error and keep the allocated array - * alive. Userspace may try to recover by intializing the vcpu + * alive. Userspace may try to recover by initializing the vcpu * again, and there is no reason to affect the whole VM for this. 
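The nested.c comment above spells out the recovery contract for a failed grow: keep the existing array alive, return an error, and let userspace retry vcpu init. That is the standard krealloc() idiom, sketched here with a hypothetical helper (needs <linux/slab.h>; kvm_s2_mmu is the element type from this hunk):

	static int grow_mmu_array(struct kvm_s2_mmu **arr, int *nr, int want)
	{
		struct kvm_s2_mmu *tmp;

		tmp = krealloc(*arr, want * sizeof(*tmp), GFP_KERNEL);
		if (!tmp)
			return -ENOMEM;	/* *arr and *nr still valid; retry later */

		*arr = tmp;
		*nr = want;
		return 0;
	}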
*/ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index c0557945939c..589bcf878938 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -622,8 +622,7 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, __ptep_set_access_flags(vma, addr, ptep, entry, 0); if (dirty) - __flush_tlb_range(vma, start_addr, addr, - PAGE_SIZE, true, 3); + local_flush_tlb_contpte(vma, start_addr); } else { __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); __ptep_set_access_flags(vma, addr, ptep, entry, dirty); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a193b6a5d1e6..be9dab2c7d6a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -233,9 +233,13 @@ int __ptep_set_access_flags(struct vm_area_struct *vma, pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); } while (pteval != old_pteval); - /* Invalidate a stale read-only entry */ + /* + * Invalidate the local stale read-only entry. Remote stale entries + * may still cause page faults and be invalidated via + * flush_tlb_fix_spurious_fault(). + */ if (dirty) - flush_tlb_page(vma, address); + local_flush_tlb_page(vma, address); return 1; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 2ba01dc8ef82..c8d24b7cdd62 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -49,6 +49,8 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ +#define INVALID_PHYS_ADDR (-1ULL) + DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); u64 kimage_voffset __ro_after_init; @@ -194,11 +196,11 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, } while (ptep++, addr += PAGE_SIZE, addr != end); } -static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, - unsigned long end, phys_addr_t phys, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), - int flags) +static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) { unsigned long next; pmd_t pmd = READ_ONCE(*pmdp); @@ -213,6 +215,8 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); pte_phys = pgtable_alloc(TABLE_PTE); + if (pte_phys == INVALID_PHYS_ADDR) + return -ENOMEM; ptep = pte_set_fixmap(pte_phys); init_clear_pgtable(ptep); ptep += pte_index(addr); @@ -244,11 +248,13 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, * walker. 
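The mmu.c rework that starts in this hunk converts the table-creation walkers from void (with BUG_ON on allocation failure) to int, using INVALID_PHYS_ADDR as the allocator's failure sentinel. Every level repeats the same idiom, sketched once here:

	static int map_one_level_example(phys_addr_t (*pgtable_alloc)(enum pgtable_type))
	{
		phys_addr_t pa = pgtable_alloc(TABLE_PTE);

		if (pa == INVALID_PHYS_ADDR)
			return -ENOMEM;	/* unwind instead of BUG() */

		/* ... install and populate the new table at pa ... */
		return 0;
	}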
*/ pte_clear_fixmap(); + + return 0; } -static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) +static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -269,22 +275,29 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), READ_ONCE(pmd_val(*pmdp)))); } else { - alloc_init_cont_pte(pmdp, addr, next, phys, prot, - pgtable_alloc, flags); + int ret; + + ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot, + pgtable_alloc, flags); + if (ret) + return ret; BUG_ON(pmd_val(old_pmd) != 0 && pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp))); } phys += next - addr; } while (pmdp++, addr = next, addr != end); + + return 0; } -static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, - unsigned long end, phys_addr_t phys, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), - int flags) +static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) { + int ret; unsigned long next; pud_t pud = READ_ONCE(*pudp); pmd_t *pmdp; @@ -301,6 +314,8 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); pmd_phys = pgtable_alloc(TABLE_PMD); + if (pmd_phys == INVALID_PHYS_ADDR) + return -ENOMEM; pmdp = pmd_set_fixmap(pmd_phys); init_clear_pgtable(pmdp); pmdp += pmd_index(addr); @@ -320,20 +335,26 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + if (ret) + goto out; pmdp += pmd_index(next) - pmd_index(addr); phys += next - addr; } while (addr = next, addr != end); +out: pmd_clear_fixmap(); + + return ret; } -static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), - int flags) +static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) { + int ret = 0; unsigned long next; p4d_t p4d = READ_ONCE(*p4dp); pud_t *pudp; @@ -346,6 +367,8 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); pud_phys = pgtable_alloc(TABLE_PUD); + if (pud_phys == INVALID_PHYS_ADDR) + return -ENOMEM; pudp = pud_set_fixmap(pud_phys); init_clear_pgtable(pudp); pudp += pud_index(addr); @@ -375,8 +398,10 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), READ_ONCE(pud_val(*pudp)))); } else { - alloc_init_cont_pmd(pudp, addr, next, phys, prot, - pgtable_alloc, flags); + ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot, + pgtable_alloc, flags); + if (ret) + goto out; BUG_ON(pud_val(old_pud) != 0 && pud_val(old_pud) != READ_ONCE(pud_val(*pudp))); @@ -384,14 +409,18 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, phys += next - addr; } while (pudp++, addr = next, addr != end); 
+out: pud_clear_fixmap(); + + return ret; } -static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), - int flags) +static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) { + int ret; unsigned long next; pgd_t pgd = READ_ONCE(*pgdp); p4d_t *p4dp; @@ -404,6 +433,8 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); p4d_phys = pgtable_alloc(TABLE_P4D); + if (p4d_phys == INVALID_PHYS_ADDR) + return -ENOMEM; p4dp = p4d_set_fixmap(p4d_phys); init_clear_pgtable(p4dp); p4dp += p4d_index(addr); @@ -418,8 +449,10 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, next = p4d_addr_end(addr, end); - alloc_init_pud(p4dp, addr, next, phys, prot, - pgtable_alloc, flags); + ret = alloc_init_pud(p4dp, addr, next, phys, prot, + pgtable_alloc, flags); + if (ret) + goto out; BUG_ON(p4d_val(old_p4d) != 0 && p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp))); @@ -427,15 +460,19 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, phys += next - addr; } while (p4dp++, addr = next, addr != end); +out: p4d_clear_fixmap(); + + return ret; } -static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, - unsigned long virt, phys_addr_t size, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), - int flags) +static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) { + int ret; unsigned long addr, end, next; pgd_t *pgdp = pgd_offset_pgd(pgdir, virt); @@ -444,7 +481,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, * within a page, we cannot map the region as the caller expects. 
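The (phys ^ virt) & ~PAGE_MASK test applied just below rejects requests whose physical and virtual addresses disagree on their offset within a page, since such a region cannot be mapped at page granularity. A standalone illustration, with 4 KiB pages assumed:

	#include <assert.h>

	#define PAGE_MASK	(~0xfffUL)	/* 4 KiB pages assumed */

	int main(void)
	{
		/* Same in-page offset (0x123): XOR clears the low bits. */
		assert(((0x40201123UL ^ 0xffff0123UL) & ~PAGE_MASK) == 0);

		/* Different offsets: low bits survive, request is -EINVAL. */
		assert(((0x40201123UL ^ 0xffff0456UL) & ~PAGE_MASK) != 0);

		return 0;
	}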
*/ if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) - return; + return -EINVAL; phys &= PAGE_MASK; addr = virt & PAGE_MASK; @@ -452,25 +489,45 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); - alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc, - flags); + ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc, + flags); + if (ret) + return ret; phys += next - addr; } while (pgdp++, addr = next, addr != end); + + return 0; } -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, - unsigned long virt, phys_addr_t size, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), - int flags) +static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) { + int ret; + mutex_lock(&fixmap_lock); - __create_pgd_mapping_locked(pgdir, phys, virt, size, prot, - pgtable_alloc, flags); + ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot, + pgtable_alloc, flags); mutex_unlock(&fixmap_lock); + + return ret; } -#define INVALID_PHYS_ADDR (-1ULL) +static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) +{ + int ret; + + ret = __create_pgd_mapping(pgdir, phys, virt, size, prot, pgtable_alloc, + flags); + if (ret) + panic("Failed to create page tables\n"); +} static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, enum pgtable_type pgtable_type) @@ -503,7 +560,7 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, } static phys_addr_t -try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type, gfp_t gfp) +pgd_pgtable_alloc_init_mm_gfp(enum pgtable_type pgtable_type, gfp_t gfp) { return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type); } @@ -511,21 +568,13 @@ try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type, gfp_t gfp) static phys_addr_t __maybe_unused pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) { - phys_addr_t pa; - - pa = __pgd_pgtable_alloc(&init_mm, GFP_PGTABLE_KERNEL, pgtable_type); - BUG_ON(pa == INVALID_PHYS_ADDR); - return pa; + return pgd_pgtable_alloc_init_mm_gfp(pgtable_type, GFP_PGTABLE_KERNEL); } static phys_addr_t pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) { - phys_addr_t pa; - - pa = __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); - BUG_ON(pa == INVALID_PHYS_ADDR); - return pa; + return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); } static void split_contpte(pte_t *ptep) @@ -546,7 +595,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) pte_t *ptep; int i; - pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE, gfp); + pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp); if (pte_phys == INVALID_PHYS_ADDR) return -ENOMEM; ptep = (pte_t *)phys_to_virt(pte_phys); @@ -591,7 +640,7 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) pmd_t *pmdp; int i; - pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD, gfp); + pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp); if (pmd_phys == INVALID_PHYS_ADDR) return -ENOMEM; pmdp = (pmd_t *)phys_to_virt(pmd_phys); @@ -935,8 +984,8 @@ void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, - NO_CONT_MAPPINGS); + 
early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -950,8 +999,8 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, if (page_mappings_only) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; - __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc_special_mm, flags); + early_create_pgd_mapping(mm->pgd, phys, virt, size, prot, + pgd_pgtable_alloc_special_mm, flags); } static void update_mapping_prot(phys_addr_t phys, unsigned long virt, @@ -963,8 +1012,8 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, - NO_CONT_MAPPINGS); + early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); @@ -973,8 +1022,8 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start, phys_addr_t end, pgprot_t prot, int flags) { - __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, - prot, early_pgtable_alloc, flags); + early_create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, + prot, early_pgtable_alloc, flags); } void __init mark_linear_text_alias_ro(void) @@ -1207,6 +1256,8 @@ static int __init __kpti_install_ng_mappings(void *__unused) remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); if (!cpu) { + int ret; + alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); @@ -1227,9 +1278,11 @@ static int __init __kpti_install_ng_mappings(void *__unused) // covers the PTE[] page itself, the remaining entries are free // to be used as a ad-hoc fixmap. 
// - __create_pgd_mapping_locked(kpti_ng_temp_pgd, __pa(alloc), - KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, - kpti_ng_pgd_alloc, 0); + ret = __create_pgd_mapping_locked(kpti_ng_temp_pgd, __pa(alloc), + KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, + kpti_ng_pgd_alloc, 0); + if (ret) + panic("Failed to create page tables\n"); } cpu_install_idmap(); @@ -1282,9 +1335,9 @@ static int __init map_entry_trampoline(void) /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); - __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, - entry_tramp_text_size(), prot, - pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS); + early_create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, + entry_tramp_text_size(), prot, + pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS); /* Map both the text and data into the kernel page table */ for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) @@ -1926,23 +1979,28 @@ int arch_add_memory(int nid, u64 start, u64 size, if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; - __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), - size, params->pgprot, pgd_pgtable_alloc_init_mm, - flags); + ret = __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), + size, params->pgprot, pgd_pgtable_alloc_init_mm, + flags); + if (ret) + goto err; memblock_clear_nomap(start, size); ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params); if (ret) - __remove_pgd_mapping(swapper_pg_dir, - __phys_to_virt(start), size); - else { - /* Address of hotplugged memory can be smaller */ - max_pfn = max(max_pfn, PFN_UP(start + size)); - max_low_pfn = max_pfn; - } + goto err; + + /* Address of hotplugged memory can be smaller */ + max_pfn = max(max_pfn, PFN_UP(start + size)); + max_low_pfn = max_pfn; + + return 0; +err: + __remove_pgd_mapping(swapper_pg_dir, + __phys_to_virt(start), size); return ret; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 5135f2d66958..f0e784b963e6 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -148,7 +148,7 @@ static int change_memory_common(unsigned long addr, int numpages, unsigned long size = PAGE_SIZE * numpages; unsigned long end = start + size; struct vm_struct *area; - int i; + int ret; if (!PAGE_ALIGNED(addr)) { start &= PAGE_MASK; @@ -184,9 +184,13 @@ static int change_memory_common(unsigned long addr, int numpages, */ if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY || pgprot_val(clear_mask) == PTE_RDONLY)) { - for (i = 0; i < area->nr_pages; i++) { - __change_memory_common((u64)page_address(area->pages[i]), - PAGE_SIZE, set_mask, clear_mask); + unsigned long idx = (start - (unsigned long)kasan_reset_tag(area->addr)) + >> PAGE_SHIFT; + for (; numpages; idx++, numpages--) { + ret = __change_memory_common((u64)page_address(area->pages[idx]), + PAGE_SIZE, set_mask, clear_mask); + if (ret) + return ret; } } diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 8160cff35089..bf5110b91e2f 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -56,7 +56,7 @@ void __init pgtable_cache_init(void) * With 52-bit physical addresses, the architecture requires the * top-level table to be aligned to at least 64 bytes. 
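The strengthened assertion just below catches cases the old check missed: PGD_SIZE < 64 only rejected tables smaller than 64 bytes, while !IS_ALIGNED(PGD_SIZE, 64) also rejects sizes such as 96 that meet the old bound but break the 64-byte alignment the architecture requires. In plain C:

	#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

	_Static_assert(IS_ALIGNED(128, 64), "128 is 64-byte aligned");
	/* 96 >= 64, so the old 'PGD_SIZE < 64' test accepted it, yet: */
	_Static_assert(!IS_ALIGNED(96, 64), "96 is not 64-byte aligned");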
*/ - BUILD_BUG_ON(PGD_SIZE < 64); + BUILD_BUG_ON(!IS_ALIGNED(PGD_SIZE, 64)); #endif /* diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 86818511962b..01e868116448 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -23,15 +23,18 @@ #include <asm/sysreg.h> #ifdef CONFIG_ARM64_64K_PAGES -#define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K +#define TCR_TG_FLAGS ((TCR_EL1_TG0_64K << TCR_EL1_TG0_SHIFT) |\ + (TCR_EL1_TG1_64K << TCR_EL1_TG1_SHIFT)) #elif defined(CONFIG_ARM64_16K_PAGES) -#define TCR_TG_FLAGS TCR_TG0_16K | TCR_TG1_16K +#define TCR_TG_FLAGS ((TCR_EL1_TG0_16K << TCR_EL1_TG0_SHIFT) |\ + (TCR_EL1_TG1_16K << TCR_EL1_TG1_SHIFT)) #else /* CONFIG_ARM64_4K_PAGES */ -#define TCR_TG_FLAGS TCR_TG0_4K | TCR_TG1_4K +#define TCR_TG_FLAGS ((TCR_EL1_TG0_4K << TCR_EL1_TG0_SHIFT) |\ + (TCR_EL1_TG1_4K << TCR_EL1_TG1_SHIFT)) #endif #ifdef CONFIG_RANDOMIZE_BASE -#define TCR_KASLR_FLAGS TCR_NFD1 +#define TCR_KASLR_FLAGS TCR_EL1_NFD1 #else #define TCR_KASLR_FLAGS 0 #endif @@ -40,23 +43,30 @@ #define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA #ifdef CONFIG_KASAN_SW_TAGS -#define TCR_KASAN_SW_FLAGS TCR_TBI1 | TCR_TBID1 +#define TCR_KASAN_SW_FLAGS TCR_EL1_TBI1 | TCR_EL1_TBID1 #else #define TCR_KASAN_SW_FLAGS 0 #endif #ifdef CONFIG_KASAN_HW_TAGS -#define TCR_MTE_FLAGS TCR_TCMA1 | TCR_TBI1 | TCR_TBID1 +#define TCR_MTE_FLAGS TCR_EL1_TCMA1 | TCR_EL1_TBI1 | TCR_EL1_TBID1 #elif defined(CONFIG_ARM64_MTE) /* * The mte_zero_clear_page_tags() implementation uses DC GZVA, which relies on * TBI being enabled at EL1. */ -#define TCR_MTE_FLAGS TCR_TBI1 | TCR_TBID1 +#define TCR_MTE_FLAGS TCR_EL1_TBI1 | TCR_EL1_TBID1 #else #define TCR_MTE_FLAGS 0 #endif +#define TCR_IRGN_WBWA ((TCR_EL1_IRGN0_WBWA << TCR_EL1_IRGN0_SHIFT) |\ + (TCR_EL1_IRGN1_WBWA << TCR_EL1_IRGN1_SHIFT)) +#define TCR_ORGN_WBWA ((TCR_EL1_ORGN0_WBWA << TCR_EL1_ORGN0_SHIFT) |\ + (TCR_EL1_ORGN1_WBWA << TCR_EL1_ORGN1_SHIFT)) +#define TCR_SHARED ((TCR_EL1_SH0_INNER << TCR_EL1_SH0_SHIFT) |\ + (TCR_EL1_SH1_INNER << TCR_EL1_SH1_SHIFT)) + /* * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and * changed during mte_cpu_setup to Normal Tagged if the system supports MTE. @@ -129,7 +139,7 @@ SYM_FUNC_START(cpu_do_resume) /* Don't change t0sz here, mask those bits when restoring */ mrs x7, tcr_el1 - bfi x8, x7, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + bfi x8, x7, TCR_EL1_T0SZ_SHIFT, TCR_EL1_T0SZ_WIDTH msr tcr_el1, x8 msr vbar_el1, x9 @@ -481,8 +491,8 @@ SYM_FUNC_START(__cpu_setup) tcr2 .req x15 mov_q mair, MAIR_EL1_SET mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ - TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ - TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS + TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_EL1_AS | \ + TCR_EL1_TBI0 | TCR_EL1_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS mov tcr2, xzr tcr_clear_errata_bits tcr, x9, x5 @@ -492,7 +502,7 @@ SYM_FUNC_START(__cpu_setup) alternative_if ARM64_HAS_VA52 tcr_set_t1sz tcr, x9 #ifdef CONFIG_ARM64_LPA2 - orr tcr, tcr, #TCR_DS + orr tcr, tcr, #TCR_EL1_DS #endif alternative_else_nop_endif #endif @@ -500,7 +510,7 @@ alternative_else_nop_endif /* * Set the IPS bits in TCR_EL1. */ - tcr_compute_pa_size tcr, #TCR_IPS_SHIFT, x5, x6 + tcr_compute_pa_size tcr, #TCR_EL1_IPS_SHIFT, x5, x6 #ifdef CONFIG_ARM64_HW_AFDBM /* * Enable hardware update of the Access Flags bit. 
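The proc.S hunks above stop using pre-shifted TCR_* constants and instead compose each field from the generated TCR_EL1_*_SHIFT macros at the point of use. The pattern in C, with the field position and granule encoding assumed here purely for illustration (TG0 at bits 15:14 with 64K encoded as 0b01):

	#include <stdint.h>

	#define TCR_EL1_TG0_SHIFT	14	/* assumed, for illustration */
	#define TCR_EL1_TG0_64K		1ULL
	#define TCR_EL1_TG1_SHIFT	30	/* assumed, for illustration */
	#define TCR_EL1_TG1_64K		3ULL

	static const uint64_t tcr_tg_flags_64k =
		(TCR_EL1_TG0_64K << TCR_EL1_TG0_SHIFT) |
		(TCR_EL1_TG1_64K << TCR_EL1_TG1_SHIFT);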
@@ -510,7 +520,7 @@ alternative_else_nop_endif mrs x9, ID_AA64MMFR1_EL1 ubfx x9, x9, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, #4 cbz x9, 1f - orr tcr, tcr, #TCR_HA // hardware Access flag update + orr tcr, tcr, #TCR_EL1_HA // hardware Access flag update #ifdef CONFIG_ARM64_HAFT cmp x9, ID_AA64MMFR1_EL1_HAFDBS_HAFT b.lt 1f diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0c9a50a1e73e..afd05b41ea9e 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -3054,7 +3054,7 @@ bool bpf_jit_supports_exceptions(void) /* We unwind through both kernel frames starting from within bpf_throw * call and BPF frames. Therefore we require FP unwinder to be enabled * to walk kernel frames and reach BPF frames in the stack trace. - * ARM64 kernel is aways compiled with CONFIG_FRAME_POINTER=y + * ARM64 kernel is always compiled with CONFIG_FRAME_POINTER=y */ return true; } diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk index bbbb812603e8..86860ab672dc 100755 --- a/arch/arm64/tools/gen-sysreg.awk +++ b/arch/arm64/tools/gen-sysreg.awk @@ -44,21 +44,38 @@ function expect_fields(nf) { # Print a CPP macro definition, padded with spaces so that the macro bodies # line up in a column -function define(name, val) { - printf "%-56s%s\n", "#define " name, val +function define(prefix, name, val) { + printf "%-56s%s\n", "#define " prefix name, val +} + +# Same as above, but without a prefix +function define_reg(name, val) { + define(null, name, val) } # Print standard BITMASK/SHIFT/WIDTH CPP definitions for a field -function define_field(reg, field, msb, lsb) { - define(reg "_" field, "GENMASK(" msb ", " lsb ")") - define(reg "_" field "_MASK", "GENMASK(" msb ", " lsb ")") - define(reg "_" field "_SHIFT", lsb) - define(reg "_" field "_WIDTH", msb - lsb + 1) +function define_field(prefix, reg, field, msb, lsb) { + define(prefix, reg "_" field, "GENMASK(" msb ", " lsb ")") + define(prefix, reg "_" field "_MASK", "GENMASK(" msb ", " lsb ")") + define(prefix, reg "_" field "_SHIFT", lsb) + define(prefix, reg "_" field "_WIDTH", msb - lsb + 1) } # Print a field _SIGNED definition for a field -function define_field_sign(reg, field, sign) { - define(reg "_" field "_SIGNED", sign) +function define_field_sign(prefix, reg, field, sign) { + define(prefix, reg "_" field "_SIGNED", sign) +} + +# Print the Res0, Res1, Unkn masks +function define_resx_unkn(prefix, reg, res0, res1, unkn) { + if (res0 != null) + define(prefix, reg "_RES0", "(" res0 ")") + if (res1 != null) + define(prefix, reg "_RES1", "(" res1 ")") + if (unkn != null) + define(prefix, reg "_UNKN", "(" unkn ")") + if (res0 != null || res1 != null || unkn != null) + print "" } # Parse a "<msb>[:<lsb>]" string into the global variables @msb and @lsb @@ -128,18 +145,17 @@ $1 == "SysregFields" && block_current() == "Root" { next_bit = 63 + delete seen_prefixes + next } $1 == "EndSysregFields" && block_current() == "SysregFields" { expect_fields(1) - if (next_bit > 0) + if (next_bit >= 0) fatal("Unspecified bits in " reg) - define(reg "_RES0", "(" res0 ")") - define(reg "_RES1", "(" res1 ")") - define(reg "_UNKN", "(" unkn ")") - print "" + define_resx_unkn(prefix, reg, res0, res1, unkn) reg = null res0 = null @@ -170,35 +186,31 @@ $1 == "Sysreg" && block_current() == "Root" { fatal("Duplicate Sysreg definition for " reg) defined_regs[reg] = 1 - define("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2) - define("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")") + 
define_reg("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2) + define_reg("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")") - define("SYS_" reg "_Op0", op0) - define("SYS_" reg "_Op1", op1) - define("SYS_" reg "_CRn", crn) - define("SYS_" reg "_CRm", crm) - define("SYS_" reg "_Op2", op2) + define_reg("SYS_" reg "_Op0", op0) + define_reg("SYS_" reg "_Op1", op1) + define_reg("SYS_" reg "_CRn", crn) + define_reg("SYS_" reg "_CRm", crm) + define_reg("SYS_" reg "_Op2", op2) print "" + prefix = null next_bit = 63 + delete seen_prefixes + next } $1 == "EndSysreg" && block_current() == "Sysreg" { expect_fields(1) - if (next_bit > 0) + if (next_bit >= 0) fatal("Unspecified bits in " reg) - if (res0 != null) - define(reg "_RES0", "(" res0 ")") - if (res1 != null) - define(reg "_RES1", "(" res1 ")") - if (unkn != null) - define(reg "_UNKN", "(" unkn ")") - if (res0 != null || res1 != null || unkn != null) - print "" + define_resx_unkn(prefix, reg, res0, res1, unkn) reg = null op0 = null @@ -209,6 +221,7 @@ $1 == "EndSysreg" && block_current() == "Sysreg" { res0 = null res1 = null unkn = null + prefix = null block_pop() next @@ -225,7 +238,7 @@ $1 == "EndSysreg" && block_current() == "Sysreg" { print "/* For " reg " fields see " $2 " */" print "" - next_bit = 0 + next_bit = -1 res0 = null res1 = null unkn = null @@ -233,8 +246,7 @@ $1 == "EndSysreg" && block_current() == "Sysreg" { next } - -$1 == "Res0" && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Res0" && (block_current() == "Sysreg" || block_current() == "SysregFields" || block_current() == "Prefix") { expect_fields(2) parse_bitdef(reg, "RES0", $2) field = "RES0_" msb "_" lsb @@ -244,7 +256,7 @@ $1 == "Res0" && (block_current() == "Sysreg" || block_current() == "SysregFields next } -$1 == "Res1" && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Res1" && (block_current() == "Sysreg" || block_current() == "SysregFields" || block_current() == "Prefix") { expect_fields(2) parse_bitdef(reg, "RES1", $2) field = "RES1_" msb "_" lsb @@ -254,7 +266,7 @@ $1 == "Res1" && (block_current() == "Sysreg" || block_current() == "SysregFields next } -$1 == "Unkn" && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Unkn" && (block_current() == "Sysreg" || block_current() == "SysregFields" || block_current() == "Prefix") { expect_fields(2) parse_bitdef(reg, "UNKN", $2) field = "UNKN_" msb "_" lsb @@ -264,62 +276,62 @@ $1 == "Unkn" && (block_current() == "Sysreg" || block_current() == "SysregFields next } -$1 == "Field" && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Field" && (block_current() == "Sysreg" || block_current() == "SysregFields" || block_current() == "Prefix") { expect_fields(3) field = $3 parse_bitdef(reg, field, $2) - define_field(reg, field, msb, lsb) + define_field(prefix, reg, field, msb, lsb) print "" next } -$1 == "Raz" && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Raz" && (block_current() == "Sysreg" || block_current() == "SysregFields" || block_current() == "Prefix") { expect_fields(2) parse_bitdef(reg, field, $2) next } -$1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields" || block_current() == "Prefix") { block_push("Enum") expect_fields(3) field = $3 parse_bitdef(reg, field, $2) - define_field(reg, field, msb, lsb) - define_field_sign(reg, 
field, "true") + define_field(prefix, reg, field, msb, lsb) + define_field_sign(prefix, reg, field, "true") delete seen_enum_vals next } -$1 == "UnsignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "UnsignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields" || block_current() == "Prefix") { block_push("Enum") expect_fields(3) field = $3 parse_bitdef(reg, field, $2) - define_field(reg, field, msb, lsb) - define_field_sign(reg, field, "false") + define_field(prefix, reg, field, msb, lsb) + define_field_sign(prefix, reg, field, "false") delete seen_enum_vals next } -$1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields" || block_current() == "Prefix") { block_push("Enum") expect_fields(3) field = $3 parse_bitdef(reg, field, $2) - define_field(reg, field, msb, lsb) + define_field(prefix, reg, field, msb, lsb) delete seen_enum_vals @@ -349,7 +361,47 @@ $1 == "EndEnum" && block_current() == "Enum" { fatal("Duplicate Enum value " val " for " name) seen_enum_vals[val] = 1 - define(reg "_" field "_" name, "UL(" val ")") + define(prefix, reg "_" field "_" name, "UL(" val ")") + next +} + +$1 == "Prefix" && (block_current() == "Sysreg" || block_current() == "SysregFields") { + block_push("Prefix") + + expect_fields(2) + + if (next_bit < 63) + fatal("Prefixed fields must precede non-prefixed fields (" reg ")") + + prefix = $2 "_" + + if (prefix in seen_prefixes) + fatal("Duplicate prefix " prefix " for " reg) + seen_prefixes[prefix] = 1 + + res0 = "UL(0)" + res1 = "UL(0)" + unkn = "UL(0)" + next_bit = 63 + + next +} + +$1 == "EndPrefix" && block_current() == "Prefix" { + expect_fields(1) + if (next_bit >= 0) + fatal("Unspecified bits in prefix " prefix " for " reg) + + define_resx_unkn(prefix, reg, res0, res1, unkn) + + prefix = null + res0 = "UL(0)" + res1 = "UL(0)" + unkn = "UL(0)" + next_bit = 63 + + block_pop() + next } diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 1c6cdf9d54bb..8921b51866d6 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -4669,6 +4669,27 @@ Field 1 V3 Field 0 En EndSysreg +Sysreg ICH_VMCR_EL2 3 4 12 11 7 +Prefix FEAT_GCIE +Res0 63:32 +Field 31:27 VPMR +Res0 26:1 +Field 0 EN +EndPrefix +Res0 63:32 +Field 31:24 VPMR +Field 23:21 VBPR0 +Field 20:18 VBPR1 +Res0 17:10 +Field 9 VEOIM +Res0 8:5 +Field 4 VCBPR +Field 3 VFIQEn +Field 2 VAckCtl +Field 1 VENG1 +Field 0 VENG0 +EndSysreg + Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index fba8089c9fb3..e717099b77cc 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -265,7 +265,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -281,7 +280,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -583,6 +581,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 6af37716384c..6b0bab752035 
100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -261,7 +261,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -277,7 +276,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -540,6 +538,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 471f4ec3730d..bb59e3f06ed5 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -268,7 +268,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -284,7 +283,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -560,6 +558,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 28492ef51457..ed85517ff01c 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -258,7 +258,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -274,7 +273,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -532,6 +530,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 2fbefb16b72e..0217a6982856 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -260,7 +260,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -276,7 +275,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -542,6 +540,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index deec5df3f35a..a9ba431a9408 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -259,7 +259,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m 
CONFIG_L2TP=m @@ -275,7 +274,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -559,6 +557,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 301a05c12577..f0d0d120e144 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -279,7 +279,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -295,7 +294,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -646,6 +644,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 0d401db0e8f8..baf0fc7a66d9 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -257,7 +257,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -273,7 +272,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -532,6 +530,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 90fb5b6bcf83..9cb732f0c6ce 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -258,7 +258,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -274,7 +273,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -533,6 +531,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index b89b0f7fe2da..6b3295e4c5ae 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -259,7 +259,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -275,7 +274,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -549,6 +547,7 @@ 
CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 8cc372c4df72..1f3a2fad6d2f 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -254,7 +254,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -270,7 +269,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -530,6 +528,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index f4569f64c6e4..d8ef36564d50 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -255,7 +255,6 @@ CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m CONFIG_IP_SCTP=m -CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m CONFIG_L2TP=m @@ -271,7 +270,6 @@ CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m CONFIG_DNS_RESOLVER=y CONFIG_BATMAN_ADV=m # CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_NC=y CONFIG_NETLINK_DIAG=m CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m @@ -530,6 +528,7 @@ CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA1=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index df22b10d9141..938e5df75b2d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -69,6 +69,9 @@ config CC_HAS_ASM_AOR_FORMAT_FLAGS Clang versions before 19.1.0 do not support A, O, and R inline assembly format flags. +config CC_HAS_STACKPROTECTOR_GLOBAL + def_bool $(cc-option, -mstack-protector-guard=global -mstack-protector-guard-record) + config S390 def_bool y # @@ -140,7 +143,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - select ARCH_MODULE_NEEDS_WEAK_PER_CPU select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC @@ -245,6 +247,7 @@ config S390 select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK + select HAVE_STACKPROTECTOR if CC_HAS_STACKPROTECTOR_GLOBAL select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_VIRT_CPU_ACCOUNTING_IDLE @@ -504,22 +507,6 @@ config COMMAND_LINE_SIZE This allows you to specify the maximum length of the kernel command line. -config COMPAT - def_bool n - prompt "Kernel support for 31 bit emulation" - select ARCH_WANT_OLD_COMPAT_IPC - select COMPAT_OLD_SIGACTION - select HAVE_UID16 - depends on MULTIUSER - depends on !CC_IS_CLANG && !LD_IS_LLD - help - Select this option if you want to enable your system kernel to - handle system-calls from ELF binaries for 31 bit ESA. This option - (and some other stuff like libraries and such) is needed for - executing 31 bit applications. - - If unsure say N. 
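The new CC_HAS_STACKPROTECTOR_GLOBAL option above probes for -mstack-protector-guard=global, under which the compiler checks every protected frame against a single global canary rather than a per-task slot. Roughly what such a build expects the kernel to provide, sketched without the -mstack-protector-guard-record side (which this series appears to consume via the new stack_prot_start/end markers in the boot code):

	/* Single global canary; initialised once at boot with a random value. */
	unsigned long __stack_chk_guard __ro_after_init;

	/* Called by compiler-generated epilogue code on a canary mismatch. */
	void __stack_chk_fail(void)
	{
		panic("stack-protector: kernel stack is corrupted");
	}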
- config SMP def_bool y diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 8578361133a4..d78ad6885ca2 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -90,6 +90,10 @@ ifdef CONFIG_EXPOLINE aflags-y += -DCC_USING_EXPOLINE endif +ifeq ($(CONFIG_STACKPROTECTOR),y) + KBUILD_CFLAGS += -mstack-protector-guard=global -mstack-protector-guard-record +endif + ifdef CONFIG_FUNCTION_TRACER ifeq ($(call cc-option,-mfentry -mnop-mcount),) # make use of hotpatch feature if the compiler supports it @@ -135,10 +139,9 @@ zfcpdump: $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ archheaders: - $(Q)$(MAKE) $(build)=$(syscalls) uapi + $(Q)$(MAKE) $(build)=$(syscalls) all archprepare: - $(Q)$(MAKE) $(build)=$(syscalls) kapi $(Q)$(MAKE) $(build)=$(tools) kapi $(extra_tools) ifeq ($(KBUILD_EXTMOD),) # We need to generate vdso-offsets.h before compiling certain files in kernel/. @@ -149,12 +152,9 @@ ifeq ($(KBUILD_EXTMOD),) # this hack. prepare: vdso_prepare vdso_prepare: prepare0 - $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h - $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ - $(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h) + $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso include/generated/vdso-offsets.h -vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg -vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg +vdso-install-y += arch/s390/kernel/vdso/vdso.so.dbg endif diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index ad2b0baa527c..feb43db63f30 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -9,8 +9,7 @@ * Author: Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "appldata" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "appldata: " fmt #include <linux/export.h> #include <linux/module.h> diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index a363d30ce739..137d4e7e1e9a 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -8,8 +8,7 @@ * Author: Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "appldata" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "appldata: " fmt #include <linux/module.h> #include <linux/init.h> diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 02f2cf082748..490167faba7a 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o obj-$(CONFIG_KMSAN) += kmsan.o +obj-$(CONFIG_STACKPROTECTOR) += stackprotector.o obj-all := $(obj-y) piggy.o syms.o targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 37d5b097ede5..61a205b489fb 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -28,6 +28,10 @@ struct vmlinux_info { unsigned long invalid_pg_dir_off; unsigned long alt_instructions; unsigned long alt_instructions_end; +#ifdef CONFIG_STACKPROTECTOR + unsigned long stack_prot_start; + unsigned long stack_prot_end; +#endif #ifdef CONFIG_KASAN unsigned long kasan_early_shadow_page_off; unsigned long kasan_early_shadow_pte_off; diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c index c4130a80b058..b0fd8a526b42 100644 --- a/arch/s390/boot/ipl_data.c +++ b/arch/s390/boot/ipl_data.c @@ -1,6 +1,5 @@ // 
SPDX-License-Identifier: GPL-2.0 -#include <linux/compat.h> #include <linux/ptrace.h> #include <asm/cio.h> #include <asm/asm-offsets.h> @@ -12,7 +11,7 @@ #define PSW_MASK_DISABLED (PSW_MASK_WAIT | PSW_MASK_EA | PSW_MASK_BA) struct ipl_lowcore { - psw_t32 ipl_psw; /* 0x0000 */ + psw32_t ipl_psw; /* 0x0000 */ struct ccw0 ccwpgm[2]; /* 0x0008 */ u8 fill[56]; /* 0x0018 */ struct ccw0 ccwpgmcc[20]; /* 0x0050 */ diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index f584d7da29cb..6bc950b92be7 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/ctype.h> #include <linux/pgtable.h> +#include <asm/arch-stackprotector.h> #include <asm/abs_lowcore.h> #include <asm/page-states.h> #include <asm/machine.h> @@ -294,6 +295,11 @@ void parse_boot_command_line(void) cmma_flag = 0; } +#ifdef CONFIG_STACKPROTECTOR + if (!strcmp(param, "debug_stackprotector")) + stack_protector_debug = 1; +#endif + #if IS_ENABLED(CONFIG_KVM) if (!strcmp(param, "prot_virt")) { rc = kstrtobool(val, &enabled); diff --git a/arch/s390/boot/stackprotector.c b/arch/s390/boot/stackprotector.c new file mode 100644 index 000000000000..68494940c12a --- /dev/null +++ b/arch/s390/boot/stackprotector.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define boot_fmt(fmt) "stackprot: " fmt + +#include "boot.h" +#include "../kernel/stackprotector.c" diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 3fbd25b9498f..f77067dfc2a8 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -20,6 +20,9 @@ #include <asm/uv.h> #include <asm/abs_lowcore.h> #include <asm/physmem_info.h> +#include <asm/stacktrace.h> +#include <asm/asm-offsets.h> +#include <asm/arch-stackprotector.h> #include "decompressor.h" #include "boot.h" #include "uv.h" @@ -477,6 +480,10 @@ static void kaslr_adjust_vmlinux_info(long offset) vmlinux.invalid_pg_dir_off += offset; vmlinux.alt_instructions += offset; vmlinux.alt_instructions_end += offset; +#ifdef CONFIG_STACKPROTECTOR + vmlinux.stack_prot_start += offset; + vmlinux.stack_prot_end += offset; +#endif #ifdef CONFIG_KASAN vmlinux.kasan_early_shadow_page_off += offset; vmlinux.kasan_early_shadow_pte_off += offset; @@ -622,6 +629,7 @@ void startup_kernel(void) __apply_alternatives((struct alt_instr *)_vmlinux_info.alt_instructions, (struct alt_instr *)_vmlinux_info.alt_instructions_end, ALT_CTX_EARLY); + stack_protector_apply_early(text_lma); /* * Save KASLR offset for early dumps, before vmcore_info is set. 
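For context on the stack protector wiring above: -mstack-protector-guard=global makes the compiler guard each protected function with a single global canary symbol (__stack_chk_guard), and -mstack-protector-guard-record additionally records every guard-accessing code site in a dedicated section (the stack_prot_start/stack_prot_end bounds added to struct vmlinux_info), which stack_protector_apply_early() appears to walk during early boot; the new lowcore stack_canary field at offset 0x240 suggests the recorded sites are rewritten to use a per-CPU guard. Below is a minimal user-space C sketch of the prologue/epilogue pattern such options make the compiler emit; all names in it (demo_guard, demo_fail, protected_function) are illustrative only and not part of this patch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for the global guard symbol the compiler references. */
static unsigned long demo_guard = 0x00000aff594ca0feUL;

static void demo_fail(void)
{
	/* The kernel equivalent would be __stack_chk_fail(), which panics. */
	fprintf(stderr, "stack smashing detected\n");
	abort();
}

static void protected_function(const char *input)
{
	unsigned long canary = demo_guard;	/* prologue: stash the guard */
	char buf[16];

	/* function body: an overrun of buf could clobber the stashed canary */
	strncpy(buf, input, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	if (canary != demo_guard)		/* epilogue: recheck before return */
		demo_fail();
}

int main(void)
{
	protected_function("hello");
	return 0;
}

With a global guard, every CPU and task would otherwise share one canary value, which is presumably why the patch records the guard accesses and patches them at boot instead of relying on the plain __stack_chk_guard symbol.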
diff --git a/arch/s390/configs/compat.config b/arch/s390/configs/compat.config deleted file mode 100644 index 6fd051453ae8..000000000000 --- a/arch/s390/configs/compat.config +++ /dev/null @@ -1,3 +0,0 @@ -# Help: Enable compat support -CONFIG_COMPAT=y -CONFIG_COMPAT_32BIT_TIME=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 8433f769f7e1..1df484ed6329 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -796,6 +796,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m @@ -809,8 +810,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 4414dabd04a6..df89105dd520 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -780,6 +780,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m @@ -794,8 +795,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig index 03f73fbd38b6..f838ca055f6d 100644 --- a/arch/s390/crypto/Kconfig +++ b/arch/s390/crypto/Kconfig @@ -2,26 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (s390)" -config CRYPTO_SHA3_256_S390 - tristate "Hash functions: SHA3-224 and SHA3-256" - select CRYPTO_HASH - help - SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. - -config CRYPTO_SHA3_512_S390 - tristate "Hash functions: SHA3-384 and SHA3-512" - select CRYPTO_HASH - help - SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. - config CRYPTO_GHASH_S390 tristate "Hash functions: GHASH" select CRYPTO_HASH diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 998f4b656b18..387a229e1038 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -3,8 +3,6 @@ # Cryptographic API # -obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 5d36f4020dfa..d0a295435680 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -14,8 +14,7 @@ * Derived from "crypto/aes_generic.c" */ -#define KMSG_COMPONENT "aes_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "aes_s390: " fmt #include <crypto/aes.h> #include <crypto/algapi.h> diff --git a/arch/s390/crypto/hmac_s390.c b/arch/s390/crypto/hmac_s390.c index 58444da9b004..f8cd09f341d4 100644 --- a/arch/s390/crypto/hmac_s390.c +++ b/arch/s390/crypto/hmac_s390.c @@ -5,8 +5,7 @@ * s390 specific HMAC support. 
*/ -#define KMSG_COMPONENT "hmac_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hmac_s390: " fmt #include <asm/cpacf.h> #include <crypto/internal/hash.h> diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index a624a43a2b54..64aef7eb2030 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -10,8 +10,7 @@ * Harald Freudenberger <freude@de.ibm.com> */ -#define KMSG_COMPONENT "paes_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "paes_s390: " fmt #include <linux/atomic.h> #include <linux/cpufeature.h> diff --git a/arch/s390/crypto/phmac_s390.c b/arch/s390/crypto/phmac_s390.c index 89f3e6d8fd89..88342bd4c37a 100644 --- a/arch/s390/crypto/phmac_s390.c +++ b/arch/s390/crypto/phmac_s390.c @@ -5,8 +5,7 @@ * s390 specific HMAC support for protected keys. */ -#define KMSG_COMPONENT "phmac_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "phmac_s390: " fmt #include <asm/cpacf.h> #include <asm/pkey.h> diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 2becd77df741..84e3d3c6ba09 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -6,8 +6,7 @@ * Driver for the s390 pseudo random number generator */ -#define KMSG_COMPONENT "prng" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "prng: " fmt #include <linux/fs.h> #include <linux/fips.h> diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h deleted file mode 100644 index b9cd9572dd35..000000000000 --- a/arch/s390/crypto/sha.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#ifndef _CRYPTO_ARCH_S390_SHA_H -#define _CRYPTO_ARCH_S390_SHA_H - -#include <crypto/hash.h> -#include <crypto/sha2.h> -#include <crypto/sha3.h> -#include <linux/build_bug.h> -#include <linux/types.h> - -/* must be big enough for the largest SHA variant */ -#define CPACF_MAX_PARMBLOCK_SIZE SHA3_STATE_SIZE -#define SHA_MAX_BLOCK_SIZE SHA3_224_BLOCK_SIZE - -struct s390_sha_ctx { - u64 count; /* message length in bytes */ - union { - u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)]; - struct { - u64 state[SHA512_DIGEST_SIZE / sizeof(u64)]; - u64 count_hi; - } sha512; - struct { - __le64 state[SHA3_STATE_SIZE / sizeof(u64)]; - } sha3; - }; - int func; /* KIMD function to use */ - bool first_message_part; -}; - -struct shash_desc; - -int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, - unsigned int len); -int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out); - -static inline void __check_s390_sha_ctx_size(void) -{ - BUILD_BUG_ON(S390_SHA_CTX_SIZE != sizeof(struct s390_sha_ctx)); -} - -#endif diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c deleted file mode 100644 index 03bb4f4bab70..000000000000 --- a/arch/s390/crypto/sha3_256_s390.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. - * - * s390 Version: - * Copyright IBM Corp. 
2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include <asm/cpacf.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include "sha.h" - -static int sha3_256_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->first_message_part = test_facility(86); - if (!sctx->first_message_part) - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_256_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u64 *u64; - } p = { .u8 = out }; - int i; - - if (sctx->first_message_part) { - memset(out, 0, SHA3_STATE_SIZE); - return 0; - } - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++); - return 0; -} - -static int sha3_256_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++)); - sctx->count = 0; - sctx->first_message_part = 0; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_224_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_256_import(desc, in); - sctx->func = CPACF_KIMD_SHA3_224; - return 0; -} - -static struct shash_alg sha3_256_alg = { - .digestsize = SHA3_256_DIGEST_SIZE, /* = 32 */ - .init = sha3_256_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_256_export, - .import = sha3_256_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-256", - .cra_driver_name = "sha3-256-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int sha3_224_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_256_init(desc); - sctx->func = CPACF_KIMD_SHA3_224; - return 0; -} - -static struct shash_alg sha3_224_alg = { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = sha3_224_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_256_export, /* same as for 256 */ - .import = sha3_224_import, /* function code different! 
*/ - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-224", - .cra_driver_name = "sha3-224-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha3_256_s390_init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_256)) - return -ENODEV; - - ret = crypto_register_shash(&sha3_256_alg); - if (ret < 0) - goto out; - - ret = crypto_register_shash(&sha3_224_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_256_alg); -out: - return ret; -} - -static void __exit sha3_256_s390_fini(void) -{ - crypto_unregister_shash(&sha3_224_alg); - crypto_unregister_shash(&sha3_256_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); -module_exit(sha3_256_s390_fini); - -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-256 and SHA3-224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c deleted file mode 100644 index a5c9690eecb1..000000000000 --- a/arch/s390/crypto/sha3_512_s390.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA512 and SHA384 Secure Hash Algorithm. - * - * Copyright IBM Corp. 2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include <asm/cpacf.h> -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/cpufeature.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include "sha.h" - -static int sha3_512_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->first_message_part = test_facility(86); - if (!sctx->first_message_part) - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_512_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u64 *u64; - } p = { .u8 = out }; - int i; - - if (sctx->first_message_part) { - memset(out, 0, SHA3_STATE_SIZE); - return 0; - } - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++); - return 0; -} - -static int sha3_512_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++)); - sctx->count = 0; - sctx->first_message_part = 0; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_384_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_512_import(desc, in); - sctx->func = CPACF_KIMD_SHA3_384; - return 0; -} - -static struct shash_alg sha3_512_alg = { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = sha3_512_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_512_export, - .import = sha3_512_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-512", - .cra_driver_name = "sha3-512-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; 
- -MODULE_ALIAS_CRYPTO("sha3-512"); - -static int sha3_384_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_512_init(desc); - sctx->func = CPACF_KIMD_SHA3_384; - return 0; -} - -static struct shash_alg sha3_384_alg = { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = sha3_384_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_512_export, /* same as for 512 */ - .import = sha3_384_import, /* function code different! */ - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-384", - .cra_driver_name = "sha3-384-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_384_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_sha_ctx), - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha3-384"); - -static int __init init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_512)) - return -ENODEV; - ret = crypto_register_shash(&sha3_512_alg); - if (ret < 0) - goto out; - ret = crypto_register_shash(&sha3_384_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_512_alg); -out: - return ret; -} - -static void __exit fini(void) -{ - crypto_unregister_shash(&sha3_512_alg); - crypto_unregister_shash(&sha3_384_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-512 and SHA3-384 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c deleted file mode 100644 index d6f839618794..000000000000 --- a/arch/s390/crypto/sha_common.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 
2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ - -#include <crypto/internal/hash.h> -#include <linux/export.h> -#include <linux/module.h> -#include <asm/cpacf.h> -#include "sha.h" - -int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - unsigned int bsize = crypto_shash_blocksize(desc->tfm); - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - unsigned int n; - int fc; - - fc = ctx->func; - if (ctx->first_message_part) - fc |= CPACF_KIMD_NIP; - - /* process as many blocks as possible */ - n = (len / bsize) * bsize; - ctx->count += n; - switch (ctx->func) { - case CPACF_KLMD_SHA_512: - case CPACF_KLMD_SHA3_384: - if (ctx->count < n) - ctx->sha512.count_hi++; - break; - } - cpacf_kimd(fc, ctx->state, data, n); - ctx->first_message_part = 0; - return len - n; -} -EXPORT_SYMBOL_GPL(s390_sha_update_blocks); - -static int s390_crypto_shash_parmsize(int func) -{ - switch (func) { - case CPACF_KLMD_SHA_1: - return 20; - case CPACF_KLMD_SHA_256: - return 32; - case CPACF_KLMD_SHA_512: - return 64; - case CPACF_KLMD_SHA3_224: - case CPACF_KLMD_SHA3_256: - case CPACF_KLMD_SHA3_384: - case CPACF_KLMD_SHA3_512: - return 200; - default: - return -EINVAL; - } -} - -int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out) -{ - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - int mbl_offset, fc; - u64 bits; - - ctx->count += len; - - bits = ctx->count * 8; - mbl_offset = s390_crypto_shash_parmsize(ctx->func); - if (mbl_offset < 0) - return -EINVAL; - - mbl_offset = mbl_offset / sizeof(u32); - - /* set total msg bit length (mbl) in CPACF parmblock */ - switch (ctx->func) { - case CPACF_KLMD_SHA_512: - /* The SHA512 parmblock has a 128-bit mbl field. */ - if (ctx->count < len) - ctx->sha512.count_hi++; - ctx->sha512.count_hi <<= 3; - ctx->sha512.count_hi |= ctx->count >> 61; - mbl_offset += sizeof(u64) / sizeof(u32); - fallthrough; - case CPACF_KLMD_SHA_1: - case CPACF_KLMD_SHA_256: - memcpy(ctx->state + mbl_offset, &bits, sizeof(bits)); - break; - case CPACF_KLMD_SHA3_224: - case CPACF_KLMD_SHA3_256: - case CPACF_KLMD_SHA3_384: - case CPACF_KLMD_SHA3_512: - break; - default: - return -EINVAL; - } - - fc = ctx->func; - fc |= test_facility(86) ? 
CPACF_KLMD_DUFOP : 0; - if (ctx->first_message_part) - fc |= CPACF_KLMD_NIP; - cpacf_klmd(fc, ctx->state, src, len); - - /* copy digest to out */ - memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm)); - - return 0; -} -EXPORT_SYMBOL_GPL(s390_sha_finup); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("s390 SHA cipher common functions");
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index c8af67d20994..08777f57bd24 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -7,8 +7,7 @@ * Author(s): Michael Holzheu <holzheu@de.ibm.com> */ -#define KMSG_COMPONENT "hypfs" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hypfs: " fmt #include <linux/types.h> #include <linux/errno.h>
diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c index ede951dc0085..013da4ff9802 100644 --- a/arch/s390/hypfs/hypfs_diag_fs.c +++ b/arch/s390/hypfs/hypfs_diag_fs.c @@ -7,8 +7,7 @@ * Author(s): Michael Holzheu <holzheu@de.ibm.com> */ -#define KMSG_COMPONENT "hypfs" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hypfs: " fmt #include <linux/types.h> #include <linux/errno.h>
diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c index a2952ed5518b..a72576221cab 100644 --- a/arch/s390/hypfs/hypfs_sprp.c +++ b/arch/s390/hypfs/hypfs_sprp.c @@ -7,7 +7,6 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/compat.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/string.h> @@ -116,10 +115,7 @@ static long hypfs_sprp_ioctl(struct file *file, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (is_compat_task()) - argp = compat_ptr(arg); - else - argp = (void __user *) arg; + argp = (void __user *)arg; switch (cmd) { case HYPFS_DIAG304: return __hypfs_sprp_ioctl(argp);
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 96409573c75d..ee5cfa8f71a0 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -6,8 +6,7 @@ * Author(s): Michael Holzheu <holzheu@de.ibm.com> */ -#define KMSG_COMPONENT "hypfs" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hypfs: " fmt #include <linux/types.h> #include <linux/errno.h>
diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index 56817990c73d..b24459f692fa 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -38,16 +38,30 @@ typedef unsigned int ap_qid_t; * The ap queue status word is returned by all three AP functions * (PQAP, NQAP and DQAP). There's a set of flags in the first * byte, followed by a 1 byte response code. + * + * For convenience, the 'value' field provides 32-bit access to the + * whole status word, while the 'status_bits' and 'rc' fields cover + * the leftmost 8 status bits and the response_code.
*/ struct ap_queue_status { - unsigned int queue_empty : 1; - unsigned int replies_waiting : 1; - unsigned int queue_full : 1; - unsigned int : 3; - unsigned int async : 1; - unsigned int irq_enabled : 1; - unsigned int response_code : 8; - unsigned int : 16; + union { + unsigned int value : 32; + struct { + unsigned int status_bits : 8; + unsigned int rc : 8; + unsigned int : 16; + }; + struct { + unsigned int queue_empty : 1; + unsigned int replies_waiting : 1; + unsigned int queue_full : 1; + unsigned int : 3; + unsigned int async : 1; + unsigned int irq_enabled : 1; + unsigned int response_code : 8; + unsigned int : 16; + }; + }; }; /* diff --git a/arch/s390/include/asm/arch-stackprotector.h b/arch/s390/include/asm/arch-stackprotector.h new file mode 100644 index 000000000000..953627259e91 --- /dev/null +++ b/arch/s390/include/asm/arch-stackprotector.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_ARCH_STACKPROTECTOR_H +#define _ASM_S390_ARCH_STACKPROTECTOR_H + +extern unsigned long __stack_chk_guard; +extern int stack_protector_debug; + +void __stack_protector_apply_early(unsigned long kernel_start); +int __stack_protector_apply(unsigned long *start, unsigned long *end, unsigned long kernel_start); + +static inline void stack_protector_apply_early(unsigned long kernel_start) +{ + if (IS_ENABLED(CONFIG_STACKPROTECTOR)) + __stack_protector_apply_early(kernel_start); +} + +static inline int stack_protector_apply(unsigned long *start, unsigned long *end) +{ + if (IS_ENABLED(CONFIG_STACKPROTECTOR)) + return __stack_protector_apply(start, end, 0); + return 0; +} + +#endif /* _ASM_S390_ARCH_STACKPROTECTOR_H */ diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index b6b619f340a5..0a82ae2300b6 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -18,6 +18,8 @@ #include <asm/scsw.h> +#define CCW_MAX_BYTE_COUNT 65535 + /** * struct ccw1 - channel command word * @cmd_code: command code diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h deleted file mode 100644 index 3cb9d813f022..000000000000 --- a/arch/s390/include/asm/compat.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390X_COMPAT_H -#define _ASM_S390X_COMPAT_H -/* - * Architecture specific compatibility types - */ -#include <linux/types.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/thread_info.h> -#include <asm/ptrace.h> - -#define compat_mode_t compat_mode_t -typedef u16 compat_mode_t; - -#define __compat_uid_t __compat_uid_t -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; - -#define compat_dev_t compat_dev_t -typedef u16 compat_dev_t; - -#define compat_ipc_pid_t compat_ipc_pid_t -typedef u16 compat_ipc_pid_t; - -#define compat_statfs compat_statfs - -#include <asm-generic/compat.h> - -#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ - typeof(0?(__force t)0:0ULL), u64)) - -#define __SC_DELOUSE(t,v) ({ \ - BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \ - (__force t)(__TYPE_IS_PTR(t) ? 
((v) & 0x7fffffff) : (v)); \ -}) - -#define PSW32_MASK_USER 0x0000FF00UL - -#define PSW32_USER_BITS (PSW32_MASK_DAT | PSW32_MASK_IO | PSW32_MASK_EXT | \ - PSW32_DEFAULT_KEY | PSW32_MASK_BASE | \ - PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \ - PSW32_ASC_PRIMARY) - -#define COMPAT_UTS_MACHINE "s390\0\0\0\0" - -typedef u16 compat_nlink_t; - -typedef struct { - u32 mask; - u32 addr; -} __aligned(8) psw_compat_t; - -typedef struct { - psw_compat_t psw; - u32 gprs[NUM_GPRS]; - u32 acrs[NUM_ACRS]; - u32 orig_gpr2; -} s390_compat_regs; - -typedef struct { - u32 gprs_high[NUM_GPRS]; -} s390_compat_regs_high; - -struct compat_stat { - compat_dev_t st_dev; - u16 __pad1; - compat_ino_t st_ino; - compat_mode_t st_mode; - compat_nlink_t st_nlink; - __compat_uid_t st_uid; - __compat_gid_t st_gid; - compat_dev_t st_rdev; - u16 __pad2; - u32 st_size; - u32 st_blksize; - u32 st_blocks; - u32 st_atime; - u32 st_atime_nsec; - u32 st_mtime; - u32 st_mtime_nsec; - u32 st_ctime; - u32 st_ctime_nsec; - u32 __unused4; - u32 __unused5; -}; - -struct compat_statfs { - u32 f_type; - u32 f_bsize; - u32 f_blocks; - u32 f_bfree; - u32 f_bavail; - u32 f_files; - u32 f_ffree; - compat_fsid_t f_fsid; - u32 f_namelen; - u32 f_frsize; - u32 f_flags; - u32 f_spare[4]; -}; - -struct compat_statfs64 { - u32 f_type; - u32 f_bsize; - u64 f_blocks; - u64 f_bfree; - u64 f_bavail; - u64 f_files; - u64 f_ffree; - compat_fsid_t f_fsid; - u32 f_namelen; - u32 f_frsize; - u32 f_flags; - u32 f_spare[5]; -}; - -/* - * A pointer passed in from user mode. This should not - * be used for syscall parameters, just declare them - * as pointers because the syscall entry code will have - * appropriately converted them already. - */ - -static inline void __user *compat_ptr(compat_uptr_t uptr) -{ - return (void __user *)(unsigned long)(uptr & 0x7fffffffUL); -} -#define compat_ptr(uptr) compat_ptr(uptr) - -#ifdef CONFIG_COMPAT - -static inline int is_compat_task(void) -{ - return test_thread_flag(TIF_31BIT); -} - -#endif - -#endif /* _ASM_S390X_COMPAT_H */ diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h index 6c6a99660e78..d6fb999c8c6d 100644 --- a/arch/s390/include/asm/cpufeature.h +++ b/arch/s390/include/asm/cpufeature.h @@ -27,7 +27,6 @@ int cpu_have_feature(unsigned int nr); #define cpu_has_edat1() test_facility(8) #define cpu_has_edat2() test_facility(78) #define cpu_has_gs() test_facility(133) -#define cpu_has_idte() test_facility(3) #define cpu_has_nx() test_facility(130) #define cpu_has_rdp() test_facility(194) #define cpu_has_seq_insn() test_facility(85) diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index a03df312081e..bb63fa4d20bb 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -162,8 +162,6 @@ enum { * ELF register definitions.. 
*/ -#include <linux/compat.h> - #include <asm/ptrace.h> #include <asm/syscall.h> #include <asm/user.h> @@ -171,9 +169,6 @@ enum { typedef s390_fp_regs elf_fpregset_t; typedef s390_regs elf_gregset_t; -typedef s390_fp_regs compat_elf_fpregset_t; -typedef s390_compat_regs compat_elf_gregset_t; - #include <linux/sched/mm.h> /* for task_struct */ #include <asm/mmu_context.h> @@ -183,10 +178,6 @@ typedef s390_compat_regs compat_elf_gregset_t; #define elf_check_arch(x) \ (((x)->e_machine == EM_S390 || (x)->e_machine == EM_S390_OLD) \ && (x)->e_ident[EI_CLASS] == ELF_CLASS) -#define compat_elf_check_arch(x) \ - (((x)->e_machine == EM_S390 || (x)->e_machine == EM_S390_OLD) \ - && (x)->e_ident[EI_CLASS] == ELF_CLASS) -#define compat_start_thread start_thread31 /* For SVR4/S390 the function pointer to be registered with `atexit` is passed in R14. */ @@ -203,9 +194,7 @@ typedef s390_compat_regs compat_elf_gregset_t; the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. 64-bit tasks are aligned to 4GB. */ -#define ELF_ET_DYN_BASE (is_compat_task() ? \ - (STACK_TOP / 3 * 2) : \ - (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) +#define ELF_ET_DYN_BASE ((STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. */ @@ -224,43 +213,21 @@ extern unsigned long elf_hwcap; extern char elf_platform[]; #define ELF_PLATFORM (elf_platform) -#ifndef CONFIG_COMPAT #define SET_PERSONALITY(ex) \ do { \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ - current->thread.sys_call_table = sys_call_table; \ -} while (0) -#else /* CONFIG_COMPAT */ -#define SET_PERSONALITY(ex) \ -do { \ - if (personality(current->personality) != PER_LINUX32) \ - set_personality(PER_LINUX | \ - (current->personality & ~PER_MASK)); \ - if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \ - set_thread_flag(TIF_31BIT); \ - current->thread.sys_call_table = \ - sys_call_table_emu; \ - } else { \ - clear_thread_flag(TIF_31BIT); \ - current->thread.sys_call_table = \ - sys_call_table; \ - } \ } while (0) -#endif /* CONFIG_COMPAT */ /* * Cache aliasing on the latest machines calls for a mapping granularity - * of 512KB for the anonymous mapping base. For 64-bit processes use a - * 512KB alignment and a randomization of up to 1GB. For 31-bit processes - * the virtual address space is limited, use no alignment and limit the - * randomization to 8MB. - * For the additional randomization of the program break use 32MB for - * 64-bit and 8MB for 31-bit. + * of 512KB for the anonymous mapping base. Use a 512KB alignment and a + * randomization of up to 1GB. + * For the additional randomization of the program break use 32MB. */ -#define BRK_RND_MASK (is_compat_task() ? 0x7ffUL : 0x1fffUL) -#define MMAP_RND_MASK (is_compat_task() ? 0x7ffUL : 0x3ff80UL) -#define MMAP_ALIGN_MASK (is_compat_task() ? 
0 : 0x7fUL) +#define BRK_RND_MASK (0x1fffUL) +#define MMAP_RND_MASK (0x3ff80UL) +#define MMAP_ALIGN_MASK (0x7fUL) #define STACK_RND_MASK MMAP_RND_MASK /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
diff --git a/arch/s390/include/asm/fpu-insn.h b/arch/s390/include/asm/fpu-insn.h index e99f8bca8e08..96727f3bd0dc 100644 --- a/arch/s390/include/asm/fpu-insn.h +++ b/arch/s390/include/asm/fpu-insn.h @@ -12,6 +12,7 @@ #ifndef __ASSEMBLER__ #include <linux/instrumented.h> +#include <linux/kmsan.h> #include <asm/asm-extable.h> asm(".include \"asm/fpu-insn-asm.h\"\n"); @@ -393,6 +394,7 @@ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) : [vxr] "=Q" (*(u8 *)vxr) : [index] "d" (index), [v1] "I" (v1) : "memory"); + kmsan_unpoison_memory(vxr, size); } #else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ @@ -409,6 +411,7 @@ static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) : [vxr] "=R" (*(u8 *)vxr) : [index] "d" (index), [v1] "I" (v1) : "memory", "1"); + kmsan_unpoison_memory(vxr, size); } #endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index bee2d16c2951..692c484ec163 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -105,28 +105,11 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsi } #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ -/* - * Even though the system call numbers are identical for s390/s390x a - * different system call table is used for compat tasks. This may lead - * to e.g. incorrect or missing trace event sysfs files. - * Therefore simply do not trace compat system calls at all. - * See kernel/trace/trace_syscalls.c. - */ -#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS -static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) -{ - return is_compat_task(); -} - #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) { - /* - * Skip __s390_ and __s390x_ prefix - due to compat wrappers - * and aliasing some symbols of 64 bit system call functions - * may get the __s390_ prefix instead of the __s390x_ prefix. - */ + /* Skip the __s390x_ prefix. */ return !strcmp(sym + 7, name) || !strcmp(sym + 8, name); }
diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index ac68c657b28c..e5000ee6cdc6 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -181,6 +181,83 @@ static inline void idal_buffer_free(struct idal_buffer *ib) } /* + * Allocate an array of IDAL buffers to cover a total data size of @size. The + * resulting array is null-terminated. + * + * The number of individual IDAL buffers is determined by @size. + * Each IDAL buffer can have a maximum size of @CCW_MAX_BYTE_COUNT.
+ */ +static inline struct idal_buffer **idal_buffer_array_alloc(size_t size, int page_order) +{ + struct idal_buffer **ibs; + size_t ib_size; /* Size of a single idal buffer */ + int count; /* Number of individual idal buffers */ + int i; + + count = (size + CCW_MAX_BYTE_COUNT - 1) / CCW_MAX_BYTE_COUNT; + ibs = kmalloc_array(count + 1, sizeof(*ibs), GFP_KERNEL); + if (!ibs) + return ERR_PTR(-ENOMEM); + for (i = 0; i < count; i++) { + /* Determine size for the current idal buffer */ + ib_size = min(size, CCW_MAX_BYTE_COUNT); + size -= ib_size; + ibs[i] = idal_buffer_alloc(ib_size, page_order); + if (IS_ERR(ibs[i])) { + while (i--) + idal_buffer_free(ibs[i]); + kfree(ibs); + return ERR_PTR(-ENOMEM); + } + } + ibs[i] = NULL; + return ibs; +} + +/* + * Free array of IDAL buffers + */ +static inline void idal_buffer_array_free(struct idal_buffer ***ibs) +{ + struct idal_buffer **p; + + if (!ibs || !*ibs) + return; + for (p = *ibs; *p; p++) + idal_buffer_free(*p); + kfree(*ibs); + *ibs = NULL; +} + +/* + * Determine size of IDAL buffer array + */ +static inline int idal_buffer_array_size(struct idal_buffer **ibs) +{ + int size = 0; + + while (ibs && *ibs) { + size++; + ibs++; + } + return size; +} + +/* + * Determine total data size covered by IDAL buffer array + */ +static inline size_t idal_buffer_array_datasize(struct idal_buffer **ibs) +{ + size_t size = 0; + + while (ibs && *ibs) { + size += (*ibs)->size; + ibs++; + } + return size; +} + +/* * Test if a idal list is really needed. */ static inline bool __idal_buffer_is_needed(struct idal_buffer *ib)
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index d9c853db9a40..50ffe75adeb4 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -100,7 +100,8 @@ struct lowcore { /* Save areas. */ __u64 save_area[8]; /* 0x0200 */ - __u8 pad_0x0240[0x0280-0x0240]; /* 0x0240 */ + __u64 stack_canary; /* 0x0240 */ + __u8 pad_0x0248[0x0280-0x0248]; /* 0x0248 */ __u64 save_area_restart[1]; /* 0x0280 */ __u64 pcpu; /* 0x0288 */
diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h index ebeabd0aaa51..534d0320e2aa 100644 --- a/arch/s390/include/asm/pai.h +++ b/arch/s390/include/asm/pai.h @@ -77,6 +77,7 @@ static __always_inline void pai_kernel_exit(struct pt_regs *regs) #define PAI_SAVE_AREA(x) ((x)->hw.event_base) #define PAI_CPU_MASK(x) ((x)->hw.addr_filters) +#define PAI_PMU_IDX(x) ((x)->hw.last_tag) #define PAI_SWLIST(x) (&(x)->hw.tp_list) #endif
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 965886dfe954..5899f57f17d1 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -13,14 +13,6 @@ #define __my_cpu_offset get_lowcore()->percpu_offset /* - * For 64 bit module code, the module may be more than 4G above the - * per cpu area, use weak definitions to force the compiler to - * generate external references. - * Therefore, we have enabled CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU - * in the Kconfig. - */ - -/* * We use a compare-and-swap loop since that uses less cpu cycles than * disabling and enabling interrupts like the generic variant would do.
*/ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 6663f1619abb..bca9b29778c3 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -648,18 +648,6 @@ static inline int mm_uses_skeys(struct mm_struct *mm) return 0; } -static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new) -{ - union register_pair r1 = { .even = old, .odd = new, }; - unsigned long address = (unsigned long)ptr | 1; - - asm volatile( - " csp %[r1],%[address]" - : [r1] "+&d" (r1.pair), "+m" (*ptr) - : [address] "d" (address) - : "cc"); -} - /** * cspg() - Compare and Swap and Purge (CSPG) * @ptr: Pointer to the value to be exchanged @@ -1400,7 +1388,6 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long addr, int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste); -void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); @@ -1690,10 +1677,10 @@ static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot) #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ -static inline void __pmdp_csp(pmd_t *pmdp) +static inline void __pmdp_cspg(pmd_t *pmdp) { - csp((unsigned int *)pmdp + 1, pmd_val(*pmdp), - pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); + cspg((unsigned long *)pmdp, pmd_val(*pmdp), + pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); } #define IDTE_GLOBAL 0 diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 93e1034485d7..3affba95845b 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -119,18 +119,12 @@ extern void execve_tail(void); unsigned long vdso_text_size(void); unsigned long vdso_size(void); -/* - * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit. - */ - -#define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \ - _REGION3_SIZE : TASK_SIZE_MAX) -#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ - (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1)) +#define TASK_SIZE (TASK_SIZE_MAX) +#define TASK_UNMAPPED_BASE (_REGION2_SIZE >> 1) #define TASK_SIZE_MAX (-PAGE_SIZE) #define VDSO_BASE (STACK_TOP + PAGE_SIZE) -#define VDSO_LIMIT (test_thread_flag(TIF_31BIT) ? _REGION3_SIZE : _REGION2_SIZE) +#define VDSO_LIMIT (_REGION2_SIZE) #define STACK_TOP (VDSO_LIMIT - vdso_size() - PAGE_SIZE) #define STACK_TOP_MAX (_REGION2_SIZE - vdso_size() - PAGE_SIZE) @@ -181,7 +175,6 @@ struct thread_struct { unsigned long system_timer; /* task cputime in kernel space */ unsigned long hardirq_timer; /* task cputime in hardirq context */ unsigned long softirq_timer; /* task cputime in softirq context */ - const sys_call_ptr_t *sys_call_table; /* system call table address */ union teid gmap_teid; /* address and flags of last gmap fault */ unsigned int gmap_int_code; /* int code of last gmap fault */ int ufpu_flags; /* user fpu flags */ @@ -379,14 +372,19 @@ static inline void local_mcck_enable(void) /* * Rewind PSW instruction address by specified number of bytes. */ -static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc) +static inline unsigned long __rewind_psw(psw_t psw, long ilen) { unsigned long mask; mask = (psw.mask & PSW_MASK_EA) ? -1UL : (psw.mask & PSW_MASK_BA) ? 
(1UL << 31) - 1 : (1UL << 24) - 1; - return (psw.addr - ilc) & mask; + return (psw.addr - ilen) & mask; +} + +static inline unsigned long __forward_psw(psw_t psw, long ilen) +{ + return __rewind_psw(psw, -ilen); } /*
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index dfa770b15fad..962cf042c66d 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -8,16 +8,19 @@ #define _S390_PTRACE_H #include <linux/bits.h> +#include <linux/typecheck.h> #include <uapi/asm/ptrace.h> #include <asm/thread_info.h> #include <asm/tpi.h> #define PIF_SYSCALL 0 /* inside a system call */ +#define PIF_PSW_ADDR_ADJUSTED 1 /* psw address has been adjusted */ #define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ #define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ #define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */ #define _PIF_SYSCALL BIT(PIF_SYSCALL) +#define _PIF_PSW_ADDR_ADJUSTED BIT(PIF_PSW_ADDR_ADJUSTED) #define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) #define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) #define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS) @@ -99,7 +102,7 @@ enum { typedef struct { unsigned int mask; unsigned int addr; -} psw_t32 __aligned(8); +} psw32_t __aligned(8); #define PGM_INT_CODE_MASK 0x7f #define PGM_INT_CODE_PER 0x80
diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h index 71d46f0ba97b..f904b674fee0 100644 --- a/arch/s390/include/asm/seccomp.h +++ b/arch/s390/include/asm/seccomp.h @@ -19,10 +19,5 @@ #define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X #define SECCOMP_ARCH_NATIVE_NR NR_syscalls #define SECCOMP_ARCH_NATIVE_NAME "s390x" -#ifdef CONFIG_COMPAT -# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390 -# define SECCOMP_ARCH_COMPAT_NR NR_syscalls -# define SECCOMP_ARCH_COMPAT_NAME "s390" -#endif #endif /* _ASM_S390_SECCOMP_H */
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 03f4d01664f8..fb2bdbf35da5 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -43,7 +43,7 @@ extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -extern void smp_call_ipl_cpu(void (*func)(void *), void *); +extern void __noreturn smp_call_ipl_cpu(void (*func)(void *), void *data); extern void smp_emergency_stop(void); extern int smp_find_processor_id(u16 address);
diff --git a/arch/s390/include/asm/stackprotector.h b/arch/s390/include/asm/stackprotector.h new file mode 100644 index 000000000000..0497850103dd --- /dev/null +++ b/arch/s390/include/asm/stackprotector.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_STACKPROTECTOR_H +#define _ASM_S390_STACKPROTECTOR_H + +#include <linux/sched.h> +#include <asm/current.h> +#include <asm/lowcore.h> + +static __always_inline void boot_init_stack_canary(void) +{ + current->stack_canary = get_random_canary(); + get_lowcore()->stack_canary = current->stack_canary; +} + +#endif /* _ASM_S390_STACKPROTECTOR_H */
diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 10ce5c4ccbd6..4271e4169f45 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -15,7 +15,6 @@ #include <asm/ptrace.h> extern const sys_call_ptr_t sys_call_table[]; -extern const sys_call_ptr_t sys_call_table_emu[]; static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) @@
-46,15 +45,7 @@ static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { unsigned long error = regs->gprs[2]; -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) { - /* - * Sign-extend the value so (int)-EFOO becomes (long)-EFOO - * and will match correctly in comparisons. - */ - error = (long)(int)error; - } -#endif + return IS_ERR_VALUE(error) ? error : 0; } @@ -78,10 +69,6 @@ static inline void syscall_get_arguments(struct task_struct *task, { unsigned long mask = -1UL; -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) - mask = 0xffffffff; -#endif for (int i = 1; i < 6; i++) args[i] = regs->gprs[2 + i] & mask; @@ -99,10 +86,6 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(struct task_struct *task) { -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) - return AUDIT_ARCH_S390; -#endif return AUDIT_ARCH_S390X; } diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 35c1d1b860d8..9eb58d5348d8 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -13,101 +13,12 @@ ,, regs->orig_gpr2,, regs->gprs[3],, regs->gprs[4] \ ,, regs->gprs[5],, regs->gprs[6],, regs->gprs[7]) -#ifdef CONFIG_COMPAT - -#define __SC_COMPAT_CAST(t, a) \ -({ \ - long __ReS = a; \ - \ - BUILD_BUG_ON((sizeof(t) > 4) && !__TYPE_IS_L(t) && \ - !__TYPE_IS_UL(t) && !__TYPE_IS_PTR(t) && \ - !__TYPE_IS_LL(t)); \ - if (__TYPE_IS_L(t)) \ - __ReS = (s32)a; \ - if (__TYPE_IS_UL(t)) \ - __ReS = (u32)a; \ - if (__TYPE_IS_PTR(t)) \ - __ReS = a & 0x7fffffff; \ - if (__TYPE_IS_LL(t)) \ - return -ENOSYS; \ - (t)__ReS; \ -}) - -/* - * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias - * named __s390x_sys_*() - */ -#define COMPAT_SYSCALL_DEFINE0(sname) \ - long __s390_compat_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ - long __s390_compat_sys_##sname(void) - -#define SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - long __s390_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390_sys_##sname, ERRNO); \ - long __s390x_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ - static inline long __do_sys_##sname(void); \ - long __s390_sys_##sname(void) \ - { \ - return __do_sys_##sname(); \ - } \ - long __s390x_sys_##sname(void) \ - { \ - return __do_sys_##sname(); \ - } \ - static inline long __do_sys_##sname(void) - -#define COND_SYSCALL(name) \ - cond_syscall(__s390x_sys_##name); \ - cond_syscall(__s390_sys_##name) - -#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - long __s390_compat_sys##name(struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ - static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ - static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ - long __s390_compat_sys##name(struct pt_regs *regs) \ - { \ - return __se_compat_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ - } \ - static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ - { \ - __MAP(x, __SC_TEST, __VA_ARGS__); \ - return __do_compat_sys##name(__MAP(x, __SC_DELOUSE, __VA_ARGS__)); \ - } \ - static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) - -/* - * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. 
- */ -#define COND_SYSCALL_COMPAT(name) \ - cond_syscall(__s390_compat_sys_##name) - -#define __S390_SYS_STUBx(x, name, ...) \ - long __s390_sys##name(struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ - static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ - long __s390_sys##name(struct pt_regs *regs) \ - { \ - return ___se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ - } \ - static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ - { \ - __MAP(x, __SC_TEST, __VA_ARGS__); \ - return __do_sys##name(__MAP(x, __SC_COMPAT_CAST, __VA_ARGS__)); \ - } - -#else /* CONFIG_COMPAT */ - #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ - long __s390x_sys_##sname(void); \ + long __s390x_sys_##sname(struct pt_regs *__unused); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ static inline long __do_sys_##sname(void); \ - long __s390x_sys_##sname(void) \ + long __s390x_sys_##sname(struct pt_regs *__unused) \ { \ return __do_sys_##sname(); \ } \ @@ -118,8 +29,6 @@ #define __S390_SYS_STUBx(x, fullname, name, ...) -#endif /* CONFIG_COMPAT */ - #define __SYSCALL_DEFINEx(x, name, ...) \ long __s390x_sys##name(struct pt_regs *regs); \ ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 7878e9bfbf07..6a548a819400 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -69,7 +69,6 @@ void arch_setup_new_exec(void); #define TIF_GUARDED_STORAGE 17 /* load guarded storage control block */ #define TIF_ISOLATE_BP_GUEST 18 /* Run KVM guests with isolated BP */ #define TIF_PER_TRAP 19 /* Need to handle PER trap on exit to usermode */ -#define TIF_31BIT 20 /* 32bit process */ #define TIF_SINGLE_STEP 21 /* This task is single stepped */ #define TIF_BLOCK_STEP 22 /* This task is block stepped */ #define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */ @@ -78,7 +77,6 @@ void arch_setup_new_exec(void); #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) -#define _TIF_31BIT BIT(TIF_31BIT) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) #define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) #define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 75491baa2197..163ccbbe8c47 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -35,9 +35,9 @@ static inline void __tlb_flush_idte(unsigned long asce) */ static inline void __tlb_flush_global(void) { - unsigned int dummy = 0; + unsigned long dummy = 0; - csp(&dummy, 0, 0); + cspg(&dummy, 0, 0); } /* @@ -54,7 +54,7 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); barrier(); gmap_asce = READ_ONCE(mm->context.gmap_asce); - if (cpu_has_idte() && gmap_asce != -1UL) { + if (gmap_asce != -1UL) { if (gmap_asce) __tlb_flush_idte(gmap_asce); __tlb_flush_idte(mm->context.asce); @@ -68,10 +68,7 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) static inline void __tlb_flush_kernel(void) { - if (cpu_has_idte()) - __tlb_flush_idte(init_mm.context.asce); - else - __tlb_flush_global(); + __tlb_flush_idte(init_mm.context.asce); } static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) @@ -86,7 +83,6 @@ static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) /* * TLB flushing: - 
* flush_tlb() - flushes the current mm struct TLBs * flush_tlb_all() - flushes all processes TLBs * flush_tlb_mm(mm) - flushes the specified mm context TLB's * flush_tlb_page(vma, vmaddr) - flushes one page @@ -102,7 +98,6 @@ static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) * only one user. At the end of the update the flush_tlb_mm and * flush_tlb_range functions need to do the flush. */ -#define flush_tlb() do { } while (0) #define flush_tlb_all() do { } while (0) #define flush_tlb_page(vma, addr) do { } while (0)
diff --git a/arch/s390/include/asm/trace/ap.h b/arch/s390/include/asm/trace/ap.h new file mode 100644 index 000000000000..5c2e6c664b4d --- /dev/null +++ b/arch/s390/include/asm/trace/ap.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Tracepoint definitions for s390 ap bus related trace events + * + * There are two AP bus related tracepoint events defined here: + * The s390_ap_nqap event fires immediately after a request has been + * pushed into the AP firmware queue with the NQAP AP command. + * The s390_ap_dqap event fires immediately after a reply has been + * pulled out of the AP firmware queue via the DQAP AP command. + * Both trace events focus on performance: they measure the runtime + * of a crypto request/reply as close to the firmware level as + * possible. In combination with the two zcrypt tracepoints (see the + * zcrypt.h trace event definition file) this gives measurement data + * about the runtime of a request/reply within the zcrypt and AP bus layer. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM s390 + +#if !defined(_TRACE_S390_AP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_S390_AP_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(s390_ap_nqapdqap_template, + TP_PROTO(u16 card, u16 dom, u32 status, u64 psmid), + TP_ARGS(card, dom, status, psmid), + TP_STRUCT__entry( + __field(u16, card) + __field(u16, dom) + __field(u32, status) + __field(u64, psmid)), + TP_fast_assign( + __entry->card = card; + __entry->dom = dom; + __entry->status = status; + __entry->psmid = psmid;), + TP_printk("card=%u dom=%u status=0x%08x psmid=0x%016lx", + (unsigned short)__entry->card, + (unsigned short)__entry->dom, + (unsigned int)__entry->status, + (unsigned long)__entry->psmid) +); + +/** + * trace_s390_ap_nqap - ap msg nqap tracepoint function + * @card: Crypto card number addressed. + * @dom: Domain within the crypto card addressed. + * @status: AP queue status (GR1 on return of nqap). + * @psmid: Unique id identifying this request/reply. + * + * Called immediately after a request has been enqueued into + * the AP firmware queue with the NQAP command. + */ +DEFINE_EVENT(s390_ap_nqapdqap_template, + s390_ap_nqap, + TP_PROTO(u16 card, u16 dom, u32 status, u64 psmid), + TP_ARGS(card, dom, status, psmid) +); + +/** + * trace_s390_ap_dqap - ap msg dqap tracepoint function + * @card: Crypto card number addressed. + * @dom: Domain within the crypto card addressed. + * @status: AP queue status (GR1 on return of dqap). + * @psmid: Unique id identifying this request/reply. + * + * Called immediately after a reply has been dequeued from * the AP firmware queue with the DQAP command.
+DEFINE_EVENT(s390_ap_nqapdqap_template,
+	     s390_ap_dqap,
+	TP_PROTO(u16 card, u16 dom, u32 status, u64 psmid),
+	TP_ARGS(card, dom, status, psmid)
+);
+
+#endif /* _TRACE_S390_AP_H */
+
+/* This part must be outside protection */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH asm/trace
+#define TRACE_INCLUDE_FILE ap
+
+#include <trace/define_trace.h>
diff --git a/arch/s390/include/asm/trace/zcrypt.h b/arch/s390/include/asm/trace/zcrypt.h
index 457ddaa99e19..bfb01e335f31 100644
--- a/arch/s390/include/asm/trace/zcrypt.h
+++ b/arch/s390/include/asm/trace/zcrypt.h
@@ -2,7 +2,7 @@
 /*
  * Tracepoint definitions for the s390 zcrypt device driver
  *
- * Copyright IBM Corp. 2016
+ * Copyright IBM Corp. 2016,2025
  * Author(s): Harald Freudenberger <freude@de.ibm.com>
  *
  * Currently there are two tracepoint events defined here.
@@ -73,14 +73,15 @@ TRACE_EVENT(s390_zcrypt_req,
 /**
  * trace_s390_zcrypt_rep - zcrypt reply tracepoint function
- * @ptr: Address of the local buffer where the request from userspace
- *	is stored. Can be used as a unique id to match together
- *	request and reply.
- * @fc: Function code.
- * @rc: The bare returncode as returned by the device driver ioctl
- *	function.
- * @dev: The adapter nr where this request was actually processed.
- * @dom: Domain id of the device where this request was processed.
+ * @ptr: Address of the local buffer where the request from userspace
+ *	 is stored. Can be used as a unique id to match together
+ *	 request and reply.
+ * @fc: Function code.
+ * @rc: The bare returncode as returned by the device driver ioctl
+ *	 function.
+ * @card: The adapter nr where this request was actually processed.
+ * @dom: Domain id of the device where this request was processed.
+ * @psmid: Unique id identifying this request/reply.
  *
  * Called upon recognising the reply from the crypto adapter. This
  * message may act as the exit timestamp for the request but also
@@ -88,26 +89,29 @@ TRACE_EVENT(s390_zcrypt_req,
  * and the returncode from the device driver.
  */
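Because the same psmid now appears in both the AP bus and the zcrypt events, a probe can correlate request and reply across both layers. A rough sketch follows; lookup_req_time() is a hypothetical helper that a matching request-side probe would have filled in, and register_trace_s390_zcrypt_rep() is the registration hook generated by the TRACE_EVENT() below.

    #include <linux/ktime.h>
    #include <linux/module.h>

    /* Rough sketch only: pair request and reply by psmid. */
    static void probe_zcrypt_rep(void *data, void *ptr, u32 fc, u32 rc,
    			     u16 card, u16 dom, u64 psmid)
    {
    	u64 t_req = lookup_req_time(psmid);	/* hypothetical helper */

    	if (t_req)
    		pr_info("psmid 0x%016llx fc 0x%04x rc %d: %llu ns\n",
    			(unsigned long long)psmid, fc, (int)rc,
    			(unsigned long long)(ktime_get_ns() - t_req));
    }

    static int __init zcrypt_lat_init(void)
    {
    	return register_trace_s390_zcrypt_rep(probe_zcrypt_rep, NULL);
    }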
TRACE_EVENT(s390_zcrypt_rep,
-	TP_PROTO(void *ptr, u32 fc, u32 rc, u16 dev, u16 dom),
-	TP_ARGS(ptr, fc, rc, dev, dom),
+	TP_PROTO(void *ptr, u32 fc, u32 rc, u16 card, u16 dom, u64 psmid),
+	TP_ARGS(ptr, fc, rc, card, dom, psmid),
 	TP_STRUCT__entry(
 		__field(void *, ptr)
 		__field(u32, fc)
 		__field(u32, rc)
-		__field(u16, device)
-		__field(u16, domain)),
+		__field(u16, card)
+		__field(u16, dom)
+		__field(u64, psmid)),
 	TP_fast_assign(
 		__entry->ptr = ptr;
 		__entry->fc = fc;
 		__entry->rc = rc;
-		__entry->device = dev;
-		__entry->domain = dom;),
-	TP_printk("ptr=%p fc=0x%04x rc=%d dev=0x%02hx domain=0x%04hx",
+		__entry->card = card;
+		__entry->dom = dom;
+		__entry->psmid = psmid;),
+	TP_printk("ptr=%p fc=0x%04x rc=%d card=%u dom=%u psmid=0x%016lx",
 		  __entry->ptr,
-		  (unsigned int) __entry->fc,
-		  (int) __entry->rc,
-		  (unsigned short) __entry->device,
-		  (unsigned short) __entry->domain)
+		  (unsigned int)__entry->fc,
+		  (int)__entry->rc,
+		  (unsigned short)__entry->card,
+		  (unsigned short)__entry->dom,
+		  (unsigned long)__entry->psmid)
 );

 #endif /* _TRACE_S390_ZCRYPT_H */
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index 70fc671397da..921c3fb3586b 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -8,7 +8,8 @@
 #define _ASM_S390_UNISTD_H_

 #include <uapi/asm/unistd.h>
-#include <asm/unistd_nr.h>
+
+#define NR_syscalls (__NR_syscalls)

 #define __ARCH_WANT_NEW_STAT
 #define __ARCH_WANT_OLD_READDIR
@@ -27,11 +28,6 @@
 #define __ARCH_WANT_SYS_OLDUMOUNT
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
-# ifdef CONFIG_COMPAT
-# define __ARCH_WANT_COMPAT_STAT
-# define __ARCH_WANT_SYS_TIME32
-# define __ARCH_WANT_SYS_UTIME32
-# endif
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_VFORK
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/s390/include/asm/vdso-symbols.h b/arch/s390/include/asm/vdso-symbols.h
index 0df17574d788..e3561e67c4e3 100644
--- a/arch/s390/include/asm/vdso-symbols.h
+++ b/arch/s390/include/asm/vdso-symbols.h
@@ -2,16 +2,8 @@
 #ifndef __S390_VDSO_SYMBOLS_H__
 #define __S390_VDSO_SYMBOLS_H__

-#include <generated/vdso64-offsets.h>
-#ifdef CONFIG_COMPAT
-#include <generated/vdso32-offsets.h>
-#endif
+#include <generated/vdso-offsets.h>

-#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
-#ifdef CONFIG_COMPAT
-#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name))
-#else
-#define VDSO32_SYMBOL(tsk, name) (-1UL)
-#endif
+#define VDSO_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso_offset_##name))

 #endif /* __S390_VDSO_SYMBOLS_H__ */
diff --git a/arch/s390/include/uapi/asm/bitsperlong.h b/arch/s390/include/uapi/asm/bitsperlong.h
index cceaf47b021a..7af27a985f25 100644
--- a/arch/s390/include/uapi/asm/bitsperlong.h
+++ b/arch/s390/include/uapi/asm/bitsperlong.h
@@ -2,11 +2,7 @@
 #ifndef __ASM_S390_BITSPERLONG_H
 #define __ASM_S390_BITSPERLONG_H

-#ifndef __s390x__
-#define __BITS_PER_LONG 32
-#else
 #define __BITS_PER_LONG 64
-#endif

 #include <asm-generic/bitsperlong.h>
diff --git a/arch/s390/include/uapi/asm/ipcbuf.h b/arch/s390/include/uapi/asm/ipcbuf.h
index 1030cd186899..9277e76d6d72 100644
--- a/arch/s390/include/uapi/asm/ipcbuf.h
+++ b/arch/s390/include/uapi/asm/ipcbuf.h
@@ -24,9 +24,6 @@ struct ipc64_perm
 	__kernel_mode_t	mode;
 	unsigned short	__pad1;
 	unsigned short	seq;
-#ifndef __s390x__
-	unsigned short	__pad2;
-#endif /* !
__s390x__ */ unsigned long __unused1; unsigned long __unused2; }; diff --git a/arch/s390/include/uapi/asm/posix_types.h b/arch/s390/include/uapi/asm/posix_types.h index 1913613e71b6..ad5ab940d192 100644 --- a/arch/s390/include/uapi/asm/posix_types.h +++ b/arch/s390/include/uapi/asm/posix_types.h @@ -26,17 +26,6 @@ typedef unsigned short __kernel_old_gid_t; #define __kernel_old_uid_t __kernel_old_uid_t #endif -#ifndef __s390x__ - -typedef unsigned long __kernel_ino_t; -typedef unsigned short __kernel_mode_t; -typedef unsigned short __kernel_ipc_pid_t; -typedef unsigned short __kernel_uid_t; -typedef unsigned short __kernel_gid_t; -typedef int __kernel_ptrdiff_t; - -#else /* __s390x__ */ - typedef unsigned int __kernel_ino_t; typedef unsigned int __kernel_mode_t; typedef int __kernel_ipc_pid_t; @@ -45,8 +34,6 @@ typedef unsigned int __kernel_gid_t; typedef long __kernel_ptrdiff_t; typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ -#endif /* __s390x__ */ - #define __kernel_ino_t __kernel_ino_t #define __kernel_mode_t __kernel_mode_t #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h index ea202072f1ad..ea29ba470e5a 100644 --- a/arch/s390/include/uapi/asm/ptrace.h +++ b/arch/s390/include/uapi/asm/ptrace.h @@ -14,94 +14,6 @@ * Offsets in the user_regs_struct. They are used for the ptrace * system call and in entry.S */ -#ifndef __s390x__ - -#define PT_PSWMASK 0x00 -#define PT_PSWADDR 0x04 -#define PT_GPR0 0x08 -#define PT_GPR1 0x0C -#define PT_GPR2 0x10 -#define PT_GPR3 0x14 -#define PT_GPR4 0x18 -#define PT_GPR5 0x1C -#define PT_GPR6 0x20 -#define PT_GPR7 0x24 -#define PT_GPR8 0x28 -#define PT_GPR9 0x2C -#define PT_GPR10 0x30 -#define PT_GPR11 0x34 -#define PT_GPR12 0x38 -#define PT_GPR13 0x3C -#define PT_GPR14 0x40 -#define PT_GPR15 0x44 -#define PT_ACR0 0x48 -#define PT_ACR1 0x4C -#define PT_ACR2 0x50 -#define PT_ACR3 0x54 -#define PT_ACR4 0x58 -#define PT_ACR5 0x5C -#define PT_ACR6 0x60 -#define PT_ACR7 0x64 -#define PT_ACR8 0x68 -#define PT_ACR9 0x6C -#define PT_ACR10 0x70 -#define PT_ACR11 0x74 -#define PT_ACR12 0x78 -#define PT_ACR13 0x7C -#define PT_ACR14 0x80 -#define PT_ACR15 0x84 -#define PT_ORIGGPR2 0x88 -#define PT_FPC 0x90 -/* - * A nasty fact of life that the ptrace api - * only supports passing of longs. 
- */ -#define PT_FPR0_HI 0x98 -#define PT_FPR0_LO 0x9C -#define PT_FPR1_HI 0xA0 -#define PT_FPR1_LO 0xA4 -#define PT_FPR2_HI 0xA8 -#define PT_FPR2_LO 0xAC -#define PT_FPR3_HI 0xB0 -#define PT_FPR3_LO 0xB4 -#define PT_FPR4_HI 0xB8 -#define PT_FPR4_LO 0xBC -#define PT_FPR5_HI 0xC0 -#define PT_FPR5_LO 0xC4 -#define PT_FPR6_HI 0xC8 -#define PT_FPR6_LO 0xCC -#define PT_FPR7_HI 0xD0 -#define PT_FPR7_LO 0xD4 -#define PT_FPR8_HI 0xD8 -#define PT_FPR8_LO 0XDC -#define PT_FPR9_HI 0xE0 -#define PT_FPR9_LO 0xE4 -#define PT_FPR10_HI 0xE8 -#define PT_FPR10_LO 0xEC -#define PT_FPR11_HI 0xF0 -#define PT_FPR11_LO 0xF4 -#define PT_FPR12_HI 0xF8 -#define PT_FPR12_LO 0xFC -#define PT_FPR13_HI 0x100 -#define PT_FPR13_LO 0x104 -#define PT_FPR14_HI 0x108 -#define PT_FPR14_LO 0x10C -#define PT_FPR15_HI 0x110 -#define PT_FPR15_LO 0x114 -#define PT_CR_9 0x118 -#define PT_CR_10 0x11C -#define PT_CR_11 0x120 -#define PT_IEEE_IP 0x13C -#define PT_LASTOFF PT_IEEE_IP -#define PT_ENDREGS 0x140-1 - -#define GPR_SIZE 4 -#define CR_SIZE 4 - -#define STACK_FRAME_OVERHEAD 96 /* size of minimum stack frame */ - -#else /* __s390x__ */ - #define PT_PSWMASK 0x00 #define PT_PSWADDR 0x08 #define PT_GPR0 0x10 @@ -166,38 +78,6 @@ #define STACK_FRAME_OVERHEAD 160 /* size of minimum stack frame */ -#endif /* __s390x__ */ - -#ifndef __s390x__ - -#define PSW_MASK_PER _AC(0x40000000, UL) -#define PSW_MASK_DAT _AC(0x04000000, UL) -#define PSW_MASK_IO _AC(0x02000000, UL) -#define PSW_MASK_EXT _AC(0x01000000, UL) -#define PSW_MASK_KEY _AC(0x00F00000, UL) -#define PSW_MASK_BASE _AC(0x00080000, UL) /* always one */ -#define PSW_MASK_MCHECK _AC(0x00040000, UL) -#define PSW_MASK_WAIT _AC(0x00020000, UL) -#define PSW_MASK_PSTATE _AC(0x00010000, UL) -#define PSW_MASK_ASC _AC(0x0000C000, UL) -#define PSW_MASK_CC _AC(0x00003000, UL) -#define PSW_MASK_PM _AC(0x00000F00, UL) -#define PSW_MASK_RI _AC(0x00000000, UL) -#define PSW_MASK_EA _AC(0x00000000, UL) -#define PSW_MASK_BA _AC(0x00000000, UL) - -#define PSW_MASK_USER _AC(0x0000FF00, UL) - -#define PSW_ADDR_AMODE _AC(0x80000000, UL) -#define PSW_ADDR_INSN _AC(0x7FFFFFFF, UL) - -#define PSW_ASC_PRIMARY _AC(0x00000000, UL) -#define PSW_ASC_ACCREG _AC(0x00004000, UL) -#define PSW_ASC_SECONDARY _AC(0x00008000, UL) -#define PSW_ASC_HOME _AC(0x0000C000, UL) - -#else /* __s390x__ */ - #define PSW_MASK_PER _AC(0x4000000000000000, UL) #define PSW_MASK_DAT _AC(0x0400000000000000, UL) #define PSW_MASK_IO _AC(0x0200000000000000, UL) @@ -224,8 +104,6 @@ #define PSW_ASC_SECONDARY _AC(0x0000800000000000, UL) #define PSW_ASC_HOME _AC(0x0000C00000000000, UL) -#endif /* __s390x__ */ - #define NUM_GPRS 16 #define NUM_FPRS 16 #define NUM_CRS 16 @@ -308,9 +186,7 @@ typedef struct { #define PER_EM_MASK 0xE8000000UL typedef struct { -#ifdef __s390x__ unsigned : 32; -#endif /* __s390x__ */ unsigned em_branching : 1; unsigned em_instruction_fetch : 1; /* diff --git a/arch/s390/include/uapi/asm/sigcontext.h b/arch/s390/include/uapi/asm/sigcontext.h index 8b35033334c4..7c90b30c50fd 100644 --- a/arch/s390/include/uapi/asm/sigcontext.h +++ b/arch/s390/include/uapi/asm/sigcontext.h @@ -17,24 +17,12 @@ #define __NUM_VXRS_LOW 16 #define __NUM_VXRS_HIGH 16 -#ifndef __s390x__ - -/* Has to be at least _NSIG_WORDS from asm/signal.h */ -#define _SIGCONTEXT_NSIG 64 -#define _SIGCONTEXT_NSIG_BPW 32 -/* Size of stack frame allocated when calling signal handler. 
*/ -#define __SIGNAL_FRAMESIZE 96 - -#else /* __s390x__ */ - /* Has to be at least _NSIG_WORDS from asm/signal.h */ #define _SIGCONTEXT_NSIG 64 #define _SIGCONTEXT_NSIG_BPW 64 /* Size of stack frame allocated when calling signal handler. */ #define __SIGNAL_FRAMESIZE 160 -#endif /* __s390x__ */ - #define _SIGCONTEXT_NSIG_WORDS (_SIGCONTEXT_NSIG / _SIGCONTEXT_NSIG_BPW) #define _SIGMASK_COPY_SIZE (sizeof(unsigned long)*_SIGCONTEXT_NSIG_WORDS) @@ -66,9 +54,6 @@ typedef struct typedef struct { -#ifndef __s390x__ - unsigned long gprs_high[__NUM_GPRS]; -#endif unsigned long long vxrs_low[__NUM_VXRS_LOW]; __vector128 vxrs_high[__NUM_VXRS_HIGH]; unsigned char __reserved[128]; diff --git a/arch/s390/include/uapi/asm/stat.h b/arch/s390/include/uapi/asm/stat.h index ac253d23606b..21599298c2f5 100644 --- a/arch/s390/include/uapi/asm/stat.h +++ b/arch/s390/include/uapi/asm/stat.h @@ -8,74 +8,6 @@ #ifndef _S390_STAT_H #define _S390_STAT_H -#ifndef __s390x__ -struct __old_kernel_stat { - unsigned short st_dev; - unsigned short st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - unsigned short st_rdev; - unsigned long st_size; - unsigned long st_atime; - unsigned long st_mtime; - unsigned long st_ctime; -}; - -struct stat { - unsigned short st_dev; - unsigned short __pad1; - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - unsigned short st_rdev; - unsigned short __pad2; - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused4; - unsigned long __unused5; -}; - -/* This matches struct stat64 in glibc2.1, hence the absolutely - * insane amounts of padding around dev_t's. - */ -struct stat64 { - unsigned long long st_dev; - unsigned int __pad1; -#define STAT64_HAS_BROKEN_ST_INO 1 - unsigned long __st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned long st_uid; - unsigned long st_gid; - unsigned long long st_rdev; - unsigned int __pad3; - long long st_size; - unsigned long st_blksize; - unsigned char __pad4[4]; - unsigned long __pad5; /* future possible st_blocks high bits */ - unsigned long st_blocks; /* Number 512-byte blocks allocated. 
*/ - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; /* will be high 32 bits of ctime someday */ - unsigned long long st_ino; -}; - -#else /* __s390x__ */ - struct stat { unsigned long st_dev; unsigned long st_ino; @@ -97,8 +29,6 @@ struct stat { unsigned long __unused[3]; }; -#endif /* __s390x__ */ - #define STAT_HAVE_NSEC 1 #endif diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index 01b5fe8b9db6..b0c5afe19db2 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -8,10 +8,6 @@ #ifndef _UAPI_ASM_S390_UNISTD_H_ #define _UAPI_ASM_S390_UNISTD_H_ -#ifdef __s390x__ #include <asm/unistd_64.h> -#else -#include <asm/unistd_32.h> -#endif #endif /* _UAPI_ASM_S390_UNISTD_H_ */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index eb06ff888314..42c83d60d6fa 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -36,7 +36,7 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -obj-y := head64.o traps.o time.o process.o early.o setup.o idle.o vtime.o +obj-y := head.o traps.o time.o process.o early.o setup.o idle.o vtime.o obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o vdso.o cpufeature.o obj-y += sysinfo.o lgr.o os_info.o ctlreg.o @@ -56,9 +56,6 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o hiperdispatch.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AUDIT) += audit.o -compat-obj-$(CONFIG_AUDIT) += compat_audit.o -obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o -obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KPROBES) += mcount.o @@ -70,7 +67,7 @@ obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o - +obj-$(CONFIG_STACKPROTECTOR) += stackprotector.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o obj-$(CONFIG_CERT_STORE) += cert_store.o @@ -79,10 +76,9 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai.o obj-$(CONFIG_TRACEPOINTS) += trace.o # vdso -obj-y += vdso64/ -obj-$(CONFIG_COMPAT) += vdso32/ +obj-y += vdso/ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index a8915663e917..cfe27f6579e3 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -21,6 +21,9 @@ int main(void) OFFSET(__TASK_stack, task_struct, stack); OFFSET(__TASK_thread, task_struct, thread); OFFSET(__TASK_pid, task_struct, pid); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(__TASK_stack_canary, task_struct, stack_canary); +#endif BLANK(); /* thread struct offsets */ OFFSET(__THREAD_ksp, thread_struct, ksp); @@ -139,6 +142,7 @@ int main(void) OFFSET(__LC_CURRENT_PID, lowcore, current_pid); OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ + 
OFFSET(__LC_STACK_CANARY, lowcore, stack_canary); OFFSET(__LC_DUMP_REIPL, lowcore, ipib); OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info); OFFSET(__LC_OS_INFO, lowcore, os_info); diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c index 02051a596b87..7897d9411e13 100644 --- a/arch/s390/kernel/audit.c +++ b/arch/s390/kernel/audit.c @@ -3,7 +3,6 @@ #include <linux/types.h> #include <linux/audit.h> #include <asm/unistd.h> -#include "audit.h" static unsigned dir_class[] = { #include <asm-generic/audit_dir_write.h> @@ -32,19 +31,11 @@ static unsigned signal_class[] = { int audit_classify_arch(int arch) { -#ifdef CONFIG_COMPAT - if (arch == AUDIT_ARCH_S390) - return 1; -#endif return 0; } int audit_classify_syscall(int abi, unsigned syscall) { -#ifdef CONFIG_COMPAT - if (abi == AUDIT_ARCH_S390) - return s390_classify_syscall(syscall); -#endif switch(syscall) { case __NR_open: return AUDITSC_OPEN; @@ -63,13 +54,6 @@ int audit_classify_syscall(int abi, unsigned syscall) static int __init audit_classes_init(void) { -#ifdef CONFIG_COMPAT - audit_register_class(AUDIT_CLASS_WRITE_32, s390_write_class); - audit_register_class(AUDIT_CLASS_READ_32, s390_read_class); - audit_register_class(AUDIT_CLASS_DIR_WRITE_32, s390_dir_class); - audit_register_class(AUDIT_CLASS_CHATTR_32, s390_chattr_class); - audit_register_class(AUDIT_CLASS_SIGNAL_32, s390_signal_class); -#endif audit_register_class(AUDIT_CLASS_WRITE, write_class); audit_register_class(AUDIT_CLASS_READ, read_class); audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); diff --git a/arch/s390/kernel/audit.h b/arch/s390/kernel/audit.h deleted file mode 100644 index 4d4b596412ec..000000000000 --- a/arch/s390/kernel/audit.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARCH_S390_KERNEL_AUDIT_H -#define __ARCH_S390_KERNEL_AUDIT_H - -#include <linux/types.h> - -#ifdef CONFIG_COMPAT -extern int s390_classify_syscall(unsigned); -extern __u32 s390_dir_class[]; -extern __u32 s390_write_class[]; -extern __u32 s390_read_class[]; -extern __u32 s390_chattr_class[]; -extern __u32 s390_signal_class[]; -#endif /* CONFIG_COMPAT */ - -#endif /* __ARCH_S390_KERNEL_AUDIT_H */ diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c deleted file mode 100644 index a7c46e8310f0..000000000000 --- a/arch/s390/kernel/compat_audit.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#undef __s390x__ -#include <linux/audit_arch.h> -#include <asm/unistd.h> -#include "audit.h" - -unsigned s390_dir_class[] = { -#include <asm-generic/audit_dir_write.h> -~0U -}; - -unsigned s390_chattr_class[] = { -#include <asm-generic/audit_change_attr.h> -~0U -}; - -unsigned s390_write_class[] = { -#include <asm-generic/audit_write.h> -~0U -}; - -unsigned s390_read_class[] = { -#include <asm-generic/audit_read.h> -~0U -}; - -unsigned s390_signal_class[] = { -#include <asm-generic/audit_signal.h> -~0U -}; - -int s390_classify_syscall(unsigned syscall) -{ - switch(syscall) { - case __NR_open: - return AUDITSC_OPEN; - case __NR_openat: - return AUDITSC_OPENAT; - case __NR_socketcall: - return AUDITSC_SOCKETCALL; - case __NR_execve: - return AUDITSC_EXECVE; - case __NR_openat2: - return AUDITSC_OPENAT2; - default: - return AUDITSC_COMPAT; - } -} diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c deleted file mode 100644 index f9d418d1b619..000000000000 --- a/arch/s390/kernel/compat_linux.c +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * S390 version - * 
Copyright IBM Corp. 2000 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Gerhard Tonn (ton@de.ibm.com) - * Thomas Spatzier (tspat@de.ibm.com) - * - * Conversion between 31bit and 64bit native syscalls. - * - * Heavily inspired by the 32-bit Sparc compat code which is - * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) - * - */ - - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/file.h> -#include <linux/signal.h> -#include <linux/resource.h> -#include <linux/times.h> -#include <linux/smp.h> -#include <linux/sem.h> -#include <linux/msg.h> -#include <linux/shm.h> -#include <linux/uio.h> -#include <linux/quota.h> -#include <linux/poll.h> -#include <linux/personality.h> -#include <linux/stat.h> -#include <linux/filter.h> -#include <linux/highmem.h> -#include <linux/mman.h> -#include <linux/ipv6.h> -#include <linux/in.h> -#include <linux/icmpv6.h> -#include <linux/syscalls.h> -#include <linux/sysctl.h> -#include <linux/binfmts.h> -#include <linux/capability.h> -#include <linux/compat.h> -#include <linux/vfs.h> -#include <linux/ptrace.h> -#include <linux/fadvise.h> -#include <linux/ipc.h> -#include <linux/slab.h> - -#include <asm/types.h> -#include <linux/uaccess.h> - -#include <net/scm.h> -#include <net/sock.h> - -#include "compat_linux.h" - -#ifdef CONFIG_SYSVIPC -COMPAT_SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, compat_ulong_t, second, - compat_ulong_t, third, compat_uptr_t, ptr) -{ - if (call >> 16) /* hack for backward compatibility */ - return -EINVAL; - return compat_ksys_ipc(call, first, second, third, ptr, third); -} -#endif - -COMPAT_SYSCALL_DEFINE3(s390_truncate64, const char __user *, path, u32, high, u32, low) -{ - return ksys_truncate(path, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE3(s390_ftruncate64, unsigned int, fd, u32, high, u32, low) -{ - return ksys_ftruncate(fd, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE5(s390_pread64, unsigned int, fd, char __user *, ubuf, - compat_size_t, count, u32, high, u32, low) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - return ksys_pread64(fd, ubuf, count, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE5(s390_pwrite64, unsigned int, fd, const char __user *, ubuf, - compat_size_t, count, u32, high, u32, low) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - return ksys_pwrite64(fd, ubuf, count, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE4(s390_readahead, int, fd, u32, high, u32, low, s32, count) -{ - return ksys_readahead(fd, (unsigned long)high << 32 | low, count); -} - -struct stat64_emu31 { - unsigned long long st_dev; - unsigned int __pad1; -#define STAT64_HAS_BROKEN_ST_INO 1 - u32 __st_ino; - unsigned int st_mode; - unsigned int st_nlink; - u32 st_uid; - u32 st_gid; - unsigned long long st_rdev; - unsigned int __pad3; - long st_size; - u32 st_blksize; - unsigned char __pad4[4]; - u32 __pad5; /* future possible st_blocks high bits */ - u32 st_blocks; /* Number 512-byte blocks allocated. 
*/ - u32 st_atime; - u32 __pad6; - u32 st_mtime; - u32 __pad7; - u32 st_ctime; - u32 __pad8; /* will be high 32 bits of ctime someday */ - unsigned long st_ino; -}; - -static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat) -{ - struct stat64_emu31 tmp; - - memset(&tmp, 0, sizeof(tmp)); - - tmp.st_dev = huge_encode_dev(stat->dev); - tmp.st_ino = stat->ino; - tmp.__st_ino = (u32)stat->ino; - tmp.st_mode = stat->mode; - tmp.st_nlink = (unsigned int)stat->nlink; - tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); - tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); - tmp.st_rdev = huge_encode_dev(stat->rdev); - tmp.st_size = stat->size; - tmp.st_blksize = (u32)stat->blksize; - tmp.st_blocks = (u32)stat->blocks; - tmp.st_atime = (u32)stat->atime.tv_sec; - tmp.st_mtime = (u32)stat->mtime.tv_sec; - tmp.st_ctime = (u32)stat->ctime.tv_sec; - - return copy_to_user(ubuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; -} - -COMPAT_SYSCALL_DEFINE2(s390_stat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_stat(filename, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE2(s390_lstat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_lstat(filename, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE2(s390_fstat64, unsigned int, fd, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_fstat(fd, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE4(s390_fstatat64, unsigned int, dfd, const char __user *, filename, - struct stat64_emu31 __user *, statbuf, int, flag) -{ - struct kstat stat; - int error; - - error = vfs_fstatat(dfd, filename, &stat, flag); - if (error) - return error; - return cp_stat64(statbuf, &stat); -} - -/* - * Linux/i386 didn't use to be able to handle more than - * 4 system call parameters, so these system calls used a memory - * block for parameter passing.. - */ - -struct mmap_arg_struct_emu31 { - compat_ulong_t addr; - compat_ulong_t len; - compat_ulong_t prot; - compat_ulong_t flags; - compat_ulong_t fd; - compat_ulong_t offset; -}; - -COMPAT_SYSCALL_DEFINE1(s390_old_mmap, struct mmap_arg_struct_emu31 __user *, arg) -{ - struct mmap_arg_struct_emu31 a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - if (a.offset & ~PAGE_MASK) - return -EINVAL; - return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); -} - -COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) -{ - struct mmap_arg_struct_emu31 a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); -} - -COMPAT_SYSCALL_DEFINE3(s390_read, unsigned int, fd, char __user *, buf, compat_size_t, count) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - - return ksys_read(fd, buf, count); -} - -COMPAT_SYSCALL_DEFINE3(s390_write, unsigned int, fd, const char __user *, buf, compat_size_t, count) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - - return ksys_write(fd, buf, count); -} - -/* - * 31 bit emulation wrapper functions for sys_fadvise64/fadvise64_64. - * These need to rewrite the advise values for POSIX_FADV_{DONTNEED,NOREUSE} - * because the 31 bit values differ from the 64 bit values. 
- */ - -COMPAT_SYSCALL_DEFINE5(s390_fadvise64, int, fd, u32, high, u32, low, compat_size_t, len, int, advise) -{ - if (advise == 4) - advise = POSIX_FADV_DONTNEED; - else if (advise == 5) - advise = POSIX_FADV_NOREUSE; - return ksys_fadvise64_64(fd, (unsigned long)high << 32 | low, len, - advise); -} - -struct fadvise64_64_args { - int fd; - long long offset; - long long len; - int advice; -}; - -COMPAT_SYSCALL_DEFINE1(s390_fadvise64_64, struct fadvise64_64_args __user *, args) -{ - struct fadvise64_64_args a; - - if ( copy_from_user(&a, args, sizeof(a)) ) - return -EFAULT; - if (a.advice == 4) - a.advice = POSIX_FADV_DONTNEED; - else if (a.advice == 5) - a.advice = POSIX_FADV_NOREUSE; - return ksys_fadvise64_64(a.fd, a.offset, a.len, a.advice); -} - -COMPAT_SYSCALL_DEFINE6(s390_sync_file_range, int, fd, u32, offhigh, u32, offlow, - u32, nhigh, u32, nlow, unsigned int, flags) -{ - return ksys_sync_file_range(fd, ((loff_t)offhigh << 32) + offlow, - ((u64)nhigh << 32) + nlow, flags); -} - -COMPAT_SYSCALL_DEFINE6(s390_fallocate, int, fd, int, mode, u32, offhigh, u32, offlow, - u32, lenhigh, u32, lenlow) -{ - return ksys_fallocate(fd, mode, ((loff_t)offhigh << 32) + offlow, - ((u64)lenhigh << 32) + lenlow); -} diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h deleted file mode 100644 index ef23739b277c..000000000000 --- a/arch/s390/kernel/compat_linux.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390X_S390_H -#define _ASM_S390X_S390_H - -#include <linux/compat.h> -#include <linux/socket.h> -#include <linux/syscalls.h> -#include <asm/ptrace.h> - -/* - * Macro that masks the high order bit of a 32 bit pointer and - * converts it to a 64 bit pointer. - */ -#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) -#define AA(__x) ((unsigned long)(__x)) - -/* Now 32bit compatibility types */ -struct ipc_kludge_32 { - __u32 msgp; /* pointer */ - __s32 msgtyp; -}; - -/* asm/sigcontext.h */ -typedef union { - __u64 d; - __u32 f; -} freg_t32; - -typedef struct { - unsigned int fpc; - unsigned int pad; - freg_t32 fprs[__NUM_FPRS]; -} _s390_fp_regs32; - -typedef struct { - psw_t32 psw; - __u32 gprs[__NUM_GPRS]; - __u32 acrs[__NUM_ACRS]; -} _s390_regs_common32; - -typedef struct { - _s390_regs_common32 regs; - _s390_fp_regs32 fpregs; -} _sigregs32; - -typedef struct { - __u32 gprs_high[__NUM_GPRS]; - __u64 vxrs_low[__NUM_VXRS_LOW]; - __vector128 vxrs_high[__NUM_VXRS_HIGH]; - __u8 __reserved[128]; -} _sigregs_ext32; - -#define _SIGCONTEXT_NSIG32 64 -#define _SIGCONTEXT_NSIG_BPW32 32 -#define __SIGNAL_FRAMESIZE32 96 -#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2) - -struct sigcontext32 { - __u32 oldmask[_COMPAT_NSIG_WORDS]; - __u32 sregs; /* pointer */ -}; - -/* asm/signal.h */ - -/* asm/ucontext.h */ -struct ucontext32 { - __u32 uc_flags; - __u32 uc_link; /* pointer */ - compat_stack_t uc_stack; - _sigregs32 uc_mcontext; - compat_sigset_t uc_sigmask; - /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. 
*/ - unsigned char __unused[128 - sizeof(compat_sigset_t)]; - _sigregs_ext32 uc_mcontext_ext; -}; - -struct stat64_emu31; -struct mmap_arg_struct_emu31; -struct fadvise64_64_args; - -long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); -long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); -long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); -long compat_sys_s390_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, u32 high, u32 low); -long compat_sys_s390_readahead(int fd, u32 high, u32 low, s32 count); -long compat_sys_s390_stat64(const char __user *filename, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_lstat64(const char __user *filename, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); -long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count); -long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count); -long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); -long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); -long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); -long compat_sys_s390_fallocate(int fd, int mode, u32 offhigh, u32 offlow, u32 lenhigh, u32 lenlow); -long compat_sys_sigreturn(void); -long compat_sys_rt_sigreturn(void); - -#endif /* _ASM_S390X_S390_H */ diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h deleted file mode 100644 index 3c400fc7e987..000000000000 --- a/arch/s390/kernel/compat_ptrace.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PTRACE32_H -#define _PTRACE32_H - -#include <asm/ptrace.h> /* needed for NUM_CR_WORDS */ -#include "compat_linux.h" /* needed for psw_compat_t */ - -struct compat_per_struct_kernel { - __u32 cr9; /* PER control bits */ - __u32 cr10; /* PER starting address */ - __u32 cr11; /* PER ending address */ - __u32 bits; /* Obsolete software bits */ - __u32 starting_addr; /* User specified start address */ - __u32 ending_addr; /* User specified end address */ - __u16 perc_atmid; /* PER trap ATMID */ - __u32 address; /* PER trap instruction address */ - __u8 access_id; /* PER trap access identification */ -}; - -struct compat_user_regs_struct -{ - psw_compat_t psw; - u32 gprs[NUM_GPRS]; - u32 acrs[NUM_ACRS]; - u32 orig_gpr2; - /* nb: there's a 4-byte hole here */ - s390_fp_regs fp_regs; - /* - * These per registers are in here so that gdb can modify them - * itself as there is no "official" ptrace interface for hardware - * watchpoints. This is the way intel does it. - */ - struct compat_per_struct_kernel per_info; - u32 ieee_instruction_pointer; /* obsolete, always 0 */ -}; - -struct compat_user { - /* We start with the registers, to mimic the way that "memory" - is returned from the ptrace(3,...) function. */ - struct compat_user_regs_struct regs; - /* The rest of this junk is to help gdb figure out what goes where */ - u32 u_tsize; /* Text segment size (pages). */ - u32 u_dsize; /* Data segment size (pages). 
*/ - u32 u_ssize; /* Stack segment size (pages). */ - u32 start_code; /* Starting virtual address of text. */ - u32 start_stack; /* Starting virtual address of stack area. - This is actually the bottom of the stack, - the top of the stack is always found in the - esp register. */ - s32 signal; /* Signal that caused the core dump. */ - u32 u_ar0; /* Used by gdb to help find the values for */ - /* the registers. */ - u32 magic; /* To uniquely identify a core file */ - char u_comm[32]; /* User command that was responsible */ -}; - -typedef struct -{ - __u32 len; - __u32 kernel_addr; - __u32 process_addr; -} compat_ptrace_area; - -#endif /* _PTRACE32_H */ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c deleted file mode 100644 index 5a86b9d1da71..000000000000 --- a/arch/s390/kernel/compat_signal.c +++ /dev/null @@ -1,420 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corp. 2000, 2006 - * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) - * Gerhard Tonn (ton@de.ibm.com) - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - */ - -#include <linux/compat.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/tty.h> -#include <linux/personality.h> -#include <linux/binfmts.h> -#include <asm/vdso-symbols.h> -#include <asm/access-regs.h> -#include <asm/ucontext.h> -#include <linux/uaccess.h> -#include <asm/lowcore.h> -#include <asm/fpu.h> -#include "compat_linux.h" -#include "compat_ptrace.h" -#include "entry.h" - -typedef struct -{ - __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; - struct sigcontext32 sc; - _sigregs32 sregs; - int signo; - _sigregs_ext32 sregs_ext; - __u16 svc_insn; /* Offset of svc_insn is NOT fixed! 
*/ -} sigframe32; - -typedef struct -{ - __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; - __u16 svc_insn; - compat_siginfo_t info; - struct ucontext32 uc; -} rt_sigframe32; - -/* Store registers needed to create the signal frame */ -static void store_sigregs(void) -{ - save_access_regs(current->thread.acrs); - save_user_fpu_regs(); -} - -/* Load registers after signal return */ -static void load_sigregs(void) -{ - restore_access_regs(current->thread.acrs); -} - -static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) -{ - _sigregs32 user_sregs; - int i; - - user_sregs.regs.psw.mask = (__u32)(regs->psw.mask >> 32); - user_sregs.regs.psw.mask &= PSW32_MASK_USER | PSW32_MASK_RI; - user_sregs.regs.psw.mask |= PSW32_USER_BITS; - user_sregs.regs.psw.addr = (__u32) regs->psw.addr | - (__u32)(regs->psw.mask & PSW_MASK_BA); - for (i = 0; i < NUM_GPRS; i++) - user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; - memcpy(&user_sregs.regs.acrs, current->thread.acrs, - sizeof(user_sregs.regs.acrs)); - fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.ufpu); - if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) - return -EFAULT; - return 0; -} - -static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) -{ - _sigregs32 user_sregs; - int i; - - /* Always make any pending restarted system call return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) - return -EFAULT; - - if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) - return -EINVAL; - - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ - regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | - (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 | - (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_RI) << 32 | - (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_AMODE); - /* Check for invalid user address space control. 
*/ - if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_HOME) - regs->psw.mask = PSW_ASC_PRIMARY | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_INSN); - for (i = 0; i < NUM_GPRS; i++) - regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; - memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, - sizeof(current->thread.acrs)); - fpregs_load((_s390_fp_regs *)&user_sregs.fpregs, ¤t->thread.ufpu); - - clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - return 0; -} - -static int save_sigregs_ext32(struct pt_regs *regs, - _sigregs_ext32 __user *sregs_ext) -{ - __u32 gprs_high[NUM_GPRS]; - __u64 vxrs[__NUM_VXRS_LOW]; - int i; - - /* Save high gprs to signal stack */ - for (i = 0; i < NUM_GPRS; i++) - gprs_high[i] = regs->gprs[i] >> 32; - if (__copy_to_user(&sregs_ext->gprs_high, &gprs_high, - sizeof(sregs_ext->gprs_high))) - return -EFAULT; - - /* Save vector registers to signal stack */ - if (cpu_has_vx()) { - for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = current->thread.ufpu.vxrs[i].low; - if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, - sizeof(sregs_ext->vxrs_low)) || - __copy_to_user(&sregs_ext->vxrs_high, - current->thread.ufpu.vxrs + __NUM_VXRS_LOW, - sizeof(sregs_ext->vxrs_high))) - return -EFAULT; - } - return 0; -} - -static int restore_sigregs_ext32(struct pt_regs *regs, - _sigregs_ext32 __user *sregs_ext) -{ - __u32 gprs_high[NUM_GPRS]; - __u64 vxrs[__NUM_VXRS_LOW]; - int i; - - /* Restore high gprs from signal stack */ - if (__copy_from_user(&gprs_high, &sregs_ext->gprs_high, - sizeof(sregs_ext->gprs_high))) - return -EFAULT; - for (i = 0; i < NUM_GPRS; i++) - *(__u32 *)®s->gprs[i] = gprs_high[i]; - - /* Restore vector registers from signal stack */ - if (cpu_has_vx()) { - if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, - sizeof(sregs_ext->vxrs_low)) || - __copy_from_user(current->thread.ufpu.vxrs + __NUM_VXRS_LOW, - &sregs_ext->vxrs_high, - sizeof(sregs_ext->vxrs_high))) - return -EFAULT; - for (i = 0; i < __NUM_VXRS_LOW; i++) - current->thread.ufpu.vxrs[i].low = vxrs[i]; - } - return 0; -} - -COMPAT_SYSCALL_DEFINE0(sigreturn) -{ - struct pt_regs *regs = task_pt_regs(current); - sigframe32 __user *frame = (sigframe32 __user *)regs->gprs[15]; - sigset_t set; - - if (get_compat_sigset(&set, (compat_sigset_t __user *)frame->sc.oldmask)) - goto badframe; - set_current_blocked(&set); - save_user_fpu_regs(); - if (restore_sigregs32(regs, &frame->sregs)) - goto badframe; - if (restore_sigregs_ext32(regs, &frame->sregs_ext)) - goto badframe; - load_sigregs(); - return regs->gprs[2]; -badframe: - force_sig(SIGSEGV); - return 0; -} - -COMPAT_SYSCALL_DEFINE0(rt_sigreturn) -{ - struct pt_regs *regs = task_pt_regs(current); - rt_sigframe32 __user *frame = (rt_sigframe32 __user *)regs->gprs[15]; - sigset_t set; - - if (get_compat_sigset(&set, &frame->uc.uc_sigmask)) - goto badframe; - set_current_blocked(&set); - if (compat_restore_altstack(&frame->uc.uc_stack)) - goto badframe; - save_user_fpu_regs(); - if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) - goto badframe; - if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) - goto badframe; - load_sigregs(); - return regs->gprs[2]; -badframe: - force_sig(SIGSEGV); - return 0; -} - -/* - * Set up a signal frame. - */ - - -/* - * Determine which stack to use.. 
- */ -static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) -{ - unsigned long sp; - - /* Default to using normal stack */ - sp = (unsigned long) A(regs->gprs[15]); - - /* Overflow on alternate signal stack gives SIGSEGV. */ - if (on_sig_stack(sp) && !on_sig_stack((sp - frame_size) & -8UL)) - return (void __user *) -1UL; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (! sas_ss_flags(sp)) - sp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)((sp - frame_size) & -8ul); -} - -static int setup_frame32(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - int sig = ksig->sig; - sigframe32 __user *frame; - unsigned long restorer; - size_t frame_size; - - /* - * gprs_high are always present for 31-bit compat tasks. - * The space for vector registers is only allocated if - * the machine supports it - */ - frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); - if (!cpu_has_vx()) - frame_size -= sizeof(frame->sregs_ext.vxrs_low) + - sizeof(frame->sregs_ext.vxrs_high); - frame = get_sigframe(&ksig->ka, regs, frame_size); - if (frame == (void __user *) -1UL) - return -EFAULT; - - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (unsigned int __user *) frame)) - return -EFAULT; - - /* Create struct sigcontext32 on the signal stack */ - if (put_compat_sigset((compat_sigset_t __user *)frame->sc.oldmask, - set, sizeof(compat_sigset_t))) - return -EFAULT; - if (__put_user(ptr_to_compat(&frame->sregs), &frame->sc.sregs)) - return -EFAULT; - - /* Store registers needed to create the signal frame */ - store_sigregs(); - - /* Create _sigregs32 on the signal stack */ - if (save_sigregs32(regs, &frame->sregs)) - return -EFAULT; - - /* Place signal number on stack to allow backtrace from handler. */ - if (__put_user(regs->gprs[2], (int __force __user *) &frame->signo)) - return -EFAULT; - - /* Create _sigregs_ext32 on the signal stack */ - if (save_sigregs_ext32(regs, &frame->sregs_ext)) - return -EFAULT; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = (unsigned long __force) - ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; - } else { - restorer = VDSO32_SYMBOL(current, sigreturn); - } - - /* Set up registers for signal handler */ - regs->gprs[14] = restorer; - regs->gprs[15] = (__force __u64) frame; - /* Force 31 bit amode and default user address space control. */ - regs->psw.mask = PSW_MASK_BA | - (PSW_USER_BITS & PSW_MASK_ASC) | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__force __u64) ksig->ka.sa.sa_handler; - - regs->gprs[2] = sig; - regs->gprs[3] = (__force __u64) &frame->sc; - - /* We forgot to include these in the sigcontext. - To avoid breaking binary compatibility, they are passed as args. */ - if (sig == SIGSEGV || sig == SIGBUS || sig == SIGILL || - sig == SIGTRAP || sig == SIGFPE) { - /* set extra registers only for synchronous signals */ - regs->gprs[4] = regs->int_code & 127; - regs->gprs[5] = regs->int_parm_long; - regs->gprs[6] = current->thread.last_break; - } - - return 0; -} - -static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - rt_sigframe32 __user *frame; - unsigned long restorer; - size_t frame_size; - u32 uc_flags; - - frame_size = sizeof(*frame) - - sizeof(frame->uc.uc_mcontext_ext.__reserved); - /* - * gprs_high are always present for 31-bit compat tasks. 
- * The space for vector registers is only allocated if - * the machine supports it - */ - uc_flags = UC_GPRS_HIGH; - if (cpu_has_vx()) { - uc_flags |= UC_VXRS; - } else { - frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + - sizeof(frame->uc.uc_mcontext_ext.vxrs_high); - } - frame = get_sigframe(&ksig->ka, regs, frame_size); - if (frame == (void __user *) -1UL) - return -EFAULT; - - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (unsigned int __force __user *) frame)) - return -EFAULT; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = (unsigned long __force) - ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; - } else { - restorer = VDSO32_SYMBOL(current, rt_sigreturn); - } - - /* Create siginfo on the signal stack */ - if (copy_siginfo_to_user32(&frame->info, &ksig->info)) - return -EFAULT; - - /* Store registers needed to create the signal frame */ - store_sigregs(); - - /* Create ucontext on the signal stack. */ - if (__put_user(uc_flags, &frame->uc.uc_flags) || - __put_user(0, &frame->uc.uc_link) || - __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || - save_sigregs32(regs, &frame->uc.uc_mcontext) || - put_compat_sigset(&frame->uc.uc_sigmask, set, sizeof(compat_sigset_t)) || - save_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->gprs[14] = restorer; - regs->gprs[15] = (__force __u64) frame; - /* Force 31 bit amode and default user address space control. */ - regs->psw.mask = PSW_MASK_BA | - (PSW_USER_BITS & PSW_MASK_ASC) | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__u64 __force) ksig->ka.sa.sa_handler; - - regs->gprs[2] = ksig->sig; - regs->gprs[3] = (__force __u64) &frame->info; - regs->gprs[4] = (__force __u64) &frame->uc; - regs->gprs[5] = current->thread.last_break; - return 0; -} - -/* - * OK, we're invoking a handler - */ - -void handle_signal32(struct ksignal *ksig, sigset_t *oldset, - struct pt_regs *regs) -{ - int ret; - - /* Set up the stack frame */ - if (ksig->ka.sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame32(ksig, oldset, regs); - else - ret = setup_frame32(ksig, oldset, regs); - - signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLE_STEP)); -} - diff --git a/arch/s390/kernel/cpacf.c b/arch/s390/kernel/cpacf.c index 3bebc47beeab..9d85b4bc7036 100644 --- a/arch/s390/kernel/cpacf.c +++ b/arch/s390/kernel/cpacf.c @@ -3,8 +3,7 @@ * Copyright IBM Corp. 
2024 */ -#define KMSG_COMPONENT "cpacf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpacf: " fmt #include <linux/cpu.h> #include <linux/device.h> diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 2f4174b961de..ab611764642a 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -6,8 +6,7 @@ * Christian Borntraeger (cborntra@de.ibm.com), */ -#define KMSG_COMPONENT "cpcmd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpcmd: " fmt #include <linux/kernel.h> #include <linux/export.h> diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6a26f202441d..71cdb6845dd7 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -10,8 +10,7 @@ * Bugreports to: <Linux390@de.ibm.com> */ -#define KMSG_COMPONENT "s390dbf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "s390dbf: " fmt #include <linux/stddef.h> #include <linux/kernel.h> diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 63a1d4226ff8..1cec93895b3a 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -503,24 +503,27 @@ static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len) void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; + unsigned long addr, pswaddr; unsigned char code[64]; char buffer[128], *ptr; - unsigned long addr; int start, end, opsize, hops, i; + pswaddr = regs->psw.addr; + if (test_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED)) + pswaddr = __forward_psw(regs->psw, regs->int_code >> 16); /* Get a snapshot of the 64 bytes surrounding the fault address. */ - for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { - addr = regs->psw.addr - 34 + start; + for (start = 32; start && pswaddr >= 34 - start; start -= 2) { + addr = pswaddr - 34 + start; if (copy_from_regs(regs, code + start - 2, (void *)addr, 2)) break; } for (end = 32; end < 64; end += 2) { - addr = regs->psw.addr + end - 32; + addr = pswaddr + end - 32; if (copy_from_regs(regs, code + end, (void *)addr, 2)) break; } /* Code snapshot usable ? */ - if ((regs->psw.addr & 1) || start >= end) { + if ((pswaddr & 1) || start >= end) { printk("%s Code: Bad PSW.\n", mode); return; } @@ -543,12 +546,12 @@ void show_code(struct pt_regs *regs) while (start < end && hops < 8) { opsize = insn_length(code[start]); if (start + opsize == 32) - *ptr++ = '#'; + *ptr++ = '*'; else if (start == 32) *ptr++ = '>'; else *ptr++ = ' '; - addr = regs->psw.addr + start - 32; + addr = pswaddr + start - 32; ptr += sprintf(ptr, "%px: ", (void *)addr); if (start + opsize >= end) break; diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index dd410962ecbe..f9d52e05e01e 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -155,12 +155,16 @@ static void show_last_breaking_event(struct pt_regs *regs) void show_registers(struct pt_regs *regs) { struct psw_bits *psw = &psw_bits(regs->psw); + unsigned long pswaddr; char *mode; + pswaddr = regs->psw.addr; + if (test_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED)) + pswaddr = __forward_psw(regs->psw, regs->int_code >> 16); mode = user_mode(regs) ? 
"User" : "Krnl"; - printk("%s PSW : %px %px", mode, (void *)regs->psw.mask, (void *)regs->psw.addr); + printk("%s PSW : %px %px", mode, (void *)regs->psw.mask, (void *)pswaddr); if (!user_mode(regs)) - pr_cont(" (%pSR)", (void *)regs->psw.addr); + pr_cont(" (%pSR)", (void *)pswaddr); pr_cont("\n"); printk(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x " "P:%x AS:%x CC:%x PM:%x", psw->per, psw->dat, psw->io, psw->ext, diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 544e5403dd91..b27239c03d79 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -4,8 +4,7 @@ * Author(s): Hongjie Yang <hongjie@us.ibm.com>, */ -#define KMSG_COMPONENT "setup" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "setup: " fmt #include <linux/sched/debug.h> #include <linux/cpufeature.h> @@ -120,21 +119,21 @@ static noinline __init void setup_arch_string(void) EBCASC(mach->type, sizeof(mach->type)); EBCASC(mach->model, sizeof(mach->model)); EBCASC(mach->model_capacity, sizeof(mach->model_capacity)); - sprintf(mstr, "%-16.16s %-4.4s %-16.16s %-16.16s", - mach->manufacturer, mach->type, - mach->model, mach->model_capacity); + scnprintf(mstr, sizeof(mstr), "%-16.16s %-4.4s %-16.16s %-16.16s", + mach->manufacturer, mach->type, + mach->model, mach->model_capacity); strim_all(mstr); if (stsi(vm, 3, 2, 2) == 0 && vm->count) { EBCASC(vm->vm[0].cpi, sizeof(vm->vm[0].cpi)); - sprintf(hvstr, "%-16.16s", vm->vm[0].cpi); + scnprintf(hvstr, sizeof(hvstr), "%-16.16s", vm->vm[0].cpi); strim_all(hvstr); } else { - sprintf(hvstr, "%s", - machine_is_lpar() ? "LPAR" : - machine_is_vm() ? "z/VM" : - machine_is_kvm() ? "KVM" : "unknown"); + scnprintf(hvstr, sizeof(hvstr), "%s", + machine_is_lpar() ? "LPAR" : + machine_is_vm() ? "z/VM" : + machine_is_kvm() ? 
"KVM" : "unknown"); } - sprintf(arch_hw_string, "HW: %s (%s)", mstr, hvstr); + scnprintf(arch_hw_string, sizeof(arch_hw_string), "HW: %s (%s)", mstr, hvstr); dump_stack_set_arch_desc("%s (%s)", mstr, hvstr); } diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 75b0fbb236d0..c360087807d8 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -162,9 +162,13 @@ SYM_FUNC_START(__switch_to_asm) stg %r3,__LC_CURRENT(%r13) # store task struct of next stg %r15,__LC_KERNEL_STACK(%r13) # store end of kernel stack lg %r15,__THREAD_ksp(%r1,%r3) # load kernel stack of next - aghi %r3,__TASK_pid - mvc __LC_CURRENT_PID(4,%r13),0(%r3) # store pid of next + lay %r4,__TASK_pid(%r3) + mvc __LC_CURRENT_PID(4,%r13),0(%r4) # store pid of next ALTERNATIVE "nop", "lpp _LPP_OFFSET(%r13)", ALT_FACILITY(40) +#ifdef CONFIG_STACKPROTECTOR + lg %r3,__TASK_stack_canary(%r3) + stg %r3,__LC_STACK_CANARY(%r13) +#endif lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task BR_EX %r14 SYM_FUNC_END(__switch_to_asm) @@ -606,20 +610,3 @@ SYM_DATA_START_LOCAL(daton_psw) .quad PSW_KERNEL_BITS .quad .Ldaton SYM_DATA_END(daton_psw) - - .section .rodata, "a" - .balign 8 -#define SYSCALL(esame,emu) .quad __s390x_ ## esame -SYM_DATA_START(sys_call_table) -#include <asm/syscall_table.h> -SYM_DATA_END(sys_call_table) -#undef SYSCALL - -#ifdef CONFIG_COMPAT - -#define SYSCALL(esame,emu) .quad __s390_ ## emu -SYM_DATA_START(sys_call_table_emu) -#include <asm/syscall_table.h> -SYM_DATA_END(sys_call_table_emu) -#undef SYSCALL -#endif diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head.S index 7edb9ded199c..7edb9ded199c 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head.S diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c index 2507bc3f7757..217206522266 100644 --- a/arch/s390/kernel/hiperdispatch.c +++ b/arch/s390/kernel/hiperdispatch.c @@ -3,8 +3,7 @@ * Copyright IBM Corp. 
2024 */ -#define KMSG_COMPONENT "hd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hd: " fmt /* * Hiperdispatch: @@ -65,7 +64,7 @@ #define HD_DELAY_FACTOR (4) #define HD_DELAY_INTERVAL (HZ / 4) -#define HD_STEAL_THRESHOLD 30 +#define HD_STEAL_THRESHOLD 10 #define HD_STEAL_AVG_WEIGHT 16 static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 91e207b50394..9d1f8a50f5a4 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -22,12 +22,14 @@ #include <linux/bug.h> #include <linux/memory.h> #include <linux/execmem.h> +#include <asm/arch-stackprotector.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/facility.h> #include <asm/ftrace.lds.h> #include <asm/set_memory.h> #include <asm/setup.h> +#include <asm/asm-offsets.h> #if 0 #define DEBUGP printk @@ -495,9 +497,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s; char *secstrings, *secname; void *aseg; -#ifdef CONFIG_FUNCTION_TRACER - int ret; -#endif + int rc = 0; if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable && me->arch.plt_size) { @@ -527,14 +527,21 @@ int module_finalize(const Elf_Ehdr *hdr, (str_has_prefix(secname, ".s390_return"))) nospec_revert(aseg, aseg + s->sh_size); + if (IS_ENABLED(CONFIG_STACKPROTECTOR) && + (str_has_prefix(secname, "__stack_protector_loc"))) { + rc = stack_protector_apply(aseg, aseg + s->sh_size); + if (rc) + break; + } + #ifdef CONFIG_FUNCTION_TRACER if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) { - ret = module_alloc_ftrace_hotpatch_trampolines(me, s); - if (ret < 0) - return ret; + rc = module_alloc_ftrace_hotpatch_trampolines(me, s); + if (rc) + break; } #endif /* CONFIG_FUNCTION_TRACER */ } - return 0; + return rc; } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 11f33243a23f..a55abbf65333 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -184,7 +184,7 @@ static notrace void nmi_print_info(void) sclp_emergency_printk(message); } -static notrace void s390_handle_damage(void) +static notrace void __noreturn s390_handle_damage(void) { struct lowcore *lc = get_lowcore(); union ctlreg0 cr0, cr0_new; @@ -214,7 +214,6 @@ static notrace void s390_handle_damage(void) lc->mcck_new_psw = psw_save; local_ctl_load(0, &cr0.reg); disabled_wait(); - while (1); } NOKPROBE_SYMBOL(s390_handle_damage); diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index c2a468986212..94fa44776d0c 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -6,8 +6,7 @@ * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "os_info" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "os_info: " fmt #include <linux/crash_dump.h> #include <linux/kernel.h> diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 04457d88e589..408ab93112bf 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -6,8 +6,7 @@ * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> * Thomas Richter <tmricht@linux.ibm.com> */ -#define KMSG_COMPONENT "cpum_cf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpum_cf: " fmt #include <linux/kernel.h> #include <linux/kernel_stat.h> @@ -1206,7 +1205,7 @@ static int __init cpumf_pmu_init(void) } /* Setup s390dbf facility */ - cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + cf_dbg = debug_register("cpum_cf", 2, 1, 128); if (!cf_dbg) { 
pr_err("Registration of s390dbf(cpum_cf) failed\n"); rc = -ENOMEM; @@ -1689,7 +1688,6 @@ static const struct file_operations cfset_fops = { .open = cfset_open, .release = cfset_release, .unlocked_ioctl = cfset_ioctl, - .compat_ioctl = cfset_ioctl, }; static struct miscdevice cfset_dev = { diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index f432869f8921..459af23a47a5 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2013, 2018 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "cpum_sf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpum_sf: " fmt #include <linux/kernel.h> #include <linux/kernel_stat.h> @@ -1093,7 +1092,7 @@ static void perf_event_count_update(struct perf_event *event, u64 count) * combined-sampling data entry consists of a basic- and a diagnostic-sampling * data entry. The sampling function is determined by the flags in the perf * event hardware structure. The function always works with a combined-sampling - * data entry but ignores the the diagnostic portion if it is not available. + * data entry but ignores the diagnostic portion if it is not available. * * Note that the implementation focuses on basic-sampling data entries and, if * such an entry is not valid, the entire combined-sampling data entry is @@ -2070,7 +2069,7 @@ static int __init init_cpum_sampling_pmu(void) CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG); } - sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80); + sfdbg = debug_register("cpum_sf", 2, 1, 80); if (!sfdbg) { pr_err("Registering for s390dbf failed\n"); return -ENOMEM; diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 91b8716c883a..606750bae508 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2012, 2013 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "perf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "perf: " fmt #include <linux/kernel.h> #include <linux/perf_event.h> @@ -15,7 +14,6 @@ #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/uaccess.h> -#include <linux/compat.h> #include <linux/sysfs.h> #include <asm/stacktrace.h> #include <asm/irq.h> diff --git a/arch/s390/kernel/perf_pai.c b/arch/s390/kernel/perf_pai.c new file mode 100644 index 000000000000..810f5b6c5e01 --- /dev/null +++ b/arch/s390/kernel/perf_pai.c @@ -0,0 +1,1230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Facility + * + * Copyright IBM Corp. 2026 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define pr_fmt(fmt) "pai: " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/perf_event.h> +#include <asm/ctlreg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +static debug_info_t *paidbg; + +DEFINE_STATIC_KEY_FALSE(pai_key); + +enum { + PAI_PMU_CRYPTO, /* Index of PMU pai_crypto */ + PAI_PMU_EXT, /* Index of PMU pai_ext */ + PAI_PMU_MAX /* # of PAI PMUs */ +}; + +enum { + PAIE1_CB_SZ = 0x200, /* Size of PAIE1 control block */ + PAIE1_CTRBLOCK_SZ = 0x400 /* Size of PAIE1 counter blocks */ +}; + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. 
+ * The PAI extension control block 1 is pointed to by lowcore
+ * address 0x1508 for each CPU. This control block is 512 bytes in size
+ * and requires a 512 byte boundary alignment.
+ */
+struct paiext_cb {		/* PAI extension 1 control block */
+	u64 header;		/* Not used */
+	u64 reserved1;
+	u64 acc;		/* Addr to analytics counter control block */
+	u8 reserved2[PAIE1_CTRBLOCK_SZ - 3 * sizeof(u64)];
+} __packed;
+
+struct pai_map {
+	unsigned long *area;		/* Area for CPU to store counters */
+	struct pai_userdata *save;	/* Page to store non-zero counters */
+	unsigned int active_events;	/* # of active PAI users */
+	refcount_t refcnt;		/* Reference count mapped buffers */
+	struct perf_event *event;	/* Perf event for sampling */
+	struct list_head syswide_list;	/* List system-wide sampling events */
+	struct paiext_cb *paiext_cb;	/* PAI extension control block area */
+	bool fullpage;			/* True: counter area is a full page */
+};
+
+struct pai_mapptr {
+	struct pai_map *mapptr;
+};
+
+static struct pai_root {		/* Anchor to per CPU data */
+	refcount_t refcnt;		/* Overall active events */
+	struct pai_mapptr __percpu *mapptr;
+} pai_root[PAI_PMU_MAX];
+
+/* This table defines the different parameters of the PAI PMUs. During
+ * initialization the machine dependent values are extracted and saved.
+ * However most of the values are static and do not change.
+ * There is one table entry per PAI PMU.
+ */
+struct pai_pmu {			/* Define PAI PMU characteristics */
+	const char *pmuname;		/* Name of PMU */
+	const int facility_nr;		/* Facility number to check for support */
+	unsigned int num_avail;		/* # Counters defined by hardware */
+	unsigned int num_named;		/* # Counters known by name */
+	unsigned long base;		/* Counter set base number */
+	unsigned long kernel_offset;	/* Offset to kernel part in counter page */
+	unsigned long area_size;	/* Size of counter area */
+	const char * const *names;	/* List of counter names */
+	struct pmu *pmu;		/* Ptr to supporting PMU */
+	int (*init)(struct pai_pmu *p);	/* PMU support init function */
+	void (*exit)(struct pai_pmu *p); /* PMU support exit function */
+	struct attribute_group *event_group; /* Ptr to attribute of events */
+};
+
+static struct pai_pmu pai_pmu[];	/* Forward declaration */
+
+/* Free per CPU data when the last event is removed. */
+static void pai_root_free(int idx)
+{
+	if (refcount_dec_and_test(&pai_root[idx].refcnt)) {
+		free_percpu(pai_root[idx].mapptr);
+		pai_root[idx].mapptr = NULL;
+	}
+	debug_sprintf_event(paidbg, 5, "%s root[%d].refcount %d\n", __func__,
+			    idx, refcount_read(&pai_root[idx].refcnt));
+}
+
+/*
+ * On initialization of first event also allocate per CPU data dynamically.
+ * Start with an array of pointers, the array size is the maximum number of
+ * CPUs possible, which might be larger than the number of CPUs currently
+ * online.
+ */
+static int pai_root_alloc(int idx)
+{
+	if (!refcount_inc_not_zero(&pai_root[idx].refcnt)) {
+		/* The memory is already zeroed. */
+		pai_root[idx].mapptr = alloc_percpu(struct pai_mapptr);
+		if (!pai_root[idx].mapptr)
+			return -ENOMEM;
+		refcount_set(&pai_root[idx].refcnt, 1);
+	}
+	return 0;
+}
+
+/* Release the PMU if event is the last perf event */
+static DEFINE_MUTEX(pai_reserve_mutex);
+
+/* Free all memory allocated for event counting/sampling setup */
+static void pai_free(struct pai_mapptr *mp)
+{
+	if (mp->mapptr->fullpage)
+		free_page((unsigned long)mp->mapptr->area);
+	else
+		kfree(mp->mapptr->area);
+	kfree(mp->mapptr->paiext_cb);
+	kvfree(mp->mapptr->save);
+	kfree(mp->mapptr);
+	mp->mapptr = NULL;
+}
+
+/* Adjust usage counters and remove allocated memory when all users are
+ * gone.
+ */
+static void pai_event_destroy_cpu(struct perf_event *event, int cpu)
+{
+	int idx = PAI_PMU_IDX(event);
+	struct pai_mapptr *mp = per_cpu_ptr(pai_root[idx].mapptr, cpu);
+	struct pai_map *cpump = mp->mapptr;
+
+	mutex_lock(&pai_reserve_mutex);
+	debug_sprintf_event(paidbg, 5, "%s event %#llx idx %d cpu %d users %d "
+			    "refcnt %u\n", __func__, event->attr.config, idx,
+			    event->cpu, cpump->active_events,
+			    refcount_read(&cpump->refcnt));
+	if (refcount_dec_and_test(&cpump->refcnt))
+		pai_free(mp);
+	pai_root_free(idx);
+	mutex_unlock(&pai_reserve_mutex);
+}
+
+static void pai_event_destroy(struct perf_event *event)
+{
+	int cpu;
+
+	free_page(PAI_SAVE_AREA(event));
+	if (event->cpu == -1) {
+		struct cpumask *mask = PAI_CPU_MASK(event);
+
+		for_each_cpu(cpu, mask)
+			pai_event_destroy_cpu(event, cpu);
+		kfree(mask);
+	} else {
+		pai_event_destroy_cpu(event, event->cpu);
+	}
+}
+
+static void paicrypt_event_destroy(struct perf_event *event)
+{
+	static_branch_dec(&pai_key);
+	pai_event_destroy(event);
+}
+
+static u64 pai_getctr(unsigned long *page, int nr, unsigned long offset)
+{
+	if (offset)
+		nr += offset / sizeof(*page);
+	return page[nr];
+}
+
+/* Read the counter values. Return value from location in CMP. For base
+ * event xxx_ALL sum up all events. Returns counter value.
+ */
+static u64 pai_getdata(struct perf_event *event, bool kernel)
+{
+	int idx = PAI_PMU_IDX(event);
+	struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr);
+	struct pai_pmu *pp = &pai_pmu[idx];
+	struct pai_map *cpump = mp->mapptr;
+	unsigned int i;
+	u64 sum = 0;
+
+	if (event->attr.config != pp->base) {
+		return pai_getctr(cpump->area,
+				  event->attr.config - pp->base,
+				  kernel ? pp->kernel_offset : 0);
+	}
+
+	for (i = 1; i <= pp->num_avail; i++) {
+		u64 val = pai_getctr(cpump->area, i,
+				     kernel ? pp->kernel_offset : 0);
+
+		if (!val)
+			continue;
+		sum += val;
+	}
+	return sum;
+}
+
+static u64 paicrypt_getall(struct perf_event *event)
+{
+	u64 sum = 0;
+
+	if (!event->attr.exclude_kernel)
+		sum += pai_getdata(event, true);
+	if (!event->attr.exclude_user)
+		sum += pai_getdata(event, false);
+
+	return sum;
+}
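The counter area addressed by pai_getctr() holds the user-space counters first, with the kernel-space copies starting kernel_offset bytes into the same area (PAI_CRYPTO_KERNEL_OFFSET for pai_crypto, 0 for pai_ext). As a rough illustration, not part of the patch, reading both address-space copies of one crypto counter would look like this:

    /* Sketch only, not part of the patch: read both address-space
     * copies of one pai_crypto counter.  KM_AES_128 is counter 7,
     * see paicrypt_ctrnames[] below.
     */
    static u64 sketch_km_aes_128_total(unsigned long *area)
    {
            int nr = 7;     /* KM_AES_128 */
            u64 user = area[nr];
            u64 kernel = area[nr + PAI_CRYPTO_KERNEL_OFFSET / sizeof(*area)];

            return user + kernel;   /* what paicrypt_getall() sums per event */
    }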
+/* Check concurrent access of counting and sampling for PAI events.
+ * This function is called in process context and it is safe to block.
+ * When the event initialization function fails, no other callback will
+ * be invoked.
+ *
+ * Allocate the memory for the event.
+ */
+static int pai_alloc_cpu(struct perf_event *event, int cpu)
+{
+	int rc, idx = PAI_PMU_IDX(event);
+	struct pai_map *cpump = NULL;
+	bool need_paiext_cb = false;
+	struct pai_mapptr *mp;
+
+	mutex_lock(&pai_reserve_mutex);
+	/* Allocate root node */
+	rc = pai_root_alloc(idx);
+	if (rc)
+		goto unlock;
+
+	/* Allocate node for this event */
+	mp = per_cpu_ptr(pai_root[idx].mapptr, cpu);
+	cpump = mp->mapptr;
+	if (!cpump) {			/* struct pai_map allocated? */
+		rc = -ENOMEM;
+		cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
+		if (!cpump)
+			goto undo;
+		/* Allocate memory for counter page and counter extraction.
+		 * Only the first counting event has to allocate a page.
+		 */
+		mp->mapptr = cpump;
+		if (idx == PAI_PMU_CRYPTO) {
+			cpump->area = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+			/* free_page() can handle 0x0 address */
+			cpump->fullpage = true;
+		} else {		/* PAI_PMU_EXT */
+			/*
+			 * Allocate memory for counter area and counter extraction.
+			 * These are
+			 * - a 512 byte block which requires 512 byte boundary
+			 *   alignment.
+			 * - a 1 KB block which requires 1 KB boundary
+			 *   alignment.
+			 * Only the first counting event has to allocate the area.
+			 *
+			 * Note: This works with commit 59bb47985c1d by default.
+			 * Backporting this to kernels without this commit might
+			 * need adjustment.
+			 */
+			cpump->area = kzalloc(pai_pmu[idx].area_size, GFP_KERNEL);
+			cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
+			need_paiext_cb = true;
+		}
+		cpump->save = kvmalloc_array(pai_pmu[idx].num_avail + 1,
+					     sizeof(struct pai_userdata),
+					     GFP_KERNEL);
+		if (!cpump->area || !cpump->save ||
+		    (need_paiext_cb && !cpump->paiext_cb)) {
+			pai_free(mp);
+			goto undo;
+		}
+		INIT_LIST_HEAD(&cpump->syswide_list);
+		refcount_set(&cpump->refcnt, 1);
+		rc = 0;
+	} else {
+		refcount_inc(&cpump->refcnt);
+	}
+
+undo:
+	if (rc) {
+		/* Error in allocation of event, decrement anchor. Since
+		 * the event is not created, its destroy() function is never
+		 * invoked. Adjust the reference counter for the anchor.
+		 */
+		pai_root_free(idx);
+	}
+unlock:
+	mutex_unlock(&pai_reserve_mutex);
+	/* If rc is non-zero, no increment of counter/sampler was done. */
+	return rc;
+}
+
+static int pai_alloc(struct perf_event *event)
+{
+	struct cpumask *maskptr;
+	int cpu, rc = -ENOMEM;
+
+	maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL);
+	if (!maskptr)
+		goto out;
+
+	for_each_online_cpu(cpu) {
+		rc = pai_alloc_cpu(event, cpu);
+		if (rc) {
+			for_each_cpu(cpu, maskptr)
+				pai_event_destroy_cpu(event, cpu);
+			kfree(maskptr);
+			goto out;
+		}
+		cpumask_set_cpu(cpu, maskptr);
+	}
+
+	/*
+	 * On error all cpumasks are freed and all events have been destroyed.
+	 * Record which CPUs data structures have been allocated for.
+	 * Release them in the pai_event_destroy callback function
+	 * for this event.
+	 */
+	PAI_CPU_MASK(event) = maskptr;
+	rc = 0;
+out:
+	return rc;
+}
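Stripped of the PAI specifics, pai_root_alloc()/pai_root_free() follow the usual first-user-allocates, last-user-frees pattern. A minimal standalone sketch (hypothetical names, callers serialized by a mutex exactly as pai_reserve_mutex does above):

    /* Sketch of the refcount pattern behind pai_root_alloc()/pai_root_free();
     * demo_root stands in for struct pai_root.  Must run under a mutex.
     */
    static struct {
            refcount_t refcnt;
            struct pai_mapptr __percpu *data;
    } demo_root;

    static int demo_get(void)               /* first user allocates */
    {
            if (!refcount_inc_not_zero(&demo_root.refcnt)) {
                    demo_root.data = alloc_percpu(struct pai_mapptr);
                    if (!demo_root.data)
                            return -ENOMEM; /* refcnt stays zero */
                    refcount_set(&demo_root.refcnt, 1);
            }
            return 0;
    }

    static void demo_put(void)              /* last user frees */
    {
            if (refcount_dec_and_test(&demo_root.refcnt)) {
                    free_percpu(demo_root.data);
                    demo_root.data = NULL;
            }
    }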
+/* Validate event number and return error if event is not supported.
+ * On successful return, PAI_PMU_IDX(event) is set to the index of
+ * the supporting pai_pmu[] array element.
+ */
+static int pai_event_valid(struct perf_event *event, int idx)
+{
+	struct perf_event_attr *a = &event->attr;
+	struct pai_pmu *pp = &pai_pmu[idx];
+
+	/* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */
+	if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+		return -ENOENT;
+	/* Allow only CRYPTO_ALL/NNPA_ALL for sampling */
+	if (a->sample_period && a->config != pp->base)
+		return -EINVAL;
+	/* PAI event must be in valid range, try others if not */
+	if (a->config < pp->base || a->config > pp->base + pp->num_avail)
+		return -ENOENT;
+	if (idx == PAI_PMU_EXT && a->exclude_user)
+		return -EINVAL;
+	PAI_PMU_IDX(event) = idx;
+	return 0;
+}
+
+/* Might be called on different CPU than the one the event is intended for. */
+static int pai_event_init(struct perf_event *event, int idx)
+{
+	struct perf_event_attr *a = &event->attr;
+	int rc;
+
+	/* PAI event must be valid and in supported range */
+	rc = pai_event_valid(event, idx);
+	if (rc)
+		goto out;
+	/* Get a page to store last counter values for sampling */
+	if (a->sample_period) {
+		PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL);
+		if (!PAI_SAVE_AREA(event)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+	}
+
+	if (event->cpu >= 0)
+		rc = pai_alloc_cpu(event, event->cpu);
+	else
+		rc = pai_alloc(event);
+	if (rc) {
+		free_page(PAI_SAVE_AREA(event));
+		goto out;
+	}
+
+	if (a->sample_period) {
+		a->sample_period = 1;
+		a->freq = 0;
+		/* Register for paicrypt_sched_task() to be called */
+		event->attach_state |= PERF_ATTACH_SCHED_CB;
+		/* Add raw data which contains the memory mapped counters */
+		a->sample_type |= PERF_SAMPLE_RAW;
+		/* Turn off inheritance */
+		a->inherit = 0;
+	}
+out:
+	return rc;
+}
+
+static int paicrypt_event_init(struct perf_event *event)
+{
+	int rc = pai_event_init(event, PAI_PMU_CRYPTO);
+
+	if (!rc) {
+		event->destroy = paicrypt_event_destroy;
+		static_branch_inc(&pai_key);
+	}
+	return rc;
+}
+static void pai_read(struct perf_event *event,
+		     u64 (*fct)(struct perf_event *event))
+{
+	u64 prev, new, delta;
+
+	prev = local64_read(&event->hw.prev_count);
+	new = fct(event);
+	local64_set(&event->hw.prev_count, new);
+	delta = (prev <= new) ? new - prev : (-1ULL - prev) + new + 1;
+	local64_add(delta, &event->count);
+}
+
+static void paicrypt_read(struct perf_event *event)
+{
+	pai_read(event, paicrypt_getall);
+}
+
+static void pai_start(struct perf_event *event, int flags,
+		      u64 (*fct)(struct perf_event *event))
+{
+	int idx = PAI_PMU_IDX(event);
+	struct pai_pmu *pp = &pai_pmu[idx];
+	struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr);
+	struct pai_map *cpump = mp->mapptr;
+	u64 sum;
+
+	if (!event->attr.sample_period) {	/* Counting */
+		sum = fct(event);		/* Get current value */
+		local64_set(&event->hw.prev_count, sum);
+	} else {				/* Sampling */
+		memcpy((void *)PAI_SAVE_AREA(event), cpump->area, pp->area_size);
+		/* Enable context switch callback for system-wide sampling */
+		if (!(event->attach_state & PERF_ATTACH_TASK)) {
+			list_add_tail(PAI_SWLIST(event), &cpump->syswide_list);
+			perf_sched_cb_inc(event->pmu);
+		} else {
+			cpump->event = event;
+		}
+	}
+}
+
+static void paicrypt_start(struct perf_event *event, int flags)
+{
+	pai_start(event, flags, paicrypt_getall);
+}
+
+static int pai_add(struct perf_event *event, int flags)
+{
+	int idx = PAI_PMU_IDX(event);
+	struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr);
+	struct pai_map *cpump = mp->mapptr;
+	struct paiext_cb *pcb = cpump->paiext_cb;
+	unsigned long ccd;
+
+	if (++cpump->active_events == 1) {
+		if (!pcb) {		/* PAI crypto */
+			ccd = virt_to_phys(cpump->area) | PAI_CRYPTO_KERNEL_OFFSET;
+			WRITE_ONCE(get_lowcore()->ccd, ccd);
+			local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
+		} else {		/* PAI extension 1 */
+			ccd = virt_to_phys(pcb);
+			WRITE_ONCE(get_lowcore()->aicd, ccd);
+			pcb->acc = virt_to_phys(cpump->area) | 0x1;
+			/* Enable CPU instruction lookup for PAIE1 control block */
+			local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT);
+		}
+	}
+	if (flags & PERF_EF_START)
+		pai_pmu[idx].pmu->start(event, PERF_EF_RELOAD);
+	event->hw.state = 0;
+	return 0;
+}
+
+static int paicrypt_add(struct perf_event *event, int flags)
+{
+	return pai_add(event, flags);
+}
+
+static void pai_have_sample(struct perf_event *, struct pai_map *);
+static void pai_stop(struct perf_event *event, int flags)
+{
+	int idx = PAI_PMU_IDX(event);
+	struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr);
+	struct pai_map *cpump = mp->mapptr;
+
+	if (!event->attr.sample_period) {	/* Counting */
+		pai_pmu[idx].pmu->read(event);
+	} else {				/* Sampling */
+		if (!(event->attach_state & PERF_ATTACH_TASK)) {
+			perf_sched_cb_dec(event->pmu);
+			list_del(PAI_SWLIST(event));
+		} else {
+			pai_have_sample(event, cpump);
+			cpump->event = NULL;
+		}
+	}
+	event->hw.state = PERF_HES_STOPPED;
+}
+
+static void paicrypt_stop(struct perf_event *event, int flags)
+{
+	pai_stop(event, flags);
+}
+
+static void pai_del(struct perf_event *event, int flags)
+{
+	int idx = PAI_PMU_IDX(event);
+	struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr);
+	struct pai_map *cpump = mp->mapptr;
+	struct paiext_cb *pcb = cpump->paiext_cb;
+
+	pai_pmu[idx].pmu->stop(event, PERF_EF_UPDATE);
+	if (--cpump->active_events == 0) {
+		if (!pcb) {		/* PAI crypto */
+			local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
+			WRITE_ONCE(get_lowcore()->ccd, 0);
+		} else {		/* PAI extension 1 */
+			/* Disable CPU instruction lookup for PAIE1 control block */
+			local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT);
+			pcb->acc = 0;
+			WRITE_ONCE(get_lowcore()->aicd, 0);
+		}
+	}
+}
+
+static void paicrypt_del(struct perf_event *event, int flags)
+{
+	pai_del(event, flags);
+}
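The ternary in pai_read() above is a wrap-safe delta: when the new reading is smaller than the previous one, the 64-bit counter wrapped, and the distance is measured across the boundary. In isolation (illustrative sketch, hypothetical helper name):

    /* Sketch: the wrap-safe delta computed by pai_read().
     * Example: prev = 0xfffffffffffffffe, new = 3  =>  delta = 5
     * (two increments to wrap past ~0ULL, then three more).
     */
    static u64 sketch_wrap_delta(u64 prev, u64 new)
    {
            return (prev <= new) ? new - prev
                                 : (-1ULL - prev) + new + 1;
    }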
+/* Create raw data and save it in buffer. Calculate the delta for each
+ * counter between this invocation and the last invocation.
+ * Returns number of bytes copied.
+ * Saves only entries with positive counter difference of the form
+ * 2 bytes: Number of counter
+ * 8 bytes: Value of counter
+ */
+static size_t pai_copy(struct pai_userdata *userdata, unsigned long *page,
+		       struct pai_pmu *pp, unsigned long *page_old,
+		       bool exclude_user, bool exclude_kernel)
+{
+	int i, outidx = 0;
+
+	for (i = 1; i <= pp->num_avail; i++) {
+		u64 val = 0, val_old = 0;
+
+		if (!exclude_kernel) {
+			val += pai_getctr(page, i, pp->kernel_offset);
+			val_old += pai_getctr(page_old, i, pp->kernel_offset);
+		}
+		if (!exclude_user) {
+			val += pai_getctr(page, i, 0);
+			val_old += pai_getctr(page_old, i, 0);
+		}
+		if (val >= val_old)
+			val -= val_old;
+		else
+			val = (~0ULL - val_old) + val + 1;
+		if (val) {
+			userdata[outidx].num = i;
+			userdata[outidx].value = val;
+			outidx++;
+		}
+	}
+	return outidx * sizeof(*userdata);
+}
+
+/* Write sample when one or more counter values are nonzero.
+ *
+ * Note: The functions paicrypt_sched_task() and pai_push_sample() are not
+ * invoked after function paicrypt_del() has been called because of function
+ * perf_sched_cb_dec(). Both functions are only
+ * called when sampling is active. Function perf_sched_cb_inc()
+ * has been invoked to install function paicrypt_sched_task() as callback
+ * to run at context switch time.
+ *
+ * This causes function perf_event_context_sched_out() and
+ * perf_event_context_sched_in() to check whether the PMU has installed a
+ * sched_task() callback. That callback is not active after paicrypt_del()
+ * returns and has deleted the event on that CPU.
+ */
+static int pai_push_sample(size_t rawsize, struct pai_map *cpump,
+			   struct perf_event *event)
+{
+	int idx = PAI_PMU_IDX(event);
+	struct pai_pmu *pp = &pai_pmu[idx];
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	int overflow;
+
+	/* Setup perf sample */
+	memset(&regs, 0, sizeof(regs));
+	memset(&raw, 0, sizeof(raw));
+	memset(&data, 0, sizeof(data));
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+	if (event->attr.sample_type & PERF_SAMPLE_TID) {
+		data.tid_entry.pid = task_tgid_nr(current);
+		data.tid_entry.tid = task_pid_nr(current);
+	}
+	if (event->attr.sample_type & PERF_SAMPLE_TIME)
+		data.time = event->clock();
+	if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+		data.id = event->id;
+	if (event->attr.sample_type & PERF_SAMPLE_CPU) {
+		data.cpu_entry.cpu = smp_processor_id();
+		data.cpu_entry.reserved = 0;
+	}
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.frag.size = rawsize;
+		raw.frag.data = cpump->save;
+		perf_sample_save_raw_data(&data, event, &raw);
+	}
+
+	overflow = perf_event_overflow(event, &data, &regs);
+	perf_event_update_userpage(event);
+	/* Save crypto counter lowcore page after reading event data. */
+	memcpy((void *)PAI_SAVE_AREA(event), cpump->area, pp->area_size);
+	return overflow;
+}
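The PERF_SAMPLE_RAW payload that pai_push_sample() emits is the packed stream produced by pai_copy(): one { u16 num; u64 value; } record per counter that changed. A user-space consumer could walk it like this (sketch, not shipped with the patch):

    /* Sketch (user space): decode the raw sample payload written by
     * pai_copy()/pai_push_sample().  Layout per record: 2 bytes counter
     * number, 8 bytes counter delta, packed back to back.
     */
    #include <stdio.h>
    #include <stddef.h>
    #include <linux/types.h>

    struct pai_userdata {
            __u16 num;      /* counter number, see *_ctrnames[] */
            __u64 value;    /* delta since the previous sample */
    } __attribute__((packed));

    static void decode_raw(const void *raw, size_t rawsize)
    {
            const struct pai_userdata *p = raw;
            size_t i, n = rawsize / sizeof(*p);

            for (i = 0; i < n; i++)
                    printf("counter %u: +%llu\n", p[i].num,
                           (unsigned long long)p[i].value);
    }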
+/* Check if there is data to be saved on schedule out of a task. */
+static void pai_have_sample(struct perf_event *event, struct pai_map *cpump)
+{
+	struct pai_pmu *pp;
+	size_t rawsize;
+
+	if (!event)		/* No event active */
+		return;
+	pp = &pai_pmu[PAI_PMU_IDX(event)];
+	rawsize = pai_copy(cpump->save, cpump->area, pp,
+			   (unsigned long *)PAI_SAVE_AREA(event),
+			   event->attr.exclude_user,
+			   event->attr.exclude_kernel);
+	if (rawsize)		/* Any counter incremented? */
+		pai_push_sample(rawsize, cpump, event);
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static void pai_have_samples(int idx)
+{
+	struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr);
+	struct pai_map *cpump = mp->mapptr;
+	struct perf_event *event;
+
+	list_for_each_entry(event, &cpump->syswide_list, hw.tp_list)
+		pai_have_sample(event, cpump);
+}
+
+/* Called on schedule-in and schedule-out. No access to event structure,
+ * but for sampling only event CRYPTO_ALL is allowed.
+ */
+static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx,
+				struct task_struct *task, bool sched_in)
+{
+	/* We started with a clean page on event installation. So read out
+	 * results on schedule_out and if page was dirty, save old values.
+	 */
+	if (!sched_in)
+		pai_have_samples(PAI_PMU_CRYPTO);
+}
+
+/* ============================= paiext ====================================*/
+
+static void paiext_event_destroy(struct perf_event *event)
+{
+	pai_event_destroy(event);
+}
+
+/* Might be called on different CPU than the one the event is intended for. */
+static int paiext_event_init(struct perf_event *event)
+{
+	int rc = pai_event_init(event, PAI_PMU_EXT);
+
+	if (!rc) {
+		event->attr.exclude_kernel = true;	/* No kernel space part */
+		event->destroy = paiext_event_destroy;
+		/* Offset of NNPA in paiext_cb */
+		event->hw.config_base = offsetof(struct paiext_cb, acc);
+	}
+	return rc;
+}
+
+static u64 paiext_getall(struct perf_event *event)
+{
+	return pai_getdata(event, false);
+}
+
+static void paiext_read(struct perf_event *event)
+{
+	pai_read(event, paiext_getall);
+}
+
+static void paiext_start(struct perf_event *event, int flags)
+{
+	pai_start(event, flags, paiext_getall);
+}
+
+static int paiext_add(struct perf_event *event, int flags)
+{
+	return pai_add(event, flags);
+}
+
+static void paiext_stop(struct perf_event *event, int flags)
+{
+	pai_stop(event, flags);
+}
+
+static void paiext_del(struct perf_event *event, int flags)
+{
+	pai_del(event, flags);
+}
+
+/* Called on schedule-in and schedule-out. No access to event structure,
+ * but for sampling only event NNPA_ALL is allowed.
+ */
+static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx,
+			      struct task_struct *task, bool sched_in)
+{
+	/* We started with a clean page on event installation. So read out
+	 * results on schedule_out and if page was dirty, save old values.
+	 */
+	if (!sched_in)
+		pai_have_samples(PAI_PMU_EXT);
+}
+
+/* Attribute definitions for paicrypt interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation. Use
+ * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instruction returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported, there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
The chosen range + * is 0x1000 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paicrypt_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paicrypt_events_group = { + .name = "events", + .attrs = NULL /* Filled in attr_event_init() */ +}; + +static struct attribute_group paicrypt_format_group = { + .name = "format", + .attrs = paicrypt_format_attr, +}; + +static const struct attribute_group *paicrypt_attr_groups[] = { + &paicrypt_events_group, + &paicrypt_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paicrypt = { + .task_ctx_nr = perf_hw_context, + .event_init = paicrypt_event_init, + .add = paicrypt_add, + .del = paicrypt_del, + .start = paicrypt_start, + .stop = paicrypt_stop, + .read = paicrypt_read, + .sched_task = paicrypt_sched_task, + .attr_groups = paicrypt_attr_groups +}; + +/* List of symbolic PAI counter names. */ +static const char * const paicrypt_ctrnames[] = { + [0] = "CRYPTO_ALL", + [1] = "KM_DEA", + [2] = "KM_TDEA_128", + [3] = "KM_TDEA_192", + [4] = "KM_ENCRYPTED_DEA", + [5] = "KM_ENCRYPTED_TDEA_128", + [6] = "KM_ENCRYPTED_TDEA_192", + [7] = "KM_AES_128", + [8] = "KM_AES_192", + [9] = "KM_AES_256", + [10] = "KM_ENCRYPTED_AES_128", + [11] = "KM_ENCRYPTED_AES_192", + [12] = "KM_ENCRYPTED_AES_256", + [13] = "KM_XTS_AES_128", + [14] = "KM_XTS_AES_256", + [15] = "KM_XTS_ENCRYPTED_AES_128", + [16] = "KM_XTS_ENCRYPTED_AES_256", + [17] = "KMC_DEA", + [18] = "KMC_TDEA_128", + [19] = "KMC_TDEA_192", + [20] = "KMC_ENCRYPTED_DEA", + [21] = "KMC_ENCRYPTED_TDEA_128", + [22] = "KMC_ENCRYPTED_TDEA_192", + [23] = "KMC_AES_128", + [24] = "KMC_AES_192", + [25] = "KMC_AES_256", + [26] = "KMC_ENCRYPTED_AES_128", + [27] = "KMC_ENCRYPTED_AES_192", + [28] = "KMC_ENCRYPTED_AES_256", + [29] = "KMC_PRNG", + [30] = "KMA_GCM_AES_128", + [31] = "KMA_GCM_AES_192", + [32] = "KMA_GCM_AES_256", + [33] = "KMA_GCM_ENCRYPTED_AES_128", + [34] = "KMA_GCM_ENCRYPTED_AES_192", + [35] = "KMA_GCM_ENCRYPTED_AES_256", + [36] = "KMF_DEA", + [37] = "KMF_TDEA_128", + [38] = "KMF_TDEA_192", + [39] = "KMF_ENCRYPTED_DEA", + [40] = "KMF_ENCRYPTED_TDEA_128", + [41] = "KMF_ENCRYPTED_TDEA_192", + [42] = "KMF_AES_128", + [43] = "KMF_AES_192", + [44] = "KMF_AES_256", + [45] = "KMF_ENCRYPTED_AES_128", + [46] = "KMF_ENCRYPTED_AES_192", + [47] = "KMF_ENCRYPTED_AES_256", + [48] = "KMCTR_DEA", + [49] = "KMCTR_TDEA_128", + [50] = "KMCTR_TDEA_192", + [51] = "KMCTR_ENCRYPTED_DEA", + [52] = "KMCTR_ENCRYPTED_TDEA_128", + [53] = "KMCTR_ENCRYPTED_TDEA_192", + [54] = "KMCTR_AES_128", + [55] = "KMCTR_AES_192", + [56] = "KMCTR_AES_256", + [57] = "KMCTR_ENCRYPTED_AES_128", + [58] = "KMCTR_ENCRYPTED_AES_192", + [59] = "KMCTR_ENCRYPTED_AES_256", + [60] = "KMO_DEA", + [61] = "KMO_TDEA_128", + [62] = "KMO_TDEA_192", + [63] = "KMO_ENCRYPTED_DEA", + [64] = "KMO_ENCRYPTED_TDEA_128", + [65] = "KMO_ENCRYPTED_TDEA_192", + [66] = "KMO_AES_128", + [67] = "KMO_AES_192", + [68] = "KMO_AES_256", + [69] = "KMO_ENCRYPTED_AES_128", + [70] = "KMO_ENCRYPTED_AES_192", + [71] = "KMO_ENCRYPTED_AES_256", + [72] = "KIMD_SHA_1", + [73] = "KIMD_SHA_256", + [74] = "KIMD_SHA_512", + [75] = "KIMD_SHA3_224", + [76] = "KIMD_SHA3_256", + [77] = "KIMD_SHA3_384", + 
[78] = "KIMD_SHA3_512", + [79] = "KIMD_SHAKE_128", + [80] = "KIMD_SHAKE_256", + [81] = "KIMD_GHASH", + [82] = "KLMD_SHA_1", + [83] = "KLMD_SHA_256", + [84] = "KLMD_SHA_512", + [85] = "KLMD_SHA3_224", + [86] = "KLMD_SHA3_256", + [87] = "KLMD_SHA3_384", + [88] = "KLMD_SHA3_512", + [89] = "KLMD_SHAKE_128", + [90] = "KLMD_SHAKE_256", + [91] = "KMAC_DEA", + [92] = "KMAC_TDEA_128", + [93] = "KMAC_TDEA_192", + [94] = "KMAC_ENCRYPTED_DEA", + [95] = "KMAC_ENCRYPTED_TDEA_128", + [96] = "KMAC_ENCRYPTED_TDEA_192", + [97] = "KMAC_AES_128", + [98] = "KMAC_AES_192", + [99] = "KMAC_AES_256", + [100] = "KMAC_ENCRYPTED_AES_128", + [101] = "KMAC_ENCRYPTED_AES_192", + [102] = "KMAC_ENCRYPTED_AES_256", + [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", + [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", + [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", + [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", + [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", + [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", + [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", + [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", + [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", + [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", + [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", + [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256", + [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", + [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", + [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", + [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", + [119] = "PCC_SCALAR_MULTIPLY_P256", + [120] = "PCC_SCALAR_MULTIPLY_P384", + [121] = "PCC_SCALAR_MULTIPLY_P521", + [122] = "PCC_SCALAR_MULTIPLY_ED25519", + [123] = "PCC_SCALAR_MULTIPLY_ED448", + [124] = "PCC_SCALAR_MULTIPLY_X25519", + [125] = "PCC_SCALAR_MULTIPLY_X448", + [126] = "PRNO_SHA_512_DRNG", + [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", + [128] = "PRNO_TRNG", + [129] = "KDSA_ECDSA_VERIFY_P256", + [130] = "KDSA_ECDSA_VERIFY_P384", + [131] = "KDSA_ECDSA_VERIFY_P521", + [132] = "KDSA_ECDSA_SIGN_P256", + [133] = "KDSA_ECDSA_SIGN_P384", + [134] = "KDSA_ECDSA_SIGN_P521", + [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", + [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", + [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", + [138] = "KDSA_EDDSA_VERIFY_ED25519", + [139] = "KDSA_EDDSA_VERIFY_ED448", + [140] = "KDSA_EDDSA_SIGN_ED25519", + [141] = "KDSA_EDDSA_SIGN_ED448", + [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", + [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", + [144] = "PCKMO_ENCRYPT_DEA_KEY", + [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", + [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", + [147] = "PCKMO_ENCRYPT_AES_128_KEY", + [148] = "PCKMO_ENCRYPT_AES_192_KEY", + [149] = "PCKMO_ENCRYPT_AES_256_KEY", + [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", + [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", + [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", + [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", + [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", + [155] = "IBM_RESERVED_155", + [156] = "IBM_RESERVED_156", + [157] = "KM_FULL_XTS_AES_128", + [158] = "KM_FULL_XTS_AES_256", + [159] = "KM_FULL_XTS_ENCRYPTED_AES_128", + [160] = "KM_FULL_XTS_ENCRYPTED_AES_256", + [161] = "KMAC_HMAC_SHA_224", + [162] = "KMAC_HMAC_SHA_256", + [163] = "KMAC_HMAC_SHA_384", + [164] = "KMAC_HMAC_SHA_512", + [165] = "KMAC_HMAC_ENCRYPTED_SHA_224", + [166] = "KMAC_HMAC_ENCRYPTED_SHA_256", + [167] = "KMAC_HMAC_ENCRYPTED_SHA_384", + [168] = "KMAC_HMAC_ENCRYPTED_SHA_512", + [169] = 
"PCKMO_ENCRYPT_HMAC_512_KEY", + [170] = "PCKMO_ENCRYPT_HMAC_1024_KEY", + [171] = "PCKMO_ENCRYPT_AES_XTS_128", + [172] = "PCKMO_ENCRYPT_AES_XTS_256", +}; + +static struct attribute *paiext_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paiext_events_group = { + .name = "events", + .attrs = NULL, /* Filled in attr_event_init() */ +}; + +static struct attribute_group paiext_format_group = { + .name = "format", + .attrs = paiext_format_attr, +}; + +static const struct attribute_group *paiext_attr_groups[] = { + &paiext_events_group, + &paiext_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paiext = { + .task_ctx_nr = perf_hw_context, + .event_init = paiext_event_init, + .add = paiext_add, + .del = paiext_del, + .start = paiext_start, + .stop = paiext_stop, + .read = paiext_read, + .sched_task = paiext_sched_task, + .attr_groups = paiext_attr_groups, +}; + +/* List of symbolic PAI extension 1 NNPA counter names. */ +static const char * const paiext_ctrnames[] = { + [0] = "NNPA_ALL", + [1] = "NNPA_ADD", + [2] = "NNPA_SUB", + [3] = "NNPA_MUL", + [4] = "NNPA_DIV", + [5] = "NNPA_MIN", + [6] = "NNPA_MAX", + [7] = "NNPA_LOG", + [8] = "NNPA_EXP", + [9] = "NNPA_IBM_RESERVED_9", + [10] = "NNPA_RELU", + [11] = "NNPA_TANH", + [12] = "NNPA_SIGMOID", + [13] = "NNPA_SOFTMAX", + [14] = "NNPA_BATCHNORM", + [15] = "NNPA_MAXPOOL2D", + [16] = "NNPA_AVGPOOL2D", + [17] = "NNPA_LSTMACT", + [18] = "NNPA_GRUACT", + [19] = "NNPA_CONVOLUTION", + [20] = "NNPA_MATMUL_OP", + [21] = "NNPA_MATMUL_OP_BCAST23", + [22] = "NNPA_SMALLBATCH", + [23] = "NNPA_LARGEDIM", + [24] = "NNPA_SMALLTENSOR", + [25] = "NNPA_1MFRAME", + [26] = "NNPA_2GFRAME", + [27] = "NNPA_ACCESSEXCEPT", + [28] = "NNPA_TRANSFORM", + [29] = "NNPA_GELU", + [30] = "NNPA_MOMENTS", + [31] = "NNPA_LAYERNORM", + [32] = "NNPA_MATMUL_OP_BCAST1", + [33] = "NNPA_SQRT", + [34] = "NNPA_INVSQRT", + [35] = "NNPA_NORM", + [36] = "NNPA_REDUCE", +}; + +static void __init attr_event_free(struct attribute **attrs) +{ + struct perf_pmu_events_attr *pa; + unsigned int i; + + for (i = 0; attrs[i]; i++) { + struct device_attribute *dap; + + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static struct attribute * __init attr_event_init_one(int num, + unsigned long base, + const char *name) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return NULL; + + sysfs_attr_init(&pa->attr.attr); + pa->id = base + num; + pa->attr.attr.name = name; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + return &pa->attr.attr; +} + +static struct attribute ** __init attr_event_init(struct pai_pmu *p) +{ + unsigned int min_attr = min_t(unsigned int, p->num_named, p->num_avail); + struct attribute **attrs; + unsigned int i; + + attrs = kmalloc_array(min_attr + 1, sizeof(*attrs), GFP_KERNEL | __GFP_ZERO); + if (!attrs) + goto out; + for (i = 0; i < min_attr; i++) { + attrs[i] = attr_event_init_one(i, p->base, p->names[i]); + if (!attrs[i]) { + attr_event_free(attrs); + attrs = NULL; + goto out; + } + } + attrs[i] = NULL; +out: + return attrs; +} + +static void __init pai_pmu_exit(struct pai_pmu *p) +{ + attr_event_free(p->event_group->attrs); + p->event_group->attrs = NULL; +} + +/* Add a PMU. Install its events and register the PMU device driver + * call back functions. 
+/* Add a PMU. Install its events and register the PMU device driver
+ * callback functions.
+ */
+static int __init pai_pmu_init(struct pai_pmu *p)
+{
+	int rc = -ENOMEM;
+
+	/* Export known PAI events */
+	p->event_group->attrs = attr_event_init(p);
+	if (!p->event_group->attrs) {
+		pr_err("Creation of PMU %s /sysfs failed\n", p->pmuname);
+		goto out;
+	}
+
+	rc = perf_pmu_register(p->pmu, p->pmuname, -1);
+	if (rc) {
+		pai_pmu_exit(p);
+		pr_err("Registering PMU %s failed with rc=%i\n", p->pmuname,
+		       rc);
+	}
+out:
+	return rc;
+}
+
+/* PAI PMU characteristics table */
+static struct pai_pmu pai_pmu[] __refdata = {
+	[PAI_PMU_CRYPTO] = {
+		.pmuname = "pai_crypto",
+		.facility_nr = 196,
+		.num_named = ARRAY_SIZE(paicrypt_ctrnames),
+		.names = paicrypt_ctrnames,
+		.base = PAI_CRYPTO_BASE,
+		.kernel_offset = PAI_CRYPTO_KERNEL_OFFSET,
+		.area_size = PAGE_SIZE,
+		.init = pai_pmu_init,
+		.exit = pai_pmu_exit,
+		.pmu = &paicrypt,
+		.event_group = &paicrypt_events_group
+	},
+	[PAI_PMU_EXT] = {
+		.pmuname = "pai_ext",
+		.facility_nr = 197,
+		.num_named = ARRAY_SIZE(paiext_ctrnames),
+		.names = paiext_ctrnames,
+		.base = PAI_NNPA_BASE,
+		.kernel_offset = 0,
+		.area_size = PAIE1_CTRBLOCK_SZ,
+		.init = pai_pmu_init,
+		.exit = pai_pmu_exit,
+		.pmu = &paiext,
+		.event_group = &paiext_events_group
+	}
+};
+
+/*
+ * Check if the PMU (via facility) is supported by machine. Try all of the
+ * supported PAI PMUs.
+ * Return number of successfully installed PMUs.
+ */
+static int __init paipmu_setup(void)
+{
+	struct qpaci_info_block ib;
+	int install_ok = 0, rc;
+	struct pai_pmu *p;
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(pai_pmu); ++i) {
+		p = &pai_pmu[i];
+
+		if (!test_facility(p->facility_nr))
+			continue;
+
+		qpaci(&ib);
+		switch (i) {
+		case PAI_PMU_CRYPTO:
+			p->num_avail = ib.num_cc;
+			if (p->num_avail >= PAI_CRYPTO_MAXCTR) {
+				pr_err("Too many PMU %s counters %d\n",
+				       p->pmuname, p->num_avail);
+				continue;
+			}
+			break;
+		case PAI_PMU_EXT:
+			p->num_avail = ib.num_nnpa;
+			break;
+		}
+		p->num_avail += 1;	/* Add xxx_ALL event */
+		if (p->init) {
+			rc = p->init(p);
+			if (!rc)
+				++install_ok;
+		}
+	}
+	return install_ok;
+}
+
+static int __init pai_init(void)
+{
+	/* Setup s390dbf facility */
+	paidbg = debug_register("pai", 32, 256, 128);
+	if (!paidbg) {
+		pr_err("Registration of s390dbf pai failed\n");
+		return -ENOMEM;
+	}
+	debug_register_view(paidbg, &debug_sprintf_view);
+
+	if (!paipmu_setup()) {
+		/* No PMU registration, no need for debug buffer */
+		debug_unregister_view(paidbg, &debug_sprintf_view);
+		debug_unregister(paidbg);
+		return -ENODEV;
+	}
+	return 0;
+}
+
+device_initcall(pai_init);
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
deleted file mode 100644
index 62bf8a15bf32..000000000000
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ /dev/null
@@ -1,843 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Performance event support - Processor Activity Instrumentation Facility
- *
- * Copyright IBM Corp.
2022 - * Author(s): Thomas Richter <tmricht@linux.ibm.com> - */ -#define KMSG_COMPONENT "pai_crypto" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/perf_event.h> -#include <asm/ctlreg.h> -#include <asm/pai.h> -#include <asm/debug.h> - -static debug_info_t *cfm_dbg; -static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */ - /* extracted with QPACI instruction */ - -DEFINE_STATIC_KEY_FALSE(pai_key); - -struct pai_userdata { - u16 num; - u64 value; -} __packed; - -struct paicrypt_map { - unsigned long *page; /* Page for CPU to store counters */ - struct pai_userdata *save; /* Page to store no-zero counters */ - unsigned int active_events; /* # of PAI crypto users */ - refcount_t refcnt; /* Reference count mapped buffers */ - struct perf_event *event; /* Perf event for sampling */ - struct list_head syswide_list; /* List system-wide sampling events */ -}; - -struct paicrypt_mapptr { - struct paicrypt_map *mapptr; -}; - -static struct paicrypt_root { /* Anchor to per CPU data */ - refcount_t refcnt; /* Overall active events */ - struct paicrypt_mapptr __percpu *mapptr; -} paicrypt_root; - -/* Free per CPU data when the last event is removed. */ -static void paicrypt_root_free(void) -{ - if (refcount_dec_and_test(&paicrypt_root.refcnt)) { - free_percpu(paicrypt_root.mapptr); - paicrypt_root.mapptr = NULL; - } - debug_sprintf_event(cfm_dbg, 5, "%s root.refcount %d\n", __func__, - refcount_read(&paicrypt_root.refcnt)); -} - -/* - * On initialization of first event also allocate per CPU data dynamically. - * Start with an array of pointers, the array size is the maximum number of - * CPUs possible, which might be larger than the number of CPUs currently - * online. - */ -static int paicrypt_root_alloc(void) -{ - if (!refcount_inc_not_zero(&paicrypt_root.refcnt)) { - /* The memory is already zeroed. */ - paicrypt_root.mapptr = alloc_percpu(struct paicrypt_mapptr); - if (!paicrypt_root.mapptr) - return -ENOMEM; - refcount_set(&paicrypt_root.refcnt, 1); - } - return 0; -} - -/* Release the PMU if event is the last perf event */ -static DEFINE_MUTEX(pai_reserve_mutex); - -/* Free all memory allocated for event counting/sampling setup */ -static void paicrypt_free(struct paicrypt_mapptr *mp) -{ - free_page((unsigned long)mp->mapptr->page); - kvfree(mp->mapptr->save); - kfree(mp->mapptr); - mp->mapptr = NULL; -} - -/* Adjust usage counters and remove allocated memory when all users are - * gone. 
- */ -static void paicrypt_event_destroy_cpu(struct perf_event *event, int cpu) -{ - struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); - struct paicrypt_map *cpump = mp->mapptr; - - mutex_lock(&pai_reserve_mutex); - debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d " - "refcnt %u\n", __func__, event->attr.config, - event->cpu, cpump->active_events, - refcount_read(&cpump->refcnt)); - if (refcount_dec_and_test(&cpump->refcnt)) - paicrypt_free(mp); - paicrypt_root_free(); - mutex_unlock(&pai_reserve_mutex); -} - -static void paicrypt_event_destroy(struct perf_event *event) -{ - int cpu; - - static_branch_dec(&pai_key); - free_page(PAI_SAVE_AREA(event)); - if (event->cpu == -1) { - struct cpumask *mask = PAI_CPU_MASK(event); - - for_each_cpu(cpu, mask) - paicrypt_event_destroy_cpu(event, cpu); - kfree(mask); - } else { - paicrypt_event_destroy_cpu(event, event->cpu); - } -} - -static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel) -{ - if (kernel) - nr += PAI_CRYPTO_MAXCTR; - return page[nr]; -} - -/* Read the counter values. Return value from location in CMP. For event - * CRYPTO_ALL sum up all events. - */ -static u64 paicrypt_getdata(struct perf_event *event, bool kernel) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - u64 sum = 0; - int i; - - if (event->attr.config != PAI_CRYPTO_BASE) { - return paicrypt_getctr(cpump->page, - event->attr.config - PAI_CRYPTO_BASE, - kernel); - } - - for (i = 1; i <= paicrypt_cnt; i++) { - u64 val = paicrypt_getctr(cpump->page, i, kernel); - - if (!val) - continue; - sum += val; - } - return sum; -} - -static u64 paicrypt_getall(struct perf_event *event) -{ - u64 sum = 0; - - if (!event->attr.exclude_kernel) - sum += paicrypt_getdata(event, true); - if (!event->attr.exclude_user) - sum += paicrypt_getdata(event, false); - - return sum; -} - -/* Check concurrent access of counting and sampling for crypto events. - * This function is called in process context and it is save to block. - * When the event initialization functions fails, no other call back will - * be invoked. - * - * Allocate the memory for the event. - */ -static int paicrypt_alloc_cpu(struct perf_event *event, int cpu) -{ - struct paicrypt_map *cpump = NULL; - struct paicrypt_mapptr *mp; - int rc; - - mutex_lock(&pai_reserve_mutex); - /* Allocate root node */ - rc = paicrypt_root_alloc(); - if (rc) - goto unlock; - - /* Allocate node for this event */ - mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); - cpump = mp->mapptr; - if (!cpump) { /* Paicrypt_map allocated? */ - rc = -ENOMEM; - cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); - if (!cpump) - goto undo; - /* Allocate memory for counter page and counter extraction. - * Only the first counting event has to allocate a page. - */ - mp->mapptr = cpump; - cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL); - cpump->save = kvmalloc_array(paicrypt_cnt + 1, - sizeof(struct pai_userdata), - GFP_KERNEL); - if (!cpump->page || !cpump->save) { - paicrypt_free(mp); - goto undo; - } - INIT_LIST_HEAD(&cpump->syswide_list); - refcount_set(&cpump->refcnt, 1); - rc = 0; - } else { - refcount_inc(&cpump->refcnt); - } - -undo: - if (rc) { - /* Error in allocation of event, decrement anchor. Since - * the event in not created, its destroy() function is never - * invoked. Adjust the reference counter for the anchor. 
- */ - paicrypt_root_free(); - } -unlock: - mutex_unlock(&pai_reserve_mutex); - return rc; -} - -static int paicrypt_alloc(struct perf_event *event) -{ - struct cpumask *maskptr; - int cpu, rc = -ENOMEM; - - maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); - if (!maskptr) - goto out; - - for_each_online_cpu(cpu) { - rc = paicrypt_alloc_cpu(event, cpu); - if (rc) { - for_each_cpu(cpu, maskptr) - paicrypt_event_destroy_cpu(event, cpu); - kfree(maskptr); - goto out; - } - cpumask_set_cpu(cpu, maskptr); - } - - /* - * On error all cpumask are freed and all events have been destroyed. - * Save of which CPUs data structures have been allocated for. - * Release them in paicrypt_event_destroy call back function - * for this event. - */ - PAI_CPU_MASK(event) = maskptr; - rc = 0; -out: - return rc; -} - -/* Might be called on different CPU than the one the event is intended for. */ -static int paicrypt_event_init(struct perf_event *event) -{ - struct perf_event_attr *a = &event->attr; - int rc = 0; - - /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ - if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) - return -ENOENT; - /* PAI crypto event must be in valid range, try others if not */ - if (a->config < PAI_CRYPTO_BASE || - a->config > PAI_CRYPTO_BASE + paicrypt_cnt) - return -ENOENT; - /* Allow only CRYPTO_ALL for sampling */ - if (a->sample_period && a->config != PAI_CRYPTO_BASE) - return -EINVAL; - /* Get a page to store last counter values for sampling */ - if (a->sample_period) { - PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL); - if (!PAI_SAVE_AREA(event)) { - rc = -ENOMEM; - goto out; - } - } - - if (event->cpu >= 0) - rc = paicrypt_alloc_cpu(event, event->cpu); - else - rc = paicrypt_alloc(event); - if (rc) { - free_page(PAI_SAVE_AREA(event)); - goto out; - } - event->destroy = paicrypt_event_destroy; - - if (a->sample_period) { - a->sample_period = 1; - a->freq = 0; - /* Register for paicrypt_sched_task() to be called */ - event->attach_state |= PERF_ATTACH_SCHED_CB; - /* Add raw data which contain the memory mapped counters */ - a->sample_type |= PERF_SAMPLE_RAW; - /* Turn off inheritance */ - a->inherit = 0; - } - - static_branch_inc(&pai_key); -out: - return rc; -} - -static void paicrypt_read(struct perf_event *event) -{ - u64 prev, new, delta; - - prev = local64_read(&event->hw.prev_count); - new = paicrypt_getall(event); - local64_set(&event->hw.prev_count, new); - delta = (prev <= new) ? 
new - prev - : (-1ULL - prev) + new + 1; /* overflow */ - local64_add(delta, &event->count); -} - -static void paicrypt_start(struct perf_event *event, int flags) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - u64 sum; - - if (!event->attr.sample_period) { /* Counting */ - sum = paicrypt_getall(event); /* Get current value */ - local64_set(&event->hw.prev_count, sum); - } else { /* Sampling */ - memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE); - /* Enable context switch callback for system-wide sampling */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); - perf_sched_cb_inc(event->pmu); - } else { - cpump->event = event; - } - } -} - -static int paicrypt_add(struct perf_event *event, int flags) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - unsigned long ccd; - - if (++cpump->active_events == 1) { - ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; - WRITE_ONCE(get_lowcore()->ccd, ccd); - local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); - } - if (flags & PERF_EF_START) - paicrypt_start(event, PERF_EF_RELOAD); - event->hw.state = 0; - return 0; -} - -static void paicrypt_have_sample(struct perf_event *, struct paicrypt_map *); -static void paicrypt_stop(struct perf_event *event, int flags) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - - if (!event->attr.sample_period) { /* Counting */ - paicrypt_read(event); - } else { /* Sampling */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - perf_sched_cb_dec(event->pmu); - list_del(PAI_SWLIST(event)); - } else { - paicrypt_have_sample(event, cpump); - cpump->event = NULL; - } - } - event->hw.state = PERF_HES_STOPPED; -} - -static void paicrypt_del(struct perf_event *event, int flags) -{ - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - - paicrypt_stop(event, PERF_EF_UPDATE); - if (--cpump->active_events == 0) { - local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); - WRITE_ONCE(get_lowcore()->ccd, 0); - } -} - -/* Create raw data and save it in buffer. Calculate the delta for each - * counter between this invocation and the last invocation. - * Returns number of bytes copied. 
- * Saves only entries with positive counter difference of the form
- * 2 bytes: Number of counter
- * 8 bytes: Value of counter
- */
-static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page,
-			    unsigned long *page_old, bool exclude_user,
-			    bool exclude_kernel)
-{
-	int i, outidx = 0;
-
-	for (i = 1; i <= paicrypt_cnt; i++) {
-		u64 val = 0, val_old = 0;
-
-		if (!exclude_kernel) {
-			val += paicrypt_getctr(page, i, true);
-			val_old += paicrypt_getctr(page_old, i, true);
-		}
-		if (!exclude_user) {
-			val += paicrypt_getctr(page, i, false);
-			val_old += paicrypt_getctr(page_old, i, false);
-		}
-		if (val >= val_old)
-			val -= val_old;
-		else
-			val = (~0ULL - val_old) + val + 1;
-		if (val) {
-			userdata[outidx].num = i;
-			userdata[outidx].value = val;
-			outidx++;
-		}
-	}
-	return outidx * sizeof(struct pai_userdata);
-}
-
-static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
-				struct perf_event *event)
-{
-	struct perf_sample_data data;
-	struct perf_raw_record raw;
-	struct pt_regs regs;
-	int overflow;
-
-	/* Setup perf sample */
-	memset(&regs, 0, sizeof(regs));
-	memset(&raw, 0, sizeof(raw));
-	memset(&data, 0, sizeof(data));
-	perf_sample_data_init(&data, 0, event->hw.last_period);
-	if (event->attr.sample_type & PERF_SAMPLE_TID) {
-		data.tid_entry.pid = task_tgid_nr(current);
-		data.tid_entry.tid = task_pid_nr(current);
-	}
-	if (event->attr.sample_type & PERF_SAMPLE_TIME)
-		data.time = event->clock();
-	if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
-		data.id = event->id;
-	if (event->attr.sample_type & PERF_SAMPLE_CPU) {
-		data.cpu_entry.cpu = smp_processor_id();
-		data.cpu_entry.reserved = 0;
-	}
-	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		raw.frag.size = rawsize;
-		raw.frag.data = cpump->save;
-		perf_sample_save_raw_data(&data, event, &raw);
-	}
-
-	overflow = perf_event_overflow(event, &data, &regs);
-	perf_event_update_userpage(event);
-	/* Save crypto counter lowcore page after reading event data. */
-	memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE);
-	return overflow;
-}
-
-/* Check if there is data to be saved on schedule out of a task. */
-static void paicrypt_have_sample(struct perf_event *event,
-				 struct paicrypt_map *cpump)
-{
-	size_t rawsize;
-
-	if (!event)		/* No event active */
-		return;
-	rawsize = paicrypt_copy(cpump->save, cpump->page,
-				(unsigned long *)PAI_SAVE_AREA(event),
-				event->attr.exclude_user,
-				event->attr.exclude_kernel);
-	if (rawsize)		/* No incremented counters */
-		paicrypt_push_sample(rawsize, cpump, event);
-}
-
-/* Check if there is data to be saved on schedule out of a task. */
-static void paicrypt_have_samples(void)
-{
-	struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
-	struct paicrypt_map *cpump = mp->mapptr;
-	struct perf_event *event;
-
-	list_for_each_entry(event, &cpump->syswide_list, hw.tp_list)
-		paicrypt_have_sample(event, cpump);
-}
-
-/* Called on schedule-in and schedule-out. No access to event structure,
- * but for sampling only event CRYPTO_ALL is allowed.
- */
-static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx,
-				struct task_struct *task, bool sched_in)
-{
-	/* We started with a clean page on event installation. So read out
-	 * results on schedule_out and if page was dirty, save old values.
-	 */
-	if (!sched_in)
-		paicrypt_have_samples();
-}
-
-/* Attribute definitions for paicrypt interface. As with other CPU
- * Measurement Facilities, there is one attribute per mapped counter.
- * The number of mapped counters may vary per machine generation. Use - * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction - * to determine the number of mapped counters. The instructions returns - * a positive number, which is the highest number of supported counters. - * All counters less than this number are also supported, there are no - * holes. A returned number of zero means no support for mapped counters. - * - * The identification of the counter is a unique number. The chosen range - * is 0x1000 + offset in mapped kernel page. - * All CPU Measurement Facility counters identifiers must be unique and - * the numbers from 0 to 496 are already used for the CPU Measurement - * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already - * used for the CPU Measurement Sampling facility. - */ -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *paicrypt_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group paicrypt_events_group = { - .name = "events", - .attrs = NULL /* Filled in attr_event_init() */ -}; - -static struct attribute_group paicrypt_format_group = { - .name = "format", - .attrs = paicrypt_format_attr, -}; - -static const struct attribute_group *paicrypt_attr_groups[] = { - &paicrypt_events_group, - &paicrypt_format_group, - NULL, -}; - -/* Performance monitoring unit for mapped counters */ -static struct pmu paicrypt = { - .task_ctx_nr = perf_hw_context, - .event_init = paicrypt_event_init, - .add = paicrypt_add, - .del = paicrypt_del, - .start = paicrypt_start, - .stop = paicrypt_stop, - .read = paicrypt_read, - .sched_task = paicrypt_sched_task, - .attr_groups = paicrypt_attr_groups -}; - -/* List of symbolic PAI counter names. */ -static const char * const paicrypt_ctrnames[] = { - [0] = "CRYPTO_ALL", - [1] = "KM_DEA", - [2] = "KM_TDEA_128", - [3] = "KM_TDEA_192", - [4] = "KM_ENCRYPTED_DEA", - [5] = "KM_ENCRYPTED_TDEA_128", - [6] = "KM_ENCRYPTED_TDEA_192", - [7] = "KM_AES_128", - [8] = "KM_AES_192", - [9] = "KM_AES_256", - [10] = "KM_ENCRYPTED_AES_128", - [11] = "KM_ENCRYPTED_AES_192", - [12] = "KM_ENCRYPTED_AES_256", - [13] = "KM_XTS_AES_128", - [14] = "KM_XTS_AES_256", - [15] = "KM_XTS_ENCRYPTED_AES_128", - [16] = "KM_XTS_ENCRYPTED_AES_256", - [17] = "KMC_DEA", - [18] = "KMC_TDEA_128", - [19] = "KMC_TDEA_192", - [20] = "KMC_ENCRYPTED_DEA", - [21] = "KMC_ENCRYPTED_TDEA_128", - [22] = "KMC_ENCRYPTED_TDEA_192", - [23] = "KMC_AES_128", - [24] = "KMC_AES_192", - [25] = "KMC_AES_256", - [26] = "KMC_ENCRYPTED_AES_128", - [27] = "KMC_ENCRYPTED_AES_192", - [28] = "KMC_ENCRYPTED_AES_256", - [29] = "KMC_PRNG", - [30] = "KMA_GCM_AES_128", - [31] = "KMA_GCM_AES_192", - [32] = "KMA_GCM_AES_256", - [33] = "KMA_GCM_ENCRYPTED_AES_128", - [34] = "KMA_GCM_ENCRYPTED_AES_192", - [35] = "KMA_GCM_ENCRYPTED_AES_256", - [36] = "KMF_DEA", - [37] = "KMF_TDEA_128", - [38] = "KMF_TDEA_192", - [39] = "KMF_ENCRYPTED_DEA", - [40] = "KMF_ENCRYPTED_TDEA_128", - [41] = "KMF_ENCRYPTED_TDEA_192", - [42] = "KMF_AES_128", - [43] = "KMF_AES_192", - [44] = "KMF_AES_256", - [45] = "KMF_ENCRYPTED_AES_128", - [46] = "KMF_ENCRYPTED_AES_192", - [47] = "KMF_ENCRYPTED_AES_256", - [48] = "KMCTR_DEA", - [49] = "KMCTR_TDEA_128", - [50] = "KMCTR_TDEA_192", - [51] = "KMCTR_ENCRYPTED_DEA", - [52] = "KMCTR_ENCRYPTED_TDEA_128", - [53] = "KMCTR_ENCRYPTED_TDEA_192", - [54] = "KMCTR_AES_128", - [55] = "KMCTR_AES_192", - [56] = "KMCTR_AES_256", - [57] = "KMCTR_ENCRYPTED_AES_128", - [58] = "KMCTR_ENCRYPTED_AES_192", - [59] = "KMCTR_ENCRYPTED_AES_256", 
- [60] = "KMO_DEA", - [61] = "KMO_TDEA_128", - [62] = "KMO_TDEA_192", - [63] = "KMO_ENCRYPTED_DEA", - [64] = "KMO_ENCRYPTED_TDEA_128", - [65] = "KMO_ENCRYPTED_TDEA_192", - [66] = "KMO_AES_128", - [67] = "KMO_AES_192", - [68] = "KMO_AES_256", - [69] = "KMO_ENCRYPTED_AES_128", - [70] = "KMO_ENCRYPTED_AES_192", - [71] = "KMO_ENCRYPTED_AES_256", - [72] = "KIMD_SHA_1", - [73] = "KIMD_SHA_256", - [74] = "KIMD_SHA_512", - [75] = "KIMD_SHA3_224", - [76] = "KIMD_SHA3_256", - [77] = "KIMD_SHA3_384", - [78] = "KIMD_SHA3_512", - [79] = "KIMD_SHAKE_128", - [80] = "KIMD_SHAKE_256", - [81] = "KIMD_GHASH", - [82] = "KLMD_SHA_1", - [83] = "KLMD_SHA_256", - [84] = "KLMD_SHA_512", - [85] = "KLMD_SHA3_224", - [86] = "KLMD_SHA3_256", - [87] = "KLMD_SHA3_384", - [88] = "KLMD_SHA3_512", - [89] = "KLMD_SHAKE_128", - [90] = "KLMD_SHAKE_256", - [91] = "KMAC_DEA", - [92] = "KMAC_TDEA_128", - [93] = "KMAC_TDEA_192", - [94] = "KMAC_ENCRYPTED_DEA", - [95] = "KMAC_ENCRYPTED_TDEA_128", - [96] = "KMAC_ENCRYPTED_TDEA_192", - [97] = "KMAC_AES_128", - [98] = "KMAC_AES_192", - [99] = "KMAC_AES_256", - [100] = "KMAC_ENCRYPTED_AES_128", - [101] = "KMAC_ENCRYPTED_AES_192", - [102] = "KMAC_ENCRYPTED_AES_256", - [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", - [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", - [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", - [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", - [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", - [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", - [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", - [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", - [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", - [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", - [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", - [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256", - [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", - [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", - [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", - [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", - [119] = "PCC_SCALAR_MULTIPLY_P256", - [120] = "PCC_SCALAR_MULTIPLY_P384", - [121] = "PCC_SCALAR_MULTIPLY_P521", - [122] = "PCC_SCALAR_MULTIPLY_ED25519", - [123] = "PCC_SCALAR_MULTIPLY_ED448", - [124] = "PCC_SCALAR_MULTIPLY_X25519", - [125] = "PCC_SCALAR_MULTIPLY_X448", - [126] = "PRNO_SHA_512_DRNG", - [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", - [128] = "PRNO_TRNG", - [129] = "KDSA_ECDSA_VERIFY_P256", - [130] = "KDSA_ECDSA_VERIFY_P384", - [131] = "KDSA_ECDSA_VERIFY_P521", - [132] = "KDSA_ECDSA_SIGN_P256", - [133] = "KDSA_ECDSA_SIGN_P384", - [134] = "KDSA_ECDSA_SIGN_P521", - [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", - [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", - [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", - [138] = "KDSA_EDDSA_VERIFY_ED25519", - [139] = "KDSA_EDDSA_VERIFY_ED448", - [140] = "KDSA_EDDSA_SIGN_ED25519", - [141] = "KDSA_EDDSA_SIGN_ED448", - [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", - [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", - [144] = "PCKMO_ENCRYPT_DEA_KEY", - [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", - [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", - [147] = "PCKMO_ENCRYPT_AES_128_KEY", - [148] = "PCKMO_ENCRYPT_AES_192_KEY", - [149] = "PCKMO_ENCRYPT_AES_256_KEY", - [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", - [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", - [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", - [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", - [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", - [155] = 
"IBM_RESERVED_155", - [156] = "IBM_RESERVED_156", - [157] = "KM_FULL_XTS_AES_128", - [158] = "KM_FULL_XTS_AES_256", - [159] = "KM_FULL_XTS_ENCRYPTED_AES_128", - [160] = "KM_FULL_XTS_ENCRYPTED_AES_256", - [161] = "KMAC_HMAC_SHA_224", - [162] = "KMAC_HMAC_SHA_256", - [163] = "KMAC_HMAC_SHA_384", - [164] = "KMAC_HMAC_SHA_512", - [165] = "KMAC_HMAC_ENCRYPTED_SHA_224", - [166] = "KMAC_HMAC_ENCRYPTED_SHA_256", - [167] = "KMAC_HMAC_ENCRYPTED_SHA_384", - [168] = "KMAC_HMAC_ENCRYPTED_SHA_512", - [169] = "PCKMO_ENCRYPT_HMAC_512_KEY", - [170] = "PCKMO_ENCRYPT_HMAC_1024_KEY", - [171] = "PCKMO_ENCRYPT_AES_XTS_128", - [172] = "PCKMO_ENCRYPT_AES_XTS_256", -}; - -static void __init attr_event_free(struct attribute **attrs, int num) -{ - struct perf_pmu_events_attr *pa; - int i; - - for (i = 0; i < num; i++) { - struct device_attribute *dap; - - dap = container_of(attrs[i], struct device_attribute, attr); - pa = container_of(dap, struct perf_pmu_events_attr, attr); - kfree(pa); - } - kfree(attrs); -} - -static int __init attr_event_init_one(struct attribute **attrs, int num) -{ - struct perf_pmu_events_attr *pa; - - /* Index larger than array_size, no counter name available */ - if (num >= ARRAY_SIZE(paicrypt_ctrnames)) { - attrs[num] = NULL; - return 0; - } - - pa = kzalloc(sizeof(*pa), GFP_KERNEL); - if (!pa) - return -ENOMEM; - - sysfs_attr_init(&pa->attr.attr); - pa->id = PAI_CRYPTO_BASE + num; - pa->attr.attr.name = paicrypt_ctrnames[num]; - pa->attr.attr.mode = 0444; - pa->attr.show = cpumf_events_sysfs_show; - pa->attr.store = NULL; - attrs[num] = &pa->attr.attr; - return 0; -} - -/* Create PMU sysfs event attributes on the fly. */ -static int __init attr_event_init(void) -{ - struct attribute **attrs; - int ret, i; - - attrs = kmalloc_array(paicrypt_cnt + 2, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - for (i = 0; i <= paicrypt_cnt; i++) { - ret = attr_event_init_one(attrs, i); - if (ret) { - attr_event_free(attrs, i); - return ret; - } - } - attrs[i] = NULL; - paicrypt_events_group.attrs = attrs; - return 0; -} - -static int __init paicrypt_init(void) -{ - struct qpaci_info_block ib; - int rc; - - if (!test_facility(196)) - return 0; - - qpaci(&ib); - paicrypt_cnt = ib.num_cc; - if (paicrypt_cnt == 0) - return 0; - if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) { - pr_err("Too many PMU pai_crypto counters %d\n", paicrypt_cnt); - return -E2BIG; - } - - rc = attr_event_init(); /* Export known PAI crypto events */ - if (rc) { - pr_err("Creation of PMU pai_crypto /sysfs failed\n"); - return rc; - } - - /* Setup s390dbf facility */ - cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); - if (!cfm_dbg) { - pr_err("Registration of s390dbf pai_crypto failed\n"); - return -ENOMEM; - } - debug_register_view(cfm_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&paicrypt, "pai_crypto", -1); - if (rc) { - pr_err("Registering the pai_crypto PMU failed with rc=%i\n", - rc); - debug_unregister_view(cfm_dbg, &debug_sprintf_view); - debug_unregister(cfm_dbg); - return rc; - } - return 0; -} - -device_initcall(paicrypt_init); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c deleted file mode 100644 index 7b32935273ce..000000000000 --- a/arch/s390/kernel/perf_pai_ext.c +++ /dev/null @@ -1,756 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Performance event support - Processor Activity Instrumentation Extension - * Facility - * - * Copyright IBM Corp. 
2022 - * Author(s): Thomas Richter <tmricht@linux.ibm.com> - */ -#define KMSG_COMPONENT "pai_ext" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/perf_event.h> -#include <asm/ctlreg.h> -#include <asm/pai.h> -#include <asm/debug.h> - -#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */ -#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */ - -static debug_info_t *paiext_dbg; -static unsigned int paiext_cnt; /* Extracted with QPACI instruction */ - -struct pai_userdata { - u16 num; - u64 value; -} __packed; - -/* Create the PAI extension 1 control block area. - * The PAI extension control block 1 is pointed to by lowcore - * address 0x1508 for each CPU. This control block is 512 bytes in size - * and requires a 512 byte boundary alignment. - */ -struct paiext_cb { /* PAI extension 1 control block */ - u64 header; /* Not used */ - u64 reserved1; - u64 acc; /* Addr to analytics counter control block */ - u8 reserved2[488]; -} __packed; - -struct paiext_map { - unsigned long *area; /* Area for CPU to store counters */ - struct pai_userdata *save; /* Area to store non-zero counters */ - unsigned int active_events; /* # of PAI Extension users */ - refcount_t refcnt; - struct perf_event *event; /* Perf event for sampling */ - struct paiext_cb *paiext_cb; /* PAI extension control block area */ - struct list_head syswide_list; /* List system-wide sampling events */ -}; - -struct paiext_mapptr { - struct paiext_map *mapptr; -}; - -static struct paiext_root { /* Anchor to per CPU data */ - refcount_t refcnt; /* Overall active events */ - struct paiext_mapptr __percpu *mapptr; -} paiext_root; - -/* Free per CPU data when the last event is removed. */ -static void paiext_root_free(void) -{ - if (refcount_dec_and_test(&paiext_root.refcnt)) { - free_percpu(paiext_root.mapptr); - paiext_root.mapptr = NULL; - } - debug_sprintf_event(paiext_dbg, 5, "%s root.refcount %d\n", __func__, - refcount_read(&paiext_root.refcnt)); -} - -/* On initialization of first event also allocate per CPU data dynamically. - * Start with an array of pointers, the array size is the maximum number of - * CPUs possible, which might be larger than the number of CPUs currently - * online. - */ -static int paiext_root_alloc(void) -{ - if (!refcount_inc_not_zero(&paiext_root.refcnt)) { - /* The memory is already zeroed. */ - paiext_root.mapptr = alloc_percpu(struct paiext_mapptr); - if (!paiext_root.mapptr) { - /* Returning without refcnt adjustment is ok. The - * error code is handled by paiext_alloc() which - * decrements refcnt when an event can not be - * created. - */ - return -ENOMEM; - } - refcount_set(&paiext_root.refcnt, 1); - } - return 0; -} - -/* Protects against concurrent increment of sampler and counter member - * increments at the same time and prohibits concurrent execution of - * counting and sampling events. - * Ensures that analytics counter block is deallocated only when the - * sampling and counting on that cpu is zero. - * For details see paiext_alloc(). 
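
[The comment above describes paiext_reserve_mutex, defined just below. The idiom it guards, first user allocates the shared data, last user frees it, can be sketched in plain C. The pthread-based analogue here is illustrative only: root_get()/root_put() are made-up names standing in for paiext_root_alloc()/paiext_root_free(), and calloc() stands in for alloc_percpu().]

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t reserve_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned int refcnt;     /* protected by reserve_mutex */
static void *shared_area;       /* allocated on first use */

static int root_get(void)       /* cf. paiext_root_alloc() */
{
        int rc = 0;

        pthread_mutex_lock(&reserve_mutex);
        if (refcnt == 0) {
                shared_area = calloc(1, 4096); /* stands in for alloc_percpu() */
                if (!shared_area) {
                        rc = -1;        /* refcnt stays 0, nothing to undo */
                        goto out;
                }
        }
        refcnt++;
out:
        pthread_mutex_unlock(&reserve_mutex);
        return rc;
}

static void root_put(void)      /* cf. paiext_root_free() */
{
        pthread_mutex_lock(&reserve_mutex);
        if (--refcnt == 0) {    /* last reference gone, release memory */
                free(shared_area);
                shared_area = NULL;
        }
        pthread_mutex_unlock(&reserve_mutex);
}

int main(void)
{
        if (root_get())
                return 1;
        root_get();     /* second user: no reallocation */
        root_put();
        root_put();     /* last user: area freed */
        return 0;
}
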
- */ -static DEFINE_MUTEX(paiext_reserve_mutex); - -/* Free all memory allocated for event counting/sampling setup */ -static void paiext_free(struct paiext_mapptr *mp) -{ - kfree(mp->mapptr->area); - kfree(mp->mapptr->paiext_cb); - kvfree(mp->mapptr->save); - kfree(mp->mapptr); - mp->mapptr = NULL; -} - -/* Release the PMU if event is the last perf event */ -static void paiext_event_destroy_cpu(struct perf_event *event, int cpu) -{ - struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, cpu); - struct paiext_map *cpump = mp->mapptr; - - mutex_lock(&paiext_reserve_mutex); - if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */ - paiext_free(mp); - paiext_root_free(); - mutex_unlock(&paiext_reserve_mutex); -} - -static void paiext_event_destroy(struct perf_event *event) -{ - int cpu; - - free_page(PAI_SAVE_AREA(event)); - if (event->cpu == -1) { - struct cpumask *mask = PAI_CPU_MASK(event); - - for_each_cpu(cpu, mask) - paiext_event_destroy_cpu(event, cpu); - kfree(mask); - } else { - paiext_event_destroy_cpu(event, event->cpu); - } - debug_sprintf_event(paiext_dbg, 4, "%s cpu %d\n", __func__, - event->cpu); -} - -/* Used to avoid races in checking concurrent access of counting and - * sampling for pai_extension events. - * - * Only one instance of event pai_ext/NNPA_ALL/ for sampling is - * allowed and when this event is running, no counting event is allowed. - * Several counting events are allowed in parallel, but no sampling event - * is allowed while one (or more) counting events are running. - * - * This function is called in process context and it is safe to block. - * When the event initialization functions fails, no other call back will - * be invoked. - * - * Allocate the memory for the event. - */ -static int paiext_alloc_cpu(struct perf_event *event, int cpu) -{ - struct paiext_mapptr *mp; - struct paiext_map *cpump; - int rc; - - mutex_lock(&paiext_reserve_mutex); - rc = paiext_root_alloc(); - if (rc) - goto unlock; - - mp = per_cpu_ptr(paiext_root.mapptr, cpu); - cpump = mp->mapptr; - if (!cpump) { /* Paiext_map allocated? */ - rc = -ENOMEM; - cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); - if (!cpump) - goto undo; - - /* Allocate memory for counter area and counter extraction. - * These are - * - a 512 byte block and requires 512 byte boundary alignment. - * - a 1KB byte block and requires 1KB boundary alignment. - * Only the first counting event has to allocate the area. - * - * Note: This works with commit 59bb47985c1d by default. - * Backporting this to kernels without this commit might - * need adjustment. - */ - mp->mapptr = cpump; - cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL); - cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL); - cpump->save = kvmalloc_array(paiext_cnt + 1, - sizeof(struct pai_userdata), - GFP_KERNEL); - if (!cpump->save || !cpump->area || !cpump->paiext_cb) { - paiext_free(mp); - goto undo; - } - INIT_LIST_HEAD(&cpump->syswide_list); - refcount_set(&cpump->refcnt, 1); - rc = 0; - } else { - refcount_inc(&cpump->refcnt); - } - -undo: - if (rc) { - /* Error in allocation of event, decrement anchor. Since - * the event in not created, its destroy() function is never - * invoked. Adjust the reference counter for the anchor. - */ - paiext_root_free(); - } -unlock: - mutex_unlock(&paiext_reserve_mutex); - /* If rc is non-zero, no increment of counter/sampler was done. 
*/ - return rc; -} - -static int paiext_alloc(struct perf_event *event) -{ - struct cpumask *maskptr; - int cpu, rc = -ENOMEM; - - maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); - if (!maskptr) - goto out; - - for_each_online_cpu(cpu) { - rc = paiext_alloc_cpu(event, cpu); - if (rc) { - for_each_cpu(cpu, maskptr) - paiext_event_destroy_cpu(event, cpu); - kfree(maskptr); - goto out; - } - cpumask_set_cpu(cpu, maskptr); - } - - /* - * On error all cpumask are freed and all events have been destroyed. - * Save of which CPUs data structures have been allocated for. - * Release them in paicrypt_event_destroy call back function - * for this event. - */ - PAI_CPU_MASK(event) = maskptr; - rc = 0; -out: - return rc; -} - -/* The PAI extension 1 control block supports up to 128 entries. Return - * the index within PAIE1_CB given the event number. Also validate event - * number. - */ -static int paiext_event_valid(struct perf_event *event) -{ - u64 cfg = event->attr.config; - - if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) { - /* Offset NNPA in paiext_cb */ - event->hw.config_base = offsetof(struct paiext_cb, acc); - return 0; - } - return -ENOENT; -} - -/* Might be called on different CPU than the one the event is intended for. */ -static int paiext_event_init(struct perf_event *event) -{ - struct perf_event_attr *a = &event->attr; - int rc; - - /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */ - if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) - return -ENOENT; - /* PAI extension event must be valid and in supported range */ - rc = paiext_event_valid(event); - if (rc) - return rc; - /* Allow only event NNPA_ALL for sampling. */ - if (a->sample_period && a->config != PAI_NNPA_BASE) - return -EINVAL; - /* Prohibit exclude_user event selection */ - if (a->exclude_user) - return -EINVAL; - /* Get a page to store last counter values for sampling */ - if (a->sample_period) { - PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL); - if (!PAI_SAVE_AREA(event)) - return -ENOMEM; - } - - if (event->cpu >= 0) - rc = paiext_alloc_cpu(event, event->cpu); - else - rc = paiext_alloc(event); - if (rc) { - free_page(PAI_SAVE_AREA(event)); - return rc; - } - event->destroy = paiext_event_destroy; - - if (a->sample_period) { - a->sample_period = 1; - a->freq = 0; - /* Register for paicrypt_sched_task() to be called */ - event->attach_state |= PERF_ATTACH_SCHED_CB; - /* Add raw data which are the memory mapped counters */ - a->sample_type |= PERF_SAMPLE_RAW; - /* Turn off inheritance */ - a->inherit = 0; - } - - return 0; -} - -static u64 paiext_getctr(unsigned long *area, int nr) -{ - return area[nr]; -} - -/* Read the counter values. Return value from location in buffer. For event - * NNPA_ALL sum up all events. 
- */ -static u64 paiext_getdata(struct perf_event *event) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - u64 sum = 0; - int i; - - if (event->attr.config != PAI_NNPA_BASE) - return paiext_getctr(cpump->area, - event->attr.config - PAI_NNPA_BASE); - - for (i = 1; i <= paiext_cnt; i++) - sum += paiext_getctr(cpump->area, i); - - return sum; -} - -static u64 paiext_getall(struct perf_event *event) -{ - return paiext_getdata(event); -} - -static void paiext_read(struct perf_event *event) -{ - u64 prev, new, delta; - - prev = local64_read(&event->hw.prev_count); - new = paiext_getall(event); - local64_set(&event->hw.prev_count, new); - delta = new - prev; - local64_add(delta, &event->count); -} - -static void paiext_start(struct perf_event *event, int flags) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - u64 sum; - - if (!event->attr.sample_period) { /* Counting */ - sum = paiext_getall(event); /* Get current value */ - local64_set(&event->hw.prev_count, sum); - } else { /* Sampling */ - memcpy((void *)PAI_SAVE_AREA(event), cpump->area, - PAIE1_CTRBLOCK_SZ); - /* Enable context switch callback for system-wide sampling */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); - perf_sched_cb_inc(event->pmu); - } else { - cpump->event = event; - } - } -} - -static int paiext_add(struct perf_event *event, int flags) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct paiext_cb *pcb = cpump->paiext_cb; - - if (++cpump->active_events == 1) { - get_lowcore()->aicd = virt_to_phys(cpump->paiext_cb); - pcb->acc = virt_to_phys(cpump->area) | 0x1; - /* Enable CPU instruction lookup for PAIE1 control block */ - local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT); - } - if (flags & PERF_EF_START) - paiext_start(event, PERF_EF_RELOAD); - event->hw.state = 0; - return 0; -} - -static void paiext_have_sample(struct perf_event *, struct paiext_map *); -static void paiext_stop(struct perf_event *event, int flags) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - - if (!event->attr.sample_period) { /* Counting */ - paiext_read(event); - } else { /* Sampling */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - list_del(PAI_SWLIST(event)); - perf_sched_cb_dec(event->pmu); - } else { - paiext_have_sample(event, cpump); - cpump->event = NULL; - } - } - event->hw.state = PERF_HES_STOPPED; -} - -static void paiext_del(struct perf_event *event, int flags) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct paiext_cb *pcb = cpump->paiext_cb; - - paiext_stop(event, PERF_EF_UPDATE); - if (--cpump->active_events == 0) { - /* Disable CPU instruction lookup for PAIE1 control block */ - local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT); - pcb->acc = 0; - get_lowcore()->aicd = 0; - } -} - -/* Create raw data and save it in buffer. Returns number of bytes copied. 
- * Saves only positive counter entries of the form - * 2 bytes: Number of counter - * 8 bytes: Value of counter - */ -static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area, - unsigned long *area_old) -{ - int i, outidx = 0; - - for (i = 1; i <= paiext_cnt; i++) { - u64 val = paiext_getctr(area, i); - u64 val_old = paiext_getctr(area_old, i); - - if (val >= val_old) - val -= val_old; - else - val = (~0ULL - val_old) + val + 1; - if (val) { - userdata[outidx].num = i; - userdata[outidx].value = val; - outidx++; - } - } - return outidx * sizeof(*userdata); -} - -/* Write sample when one or more counters values are nonzero. - * - * Note: The function paiext_sched_task() and paiext_push_sample() are not - * invoked after function paiext_del() has been called because of function - * perf_sched_cb_dec(). - * The function paiext_sched_task() and paiext_push_sample() are only - * called when sampling is active. Function perf_sched_cb_inc() - * has been invoked to install function paiext_sched_task() as call back - * to run at context switch time (see paiext_add()). - * - * This causes function perf_event_context_sched_out() and - * perf_event_context_sched_in() to check whether the PMU has installed an - * sched_task() callback. That callback is not active after paiext_del() - * returns and has deleted the event on that CPU. - */ -static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump, - struct perf_event *event) -{ - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - int overflow; - - /* Setup perf sample */ - memset(®s, 0, sizeof(regs)); - memset(&raw, 0, sizeof(raw)); - memset(&data, 0, sizeof(data)); - perf_sample_data_init(&data, 0, event->hw.last_period); - if (event->attr.sample_type & PERF_SAMPLE_TID) { - data.tid_entry.pid = task_tgid_nr(current); - data.tid_entry.tid = task_pid_nr(current); - } - if (event->attr.sample_type & PERF_SAMPLE_TIME) - data.time = event->clock(); - if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) - data.id = event->id; - if (event->attr.sample_type & PERF_SAMPLE_CPU) - data.cpu_entry.cpu = smp_processor_id(); - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.frag.size = rawsize; - raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, event, &raw); - } - - overflow = perf_event_overflow(event, &data, ®s); - perf_event_update_userpage(event); - /* Save NNPA lowcore area after read in event */ - memcpy((void *)PAI_SAVE_AREA(event), cpump->area, - PAIE1_CTRBLOCK_SZ); - return overflow; -} - -/* Check if there is data to be saved on schedule out of a task. */ -static void paiext_have_sample(struct perf_event *event, - struct paiext_map *cpump) -{ - size_t rawsize; - - if (!event) - return; - rawsize = paiext_copy(cpump->save, cpump->area, - (unsigned long *)PAI_SAVE_AREA(event)); - if (rawsize) /* Incremented counters */ - paiext_push_sample(rawsize, cpump, event); -} - -/* Check if there is data to be saved on schedule out of a task. */ -static void paiext_have_samples(void) -{ - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct perf_event *event; - - list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) - paiext_have_sample(event, cpump); -} - -/* Called on schedule-in and schedule-out. No access to event structure, - * but for sampling only event NNPA_ALL is allowed. 
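
[The wrap-safe delta in paiext_copy() above is worth a second look: for unsigned 64-bit counters, the explicit (~0ULL - old) + new + 1 branch computes exactly new - old modulo 2^64, which is the same value that plain unsigned subtraction yields (as paiext_read() relies on). Only nonzero deltas are then emitted as (num, value) pairs. A standalone check, with ctr_delta() as a hypothetical helper mirroring that logic:]

#include <assert.h>
#include <stdint.h>

static uint64_t ctr_delta(uint64_t new, uint64_t old)
{
        if (new >= old)
                return new - old;
        /* counter wrapped around 2^64, same formula as paiext_copy() */
        return (~0ULL - old) + new + 1;
}

int main(void)
{
        assert(ctr_delta(10, 3) == 7);
        /* old near the top of the range, new already wrapped */
        assert(ctr_delta(2, UINT64_MAX - 4) == 7);
        /* identical to plain modular unsigned subtraction */
        assert(ctr_delta(2, UINT64_MAX - 4) == 2 - (UINT64_MAX - 4));
        return 0;
}
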
- */ -static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, - struct task_struct *task, bool sched_in) -{ - /* We started with a clean page on event installation. So read out - * results on schedule_out and if page was dirty, save old values. - */ - if (!sched_in) - paiext_have_samples(); -} - -/* Attribute definitions for pai extension1 interface. As with other CPU - * Measurement Facilities, there is one attribute per mapped counter. - * The number of mapped counters may vary per machine generation. Use - * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction - * to determine the number of mapped counters. The instructions returns - * a positive number, which is the highest number of supported counters. - * All counters less than this number are also supported, there are no - * holes. A returned number of zero means no support for mapped counters. - * - * The identification of the counter is a unique number. The chosen range - * is 0x1800 + offset in mapped kernel page. - * All CPU Measurement Facility counters identifiers must be unique and - * the numbers from 0 to 496 are already used for the CPU Measurement - * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography - * counters. - * Numbers 0xb0000, 0xbc000 and 0xbd000 are already - * used for the CPU Measurement Sampling facility. - */ -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *paiext_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group paiext_events_group = { - .name = "events", - .attrs = NULL, /* Filled in attr_event_init() */ -}; - -static struct attribute_group paiext_format_group = { - .name = "format", - .attrs = paiext_format_attr, -}; - -static const struct attribute_group *paiext_attr_groups[] = { - &paiext_events_group, - &paiext_format_group, - NULL, -}; - -/* Performance monitoring unit for mapped counters */ -static struct pmu paiext = { - .task_ctx_nr = perf_hw_context, - .event_init = paiext_event_init, - .add = paiext_add, - .del = paiext_del, - .start = paiext_start, - .stop = paiext_stop, - .read = paiext_read, - .sched_task = paiext_sched_task, - .attr_groups = paiext_attr_groups, -}; - -/* List of symbolic PAI extension 1 NNPA counter names. 
*/ -static const char * const paiext_ctrnames[] = { - [0] = "NNPA_ALL", - [1] = "NNPA_ADD", - [2] = "NNPA_SUB", - [3] = "NNPA_MUL", - [4] = "NNPA_DIV", - [5] = "NNPA_MIN", - [6] = "NNPA_MAX", - [7] = "NNPA_LOG", - [8] = "NNPA_EXP", - [9] = "NNPA_IBM_RESERVED_9", - [10] = "NNPA_RELU", - [11] = "NNPA_TANH", - [12] = "NNPA_SIGMOID", - [13] = "NNPA_SOFTMAX", - [14] = "NNPA_BATCHNORM", - [15] = "NNPA_MAXPOOL2D", - [16] = "NNPA_AVGPOOL2D", - [17] = "NNPA_LSTMACT", - [18] = "NNPA_GRUACT", - [19] = "NNPA_CONVOLUTION", - [20] = "NNPA_MATMUL_OP", - [21] = "NNPA_MATMUL_OP_BCAST23", - [22] = "NNPA_SMALLBATCH", - [23] = "NNPA_LARGEDIM", - [24] = "NNPA_SMALLTENSOR", - [25] = "NNPA_1MFRAME", - [26] = "NNPA_2GFRAME", - [27] = "NNPA_ACCESSEXCEPT", - [28] = "NNPA_TRANSFORM", - [29] = "NNPA_GELU", - [30] = "NNPA_MOMENTS", - [31] = "NNPA_LAYERNORM", - [32] = "NNPA_MATMUL_OP_BCAST1", - [33] = "NNPA_SQRT", - [34] = "NNPA_INVSQRT", - [35] = "NNPA_NORM", - [36] = "NNPA_REDUCE", -}; - -static void __init attr_event_free(struct attribute **attrs, int num) -{ - struct perf_pmu_events_attr *pa; - struct device_attribute *dap; - int i; - - for (i = 0; i < num; i++) { - dap = container_of(attrs[i], struct device_attribute, attr); - pa = container_of(dap, struct perf_pmu_events_attr, attr); - kfree(pa); - } - kfree(attrs); -} - -static int __init attr_event_init_one(struct attribute **attrs, int num) -{ - struct perf_pmu_events_attr *pa; - - /* Index larger than array_size, no counter name available */ - if (num >= ARRAY_SIZE(paiext_ctrnames)) { - attrs[num] = NULL; - return 0; - } - - pa = kzalloc(sizeof(*pa), GFP_KERNEL); - if (!pa) - return -ENOMEM; - - sysfs_attr_init(&pa->attr.attr); - pa->id = PAI_NNPA_BASE + num; - pa->attr.attr.name = paiext_ctrnames[num]; - pa->attr.attr.mode = 0444; - pa->attr.show = cpumf_events_sysfs_show; - pa->attr.store = NULL; - attrs[num] = &pa->attr.attr; - return 0; -} - -/* Create PMU sysfs event attributes on the fly. 
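
[The event attributes built below are what let tooling such as "perf stat -e pai_ext/NNPA_ALL/" resolve a name to a raw config value. For orientation, a hedged userspace sketch of opening the same counter directly via perf_event_open(2); the 0x1800 NNPA base comes from the comment above, and the sysfs path assumes the standard perf PMU layout under /sys/bus/event_source/devices/. Everything else is ordinary perf boilerplate, not code from this patch.]

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        unsigned int type;
        uint64_t count;
        FILE *f;
        int fd;

        /* dynamic PMU type, assigned at perf_pmu_register() time */
        f = fopen("/sys/bus/event_source/devices/pai_ext/type", "r");
        if (!f || fscanf(f, "%u", &type) != 1)
                return 1;
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0x1800;   /* NNPA_ALL = PAI_NNPA_BASE + 0 */
        /* leave exclude_user clear: paiext_event_init() rejects it */

        fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0)
                return 1;
        /* ... run NNPA work here ... */
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("NNPA_ALL: %llu\n", (unsigned long long)count);
        close(fd);
        return 0;
}
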
*/ -static int __init attr_event_init(void) -{ - struct attribute **attrs; - int ret, i; - - attrs = kmalloc_array(paiext_cnt + 2, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - for (i = 0; i <= paiext_cnt; i++) { - ret = attr_event_init_one(attrs, i); - if (ret) { - attr_event_free(attrs, i); - return ret; - } - } - attrs[i] = NULL; - paiext_events_group.attrs = attrs; - return 0; -} - -static int __init paiext_init(void) -{ - struct qpaci_info_block ib; - int rc = -ENOMEM; - - if (!test_facility(197)) - return 0; - - qpaci(&ib); - paiext_cnt = ib.num_nnpa; - if (paiext_cnt >= PAI_NNPA_MAXCTR) - paiext_cnt = PAI_NNPA_MAXCTR; - if (!paiext_cnt) - return 0; - - rc = attr_event_init(); - if (rc) { - pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n"); - return rc; - } - - /* Setup s390dbf facility */ - paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); - if (!paiext_dbg) { - pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n"); - rc = -ENOMEM; - goto out_init; - } - debug_register_view(paiext_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1); - if (rc) { - pr_err("Registration of " KMSG_COMPONENT " PMU failed with " - "rc=%i\n", rc); - goto out_pmu; - } - - return 0; - -out_pmu: - debug_unregister_view(paiext_dbg, &debug_sprintf_view); - debug_unregister(paiext_dbg); -out_init: - attr_event_free(paiext_events_group.attrs, - ARRAY_SIZE(paiext_ctrnames) + 1); - return rc; -} - -device_initcall(paiext_init); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index a6b058ee4a36..7b305f1456f8 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -44,9 +44,6 @@ int perf_reg_validate(u64 mask) u64 perf_reg_abi(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_31BIT)) - return PERF_SAMPLE_REGS_ABI_32; - return PERF_SAMPLE_REGS_ABI_64; } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index b107dbca4ed7..0df95dcb2101 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -24,7 +24,6 @@ #include <linux/tick.h> #include <linux/personality.h> #include <linux/syscalls.h> -#include <linux/compat.h> #include <linux/kprobes.h> #include <linux/random.h> #include <linux/init_task.h> @@ -166,12 +165,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Set a new TLS ? 
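
[In the copy_thread() hunk just below, only the 64-bit TLS handling survives the compat removal: the clone TLS argument is split across the two 32-bit halves of the access-register pair acrs[0]/acrs[1]. A trivial standalone check of that split; the values are arbitrary.]

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t tls = 0x0123456789abcdefULL;
        uint32_t acrs0 = (uint32_t)(tls >> 32);  /* high half -> acrs[0] */
        uint32_t acrs1 = (uint32_t)tls;          /* low half  -> acrs[1] */

        assert(acrs0 == 0x01234567u);
        assert(acrs1 == 0x89abcdefu);
        /* the pair reassembles to the original 64-bit TLS pointer */
        assert(((uint64_t)acrs0 << 32 | acrs1) == tls);
        return 0;
}
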
*/ if (clone_flags & CLONE_SETTLS) { - if (is_compat_task()) { - p->thread.acrs[0] = (unsigned int)tls; - } else { - p->thread.acrs[0] = (unsigned int)(tls >> 32); - p->thread.acrs[1] = (unsigned int)tls; - } + p->thread.acrs[0] = (unsigned int)(tls >> 32); + p->thread.acrs[1] = (unsigned int)tls; } /* * s390 stores the svc return address in arch_data when calling diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 11f70c1e2797..e33a3eccda56 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -4,8 +4,7 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt #include <linux/stop_machine.h> #include <linux/cpufeature.h> diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 494216c4b4f3..ceaa1726e328 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -22,7 +22,6 @@ #include <linux/elf.h> #include <linux/regset.h> #include <linux/seccomp.h> -#include <linux/compat.h> #include <trace/syscall.h> #include <asm/guarded_storage.h> #include <asm/access-regs.h> @@ -38,10 +37,6 @@ #include "entry.h" -#ifdef CONFIG_COMPAT -#include "compat_ptrace.h" -#endif - void update_cr_regs(struct task_struct *task) { struct pt_regs *regs = task_pt_regs(task); @@ -507,308 +502,6 @@ long arch_ptrace(struct task_struct *child, long request, } } -#ifdef CONFIG_COMPAT -/* - * Now the fun part starts... a 31 bit program running in the - * 31 bit emulation tracing another program. PTRACE_PEEKTEXT, - * PTRACE_PEEKDATA, PTRACE_POKETEXT and PTRACE_POKEDATA are easy - * to handle, the difference to the 64 bit versions of the requests - * is that the access is done in multiples of 4 byte instead of - * 8 bytes (sizeof(unsigned long) on 31/64 bit). - * The ugly part are PTRACE_PEEKUSR, PTRACE_PEEKUSR_AREA, - * PTRACE_POKEUSR and PTRACE_POKEUSR_AREA. If the traced program - * is a 31 bit program too, the content of struct user can be - * emulated. A 31 bit program peeking into the struct user of - * a 64 bit program is a no-no. - */ - -/* - * Same as peek_user_per but for a 31 bit program. - */ -static inline __u32 __peek_user_per_compat(struct task_struct *child, - addr_t addr) -{ - if (addr == offsetof(struct compat_per_struct_kernel, cr9)) - /* Control bits of the active per set. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == offsetof(struct compat_per_struct_kernel, cr10)) - /* Start address of the active per set. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - 0 : child->thread.per_user.start; - else if (addr == offsetof(struct compat_per_struct_kernel, cr11)) - /* End address of the active per set. */ - return test_thread_flag(TIF_SINGLE_STEP) ? - PSW32_ADDR_INSN : child->thread.per_user.end; - else if (addr == offsetof(struct compat_per_struct_kernel, bits)) - /* Single-step bit. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - 0x80000000 : 0; - else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) - /* Start address of the user specified per set. */ - return (__u32) child->thread.per_user.start; - else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) - /* End address of the user specified per set. 
*/ - return (__u32) child->thread.per_user.end; - else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid)) - /* PER code, ATMID and AI of the last PER trap */ - return (__u32) child->thread.per_event.cause << 16; - else if (addr == offsetof(struct compat_per_struct_kernel, address)) - /* Address of the last PER trap */ - return (__u32) child->thread.per_event.address; - else if (addr == offsetof(struct compat_per_struct_kernel, access_id)) - /* Access id of the last PER trap */ - return (__u32) child->thread.per_event.paid << 24; - return 0; -} - -/* - * Same as peek_user but for a 31 bit program. - */ -static u32 __peek_user_compat(struct task_struct *child, addr_t addr) -{ - addr_t offset; - __u32 tmp; - - if (addr < offsetof(struct compat_user, regs.acrs)) { - struct pt_regs *regs = task_pt_regs(child); - /* - * psw and gprs are stored on the stack - */ - if (addr == offsetof(struct compat_user, regs.psw.mask)) { - /* Fake a 31 bit psw mask. */ - tmp = (__u32)(regs->psw.mask >> 32); - tmp &= PSW32_MASK_USER | PSW32_MASK_RI; - tmp |= PSW32_USER_BITS; - } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { - /* Fake a 31 bit psw address. */ - tmp = (__u32) regs->psw.addr | - (__u32)(regs->psw.mask & PSW_MASK_BA); - } else { - /* gpr 0-15 */ - tmp = *(__u32 *)((addr_t) ®s->psw + addr*2 + 4); - } - } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { - /* - * access registers are stored in the thread structure - */ - offset = addr - offsetof(struct compat_user, regs.acrs); - tmp = *(__u32*)((addr_t) &child->thread.acrs + offset); - - } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { - /* - * orig_gpr2 is stored on the kernel stack - */ - tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); - - } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { - /* - * prevent reads of padding hole between - * orig_gpr2 and fp_regs on s390. - */ - tmp = 0; - - } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { - /* - * floating point control reg. is in the thread structure - */ - tmp = child->thread.ufpu.fpc; - - } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { - /* - * floating point regs. are in the child->thread.ufpu.vxrs array - */ - offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); - tmp = *(__u32 *)((addr_t)child->thread.ufpu.vxrs + 2 * offset); - } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { - /* - * Handle access to the per_info structure. - */ - addr -= offsetof(struct compat_user, regs.per_info); - tmp = __peek_user_per_compat(child, addr); - - } else - tmp = 0; - - return tmp; -} - -static int peek_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - __u32 tmp; - - if (!is_compat_task() || (addr & 3) || addr > sizeof(struct user) - 3) - return -EIO; - - tmp = __peek_user_compat(child, addr); - return put_user(tmp, (__u32 __user *) data); -} - -/* - * Same as poke_user_per but for a 31 bit program. - */ -static inline void __poke_user_per_compat(struct task_struct *child, - addr_t addr, __u32 data) -{ - if (addr == offsetof(struct compat_per_struct_kernel, cr9)) - /* PER event mask of the user specified per set. */ - child->thread.per_user.control = - data & (PER_EVENT_MASK | PER_CONTROL_MASK); - else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) - /* Starting address of the user specified per set. 
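
[The 31-bit PSW faking in the removed __peek_user_compat() above, and its inverse in __poke_user_compat() below, hinge on the addressing-mode bit living in bit 0 of the 31-bit address word. A portable sketch of that round trip; the mask values assume PSW32_ADDR_INSN = 0x7fffffff and PSW32_ADDR_AMODE = 0x80000000 as defined in the s390 headers.]

#include <assert.h>
#include <stdint.h>

#define PSW32_ADDR_INSN  0x7fffffffu    /* 31-bit instruction address */
#define PSW32_ADDR_AMODE 0x80000000u    /* addressing-mode bit */

int main(void)
{
        /* 31-bit view: amode bit set, instruction address 0x1000 */
        uint32_t psw32_addr = PSW32_ADDR_AMODE | 0x1000u;

        /* poke direction: 64-bit address, amode bit moves to the mask */
        uint64_t addr64 = psw32_addr & PSW32_ADDR_INSN;
        uint32_t ba_bit = psw32_addr & PSW32_ADDR_AMODE; /* -> PSW_MASK_BA */

        assert(addr64 == 0x1000);
        assert(ba_bit);

        /* peek direction: fold the amode bit back into the address word */
        uint32_t back = (uint32_t)addr64 | ba_bit;
        assert(back == psw32_addr);
        return 0;
}
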
*/ - child->thread.per_user.start = data; - else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) - /* Ending address of the user specified per set. */ - child->thread.per_user.end = data; -} - -/* - * Same as poke_user but for a 31 bit program. - */ -static int __poke_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - __u32 tmp = (__u32) data; - addr_t offset; - - if (addr < offsetof(struct compat_user, regs.acrs)) { - struct pt_regs *regs = task_pt_regs(child); - /* - * psw, gprs, acrs and orig_gpr2 are stored on the stack - */ - if (addr == offsetof(struct compat_user, regs.psw.mask)) { - __u32 mask = PSW32_MASK_USER; - - mask |= is_ri_task(child) ? PSW32_MASK_RI : 0; - /* Build a 64 bit psw mask from 31 bit mask. */ - if ((tmp ^ PSW32_USER_BITS) & ~mask) - /* Invalid psw mask. */ - return -EINVAL; - if ((data & PSW32_MASK_ASC) == PSW32_ASC_HOME) - /* Invalid address-space-control bits */ - return -EINVAL; - regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) | - (regs->psw.mask & PSW_MASK_BA) | - (__u64)(tmp & mask) << 32; - } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { - /* Build a 64 bit psw address from 31 bit address. */ - regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN; - /* Transfer 31 bit amode bit to psw mask. */ - regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | - (__u64)(tmp & PSW32_ADDR_AMODE); - } else { - if (test_pt_regs_flag(regs, PIF_SYSCALL) && - addr == offsetof(struct compat_user, regs.gprs[2])) { - struct pt_regs *regs = task_pt_regs(child); - - regs->int_code = 0x20000 | (data & 0xffff); - } - /* gpr 0-15 */ - *(__u32*)((addr_t) ®s->psw + addr*2 + 4) = tmp; - } - } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { - /* - * access registers are stored in the thread structure - */ - offset = addr - offsetof(struct compat_user, regs.acrs); - *(__u32*)((addr_t) &child->thread.acrs + offset) = tmp; - - } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { - /* - * orig_gpr2 is stored on the kernel stack - */ - *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; - - } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { - /* - * prevent writess of padding hole between - * orig_gpr2 and fp_regs on s390. - */ - return 0; - - } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { - /* - * floating point control reg. is in the thread structure - */ - child->thread.ufpu.fpc = data; - - } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { - /* - * floating point regs. are in the child->thread.ufpu.vxrs array - */ - offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); - *(__u32 *)((addr_t)child->thread.ufpu.vxrs + 2 * offset) = tmp; - } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { - /* - * Handle access to the per_info structure. 
- */ - addr -= offsetof(struct compat_user, regs.per_info); - __poke_user_per_compat(child, addr, data); - } - - return 0; -} - -static int poke_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - if (!is_compat_task() || (addr & 3) || - addr > sizeof(struct compat_user) - 3) - return -EIO; - - return __poke_user_compat(child, addr, data); -} - -long compat_arch_ptrace(struct task_struct *child, compat_long_t request, - compat_ulong_t caddr, compat_ulong_t cdata) -{ - unsigned long addr = caddr; - unsigned long data = cdata; - compat_ptrace_area parea; - int copied, ret; - - switch (request) { - case PTRACE_PEEKUSR: - /* read the word at location addr in the USER area. */ - return peek_user_compat(child, addr, data); - - case PTRACE_POKEUSR: - /* write the word at location addr in the USER area */ - return poke_user_compat(child, addr, data); - - case PTRACE_PEEKUSR_AREA: - case PTRACE_POKEUSR_AREA: - if (copy_from_user(&parea, (void __force __user *) addr, - sizeof(parea))) - return -EFAULT; - addr = parea.kernel_addr; - data = parea.process_addr; - copied = 0; - while (copied < parea.len) { - if (request == PTRACE_PEEKUSR_AREA) - ret = peek_user_compat(child, addr, data); - else { - __u32 utmp; - if (get_user(utmp, - (__u32 __force __user *) data)) - return -EFAULT; - ret = poke_user_compat(child, addr, utmp); - } - if (ret) - return ret; - addr += sizeof(unsigned int); - data += sizeof(unsigned int); - copied += sizeof(unsigned int); - } - return 0; - case PTRACE_GET_LAST_BREAK: - return put_user(child->thread.last_break, (unsigned int __user *)data); - } - return compat_ptrace_request(child, request, addr, data); -} -#endif - /* * user_regset definitions. */ @@ -1297,225 +990,8 @@ static const struct user_regset_view user_s390_view = { .n = ARRAY_SIZE(s390_regsets) }; -#ifdef CONFIG_COMPAT -static int s390_compat_regs_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - unsigned n; - - if (target == current) - save_access_regs(target->thread.acrs); - - for (n = 0; n < sizeof(s390_compat_regs); n += sizeof(compat_ulong_t)) - membuf_store(&to, __peek_user_compat(target, n)); - return 0; -} - -static int s390_compat_regs_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int rc = 0; - - if (target == current) - save_access_regs(target->thread.acrs); - - if (kbuf) { - const compat_ulong_t *k = kbuf; - while (count > 0 && !rc) { - rc = __poke_user_compat(target, pos, *k++); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - const compat_ulong_t __user *u = ubuf; - while (count > 0 && !rc) { - compat_ulong_t word; - rc = __get_user(word, u++); - if (rc) - break; - rc = __poke_user_compat(target, pos, word); - count -= sizeof(*u); - pos += sizeof(*u); - } - } - - if (rc == 0 && target == current) - restore_access_regs(target->thread.acrs); - - return rc; -} - -static int s390_compat_regs_high_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - compat_ulong_t *gprs_high; - int i; - - gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs; - for (i = 0; i < NUM_GPRS; i++, gprs_high += 2) - membuf_store(&to, *gprs_high); - return 0; -} - -static int s390_compat_regs_high_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - compat_ulong_t *gprs_high; - int rc = 0; - - gprs_high 
= (compat_ulong_t *) - &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)]; - if (kbuf) { - const compat_ulong_t *k = kbuf; - while (count > 0) { - *gprs_high = *k++; - *gprs_high += 2; - count -= sizeof(*k); - } - } else { - const compat_ulong_t __user *u = ubuf; - while (count > 0 && !rc) { - unsigned long word; - rc = __get_user(word, u++); - if (rc) - break; - *gprs_high = word; - *gprs_high += 2; - count -= sizeof(*u); - } - } - - return rc; -} - -static int s390_compat_last_break_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - compat_ulong_t last_break = target->thread.last_break; - - return membuf_store(&to, (unsigned long)last_break); -} - -static int s390_compat_last_break_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - return 0; -} - -static const struct user_regset s390_compat_regsets[] = { - { - USER_REGSET_NOTE_TYPE(PRSTATUS), - .n = sizeof(s390_compat_regs) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_compat_regs_get, - .set = s390_compat_regs_set, - }, - { - USER_REGSET_NOTE_TYPE(PRFPREG), - .n = sizeof(s390_fp_regs) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_fpregs_get, - .set = s390_fpregs_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_SYSTEM_CALL), - .n = 1, - .size = sizeof(compat_uint_t), - .align = sizeof(compat_uint_t), - .regset_get = s390_system_call_get, - .set = s390_system_call_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_LAST_BREAK), - .n = 1, - .size = sizeof(long), - .align = sizeof(long), - .regset_get = s390_compat_last_break_get, - .set = s390_compat_last_break_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_TDB), - .n = 1, - .size = 256, - .align = 1, - .regset_get = s390_tdb_get, - .set = s390_tdb_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_VXRS_LOW), - .n = __NUM_VXRS_LOW, - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_vxrs_low_get, - .set = s390_vxrs_low_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_VXRS_HIGH), - .n = __NUM_VXRS_HIGH, - .size = sizeof(__vector128), - .align = sizeof(__vector128), - .regset_get = s390_vxrs_high_get, - .set = s390_vxrs_high_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_HIGH_GPRS), - .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_compat_regs_high_get, - .set = s390_compat_regs_high_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_GS_CB), - .n = sizeof(struct gs_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_gs_cb_get, - .set = s390_gs_cb_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_GS_BC), - .n = sizeof(struct gs_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_gs_bc_get, - .set = s390_gs_bc_set, - }, - { - USER_REGSET_NOTE_TYPE(S390_RI_CB), - .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_runtime_instr_get, - .set = s390_runtime_instr_set, - }, -}; - -static const struct user_regset_view user_s390_compat_view = { - .name = "s390", - .e_machine = EM_S390, - .regsets = s390_compat_regsets, - .n = ARRAY_SIZE(s390_compat_regsets) -}; -#endif - const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT - if 
(test_tsk_thread_flag(task, TIF_31BIT)) - return &user_s390_compat_view; -#endif return &user_s390_view; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 892fce2b7549..c1fe0b53c5ac 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -13,8 +13,7 @@ * This file handles the architecture-dependent parts of initialization */ -#define KMSG_COMPONENT "setup" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "setup: " fmt #include <linux/errno.h> #include <linux/export.h> @@ -47,7 +46,6 @@ #include <linux/kexec.h> #include <linux/crash_dump.h> #include <linux/memory.h> -#include <linux/compat.h> #include <linux/start_kernel.h> #include <linux/hugetlb.h> #include <linux/kmemleak.h> @@ -112,7 +110,7 @@ struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amod * Because the AMODE31 sections are relocated below 2G at startup, * the content of control registers CR2, CR5 and CR15 must be updated * with new addresses after the relocation. The initial initialization of - * control registers occurs in head64.S and then gets updated again after AMODE31 + * control registers occurs in head.S and then gets updated again after AMODE31 * relocation. We must access the relevant AMODE31 tables indirectly via * pointers placed in the .amode31.refs linker section. Those pointers get * updated automatically during AMODE31 relocation and always contain a valid diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index e48013cd832c..4874de5edea0 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -27,7 +27,6 @@ #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/syscalls.h> -#include <linux/compat.h> #include <asm/ucontext.h> #include <linux/uaccess.h> #include <asm/vdso-symbols.h> @@ -290,12 +289,6 @@ static int setup_frame(int sig, struct k_sigaction *ka, unsigned long restorer; size_t frame_size; - /* - * gprs_high are only present for a 31-bit task running on - * a 64-bit kernel (see compat_signal.c) but the space for - * gprs_high need to be allocated if vector registers are - * included in the signal frame on a 31-bit system. - */ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); if (cpu_has_vx()) frame_size += sizeof(frame->sregs_ext); @@ -333,7 +326,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; else - restorer = VDSO64_SYMBOL(current, sigreturn); + restorer = VDSO_SYMBOL(current, sigreturn); /* Set up registers for signal handler */ regs->gprs[14] = restorer; @@ -367,12 +360,6 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, size_t frame_size; frame_size = sizeof(struct rt_sigframe) - sizeof(_sigregs_ext); - /* - * gprs_high are only present for a 31-bit task running on - * a 64-bit kernel (see compat_signal.c) but the space for - * gprs_high need to be allocated if vector registers are - * included in the signal frame on a 31-bit system. 
- */ uc_flags = 0; if (cpu_has_vx()) { frame_size += sizeof(_sigregs_ext); @@ -391,7 +378,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ksig->ka.sa.sa_restorer; else - restorer = VDSO64_SYMBOL(current, rt_sigreturn); + restorer = VDSO_SYMBOL(current, rt_sigreturn); /* Create siginfo on the signal stack */ if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -490,10 +477,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs) clear_pt_regs_flag(regs, PIF_SYSCALL); rseq_signal_deliver(&ksig, regs); - if (is_compat_task()) - handle_signal32(&ksig, oldset, regs); - else - handle_signal(&ksig, oldset, regs); + handle_signal(&ksig, oldset, regs); return; } @@ -506,10 +490,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs) /* Restart with sys_restart_syscall */ regs->gprs[2] = regs->orig_gpr2; current->restart_block.arch_data = regs->psw.addr; - if (is_compat_task()) - regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); - else - regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + regs->psw.addr = VDSO_SYMBOL(current, restart_syscall); if (test_thread_flag(TIF_SINGLE_STEP)) clear_thread_flag(TIF_PER_TRAP); break; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index da84c0dc6b7e..b7429f30afc1 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -15,8 +15,7 @@ * operates on physical cpu numbers needs to go into smp.c. */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt #include <linux/cpufeature.h> #include <linux/workqueue.h> @@ -281,6 +280,9 @@ static void pcpu_attach_task(int cpu, struct task_struct *tsk) lc->hardirq_timer = tsk->thread.hardirq_timer; lc->softirq_timer = tsk->thread.softirq_timer; lc->steal_timer = 0; +#ifdef CONFIG_STACKPROTECTOR + lc->stack_canary = tsk->stack_canary; +#endif } static void pcpu_start_fn(int cpu, void (*func)(void *), void *data) @@ -305,9 +307,9 @@ static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) func(data); /* should not return */ } -static void pcpu_delegate(struct pcpu *pcpu, int cpu, - pcpu_delegate_fn *func, - void *data, unsigned long stack) +static void __noreturn pcpu_delegate(struct pcpu *pcpu, int cpu, + pcpu_delegate_fn *func, + void *data, unsigned long stack) { struct lowcore *lc, *abs_lc; unsigned int source_cpu; @@ -370,7 +372,7 @@ static int pcpu_set_smt(unsigned int mtid) /* * Call function on the ipl CPU. 
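
[pcpu_delegate() and smp_call_ipl_cpu() below gain __noreturn annotations: they never come back, because the delegated function takes over the CPU. As a reminder of what the attribute buys, a tiny standalone illustration; take_over() and pick() are made-up names, not kernel code.]

#include <stdlib.h>

static void __attribute__((noreturn)) take_over(void)
{
        exit(0);        /* control never returns to the caller */
}

static int pick(int ok)
{
        if (!ok)
                take_over();
        return 42;      /* the compiler knows the !ok path cannot fall
                         * through, so no "missing return" warning and
                         * no dead code after the call */
}

int main(void)
{
        return pick(1) == 42 ? 0 : 1;
}
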
*/ -void smp_call_ipl_cpu(void (*func)(void *), void *data) +void __noreturn smp_call_ipl_cpu(void (*func)(void *), void *data) { struct lowcore *lc = lowcore_ptr[0]; @@ -697,6 +699,7 @@ static void __ref smp_get_core_info(struct sclp_core_info *info, int early) continue; info->core[info->configured].core_id = address >> smp_cpu_mt_shift; + info->core[info->configured].type = boot_core_type; info->configured++; } info->combined = info->configured; diff --git a/arch/s390/kernel/stackprotector.c b/arch/s390/kernel/stackprotector.c new file mode 100644 index 000000000000..d4e40483f008 --- /dev/null +++ b/arch/s390/kernel/stackprotector.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef pr_fmt +#define pr_fmt(fmt) "stackprot: " fmt +#endif + +#include <linux/export.h> +#include <linux/uaccess.h> +#include <linux/printk.h> +#include <asm/abs_lowcore.h> +#include <asm/sections.h> +#include <asm/machine.h> +#include <asm/asm-offsets.h> +#include <asm/arch-stackprotector.h> + +#ifdef __DECOMPRESSOR + +#define DEBUGP boot_debug +#define EMERGP boot_emerg +#define PANIC boot_panic + +#else /* __DECOMPRESSOR */ + +#define DEBUGP pr_debug +#define EMERGP pr_emerg +#define PANIC panic + +#endif /* __DECOMPRESSOR */ + +int __bootdata_preserved(stack_protector_debug); + +unsigned long __stack_chk_guard; +EXPORT_SYMBOL(__stack_chk_guard); + +struct insn_ril { + u8 opc1 : 8; + u8 r1 : 4; + u8 opc2 : 4; + u32 imm; +} __packed; + +/* + * Convert a virtual instruction address to a real instruction address. The + * decompressor needs to patch instructions within the kernel image based on + * their virtual addresses, while dynamic address translation is still + * disabled. Therefore a translation from virtual kernel image addresses to + * the corresponding physical addresses is required. + * + * After dynamic address translation is enabled and when the kernel needs to + * patch instructions such a translation is not required since the addresses + * are identical. 
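
[The patching performed further down by __stack_protector_apply() rewrites RIL-format LARL (opcodes 0xc0/0x0) and LGRL (0xc4/0x8) instructions into LLILF (0xc0/0xf), so that %r1 is loaded with the lowcore canary address rather than the address of the global __stack_chk_guard. A byte-level sketch of that rewrite; it deliberately avoids the struct insn_ril bitfields, whose layout the kernel can rely on for big-endian s390 but a portable demo cannot, and canary_addr is a stand-in for the real __LC_STACK_CANARY value.]

#include <assert.h>
#include <stdint.h>

int main(void)
{
        /* larl %r1,<target>: c0 10 xx xx xx xx (opc1 c0, r1=1, opc2 0) */
        uint8_t insn[6] = { 0xc0, 0x10, 0xde, 0xad, 0xbe, 0xef };
        uint32_t canary_addr = 0x12345678; /* stand-in lowcore offset */

        /* verify we really have a larl, cf. stack_protector_verify() */
        assert(insn[0] == 0xc0 && (insn[1] & 0x0f) == 0x0);

        insn[1] = (insn[1] & 0xf0) | 0x0f;  /* opc2 0x0 -> 0xf: llilf */
        insn[2] = canary_addr >> 24;        /* 32-bit immediate is stored */
        insn[3] = canary_addr >> 16;        /* big-endian on s390 */
        insn[4] = canary_addr >> 8;
        insn[5] = canary_addr;

        assert(insn[1] == 0x1f);            /* r1=1 preserved, now llilf */
        return 0;
}
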
+ */ +static struct insn_ril *vaddress_to_insn(unsigned long vaddress) +{ +#ifdef __DECOMPRESSOR + return (struct insn_ril *)__kernel_pa(vaddress); +#else + return (struct insn_ril *)vaddress; +#endif +} + +static unsigned long insn_to_vaddress(struct insn_ril *insn) +{ +#ifdef __DECOMPRESSOR + return (unsigned long)__kernel_va(insn); +#else + return (unsigned long)insn; +#endif +} + +#define INSN_RIL_STRING_SIZE (sizeof(struct insn_ril) * 2 + 1) + +static void insn_ril_to_string(char *str, struct insn_ril *insn) +{ + u8 *ptr = (u8 *)insn; + int i; + + for (i = 0; i < sizeof(*insn); i++) + hex_byte_pack(&str[2 * i], ptr[i]); + str[2 * i] = 0; +} + +static void stack_protector_dump(struct insn_ril *old, struct insn_ril *new) +{ + char ostr[INSN_RIL_STRING_SIZE]; + char nstr[INSN_RIL_STRING_SIZE]; + + insn_ril_to_string(ostr, old); + insn_ril_to_string(nstr, new); + DEBUGP("%016lx: %s -> %s\n", insn_to_vaddress(old), ostr, nstr); +} + +static int stack_protector_verify(struct insn_ril *insn, unsigned long kernel_start) +{ + char istr[INSN_RIL_STRING_SIZE]; + unsigned long vaddress, offset; + + /* larl */ + if (insn->opc1 == 0xc0 && insn->opc2 == 0x0) + return 0; + /* lgrl */ + if (insn->opc1 == 0xc4 && insn->opc2 == 0x8) + return 0; + insn_ril_to_string(istr, insn); + vaddress = insn_to_vaddress(insn); + if (__is_defined(__DECOMPRESSOR)) { + offset = (unsigned long)insn - kernel_start + TEXT_OFFSET; + EMERGP("Unexpected instruction at %016lx/%016lx: %s\n", vaddress, offset, istr); + PANIC("Stackprotector error\n"); + } else { + EMERGP("Unexpected instruction at %016lx: %s\n", vaddress, istr); + } + return -EINVAL; +} + +int __stack_protector_apply(unsigned long *start, unsigned long *end, unsigned long kernel_start) +{ + unsigned long canary, *loc; + struct insn_ril *insn, new; + int rc; + + /* + * Convert LARL/LGRL instructions to LLILF so register R1 contains the + * address of the per-cpu / per-process stack canary: + * + * LARL/LGRL R1,__stack_chk_guard => LLILF R1,__lc_stack_canary + */ + canary = __LC_STACK_CANARY; + if (machine_has_relocated_lowcore()) + canary += LOWCORE_ALT_ADDRESS; + for (loc = start; loc < end; loc++) { + insn = vaddress_to_insn(*loc); + rc = stack_protector_verify(insn, kernel_start); + if (rc) + return rc; + new = *insn; + new.opc1 = 0xc0; + new.opc2 = 0xf; + new.imm = canary; + if (stack_protector_debug) + stack_protector_dump(insn, &new); + s390_kernel_write(insn, &new, sizeof(*insn)); + } + return 0; +} + +#ifdef __DECOMPRESSOR +void __stack_protector_apply_early(unsigned long kernel_start) +{ + unsigned long *start, *end; + + start = (unsigned long *)vmlinux.stack_prot_start; + end = (unsigned long *)vmlinux.stack_prot_end; + __stack_protector_apply(start, end, kernel_start); +} +#endif diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index b153a395f46d..3aae7f70e6ab 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -8,7 +8,6 @@ #include <linux/perf_event.h> #include <linux/stacktrace.h> #include <linux/uaccess.h> -#include <linux/compat.h> #include <asm/asm-offsets.h> #include <asm/stacktrace.h> #include <asm/unwind.h> @@ -107,8 +106,6 @@ void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *coo unsigned long ip, sp; bool first = true; - if (is_compat_task()) - return; if (!current->mm) return; ip = instruction_pointer(regs); diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index f4ccdbed4b89..5eae2e25997a 100644 --- a/arch/s390/kernel/sthyi.c +++ 
b/arch/s390/kernel/sthyi.c @@ -253,7 +253,7 @@ static void fill_diag_mac(struct sthyi_sctns *sctns, sctns->mac.infmval1 |= MAC_CNT_VLD; } -/* Returns a pointer to the the next partition block. */ +/* Returns a pointer to the next partition block. */ static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, bool this_lpar, void *diag224_buf, diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 4fee74553ca2..795b6cca74c9 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -39,6 +39,16 @@ #include "entry.h" +#define __SYSCALL(nr, sym) long __s390x_##sym(struct pt_regs *); +#include <asm/syscall_table.h> +#undef __SYSCALL + +#define __SYSCALL(nr, sym) [nr] = (__s390x_##sym), +const sys_call_ptr_t sys_call_table[__NR_syscalls] = { +#include <asm/syscall_table.h> +}; +#undef __SYSCALL + #ifdef CONFIG_SYSVIPC /* * sys_ipc() is the de-multiplexer for the SysV IPC calls. @@ -122,7 +132,7 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap) goto out; regs->gprs[2] = -ENOSYS; if (likely(nr < NR_syscalls)) - regs->gprs[2] = current->thread.sys_call_table[nr](regs); + regs->gprs[2] = sys_call_table[nr](regs); out: syscall_exit_to_user_mode(regs); } diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile index c5d958a09ff4..d5fca0ca0890 100644 --- a/arch/s390/kernel/syscalls/Makefile +++ b/arch/s390/kernel/syscalls/Makefile @@ -1,48 +1,32 @@ # SPDX-License-Identifier: GPL-2.0 +kapi := arch/$(SRCARCH)/include/generated/asm +uapi := arch/$(SRCARCH)/include/generated/uapi/asm -gen := arch/$(ARCH)/include/generated -kapi := $(gen)/asm -uapi := $(gen)/uapi/asm - -syscall := $(src)/syscall.tbl -systbl := $(src)/syscalltbl - -gen-y := $(kapi)/syscall_table.h -kapi-hdrs-y := $(kapi)/unistd_nr.h -uapi-hdrs-y := $(uapi)/unistd_32.h -uapi-hdrs-y += $(uapi)/unistd_64.h - -targets += $(addprefix ../../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) - -PHONY += kapi uapi - -kapi: $(gen-y) $(kapi-hdrs-y) -uapi: $(uapi-hdrs-y) - - -# Create output directory if not already present $(shell mkdir -p $(uapi) $(kapi)) -quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$@" < $< > $@ - -quiet_cmd_sysnr = SYSNR $@ - cmd_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< > $@ +syscall := $(src)/syscall.tbl +syshdr := $(srctree)/scripts/syscallhdr.sh +systbl := $(srctree)/scripts/syscalltbl.sh -quiet_cmd_syscalls = SYSTBL $@ - cmd_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< > $@ +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis common,$* $< $@ -syshdr_abi_unistd_32 := common,32 -$(uapi)/unistd_32.h: $(syscall) $(systbl) FORCE - $(call if_changed,syshdr) +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis common,$* $< $@ -syshdr_abi_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall) $(systbl) FORCE +$(uapi)/unistd_%.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) $(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE - $(call if_changed,syscalls) + $(call if_changed,systbl) + +uapisyshdr-y += unistd_64.h +kapisyshdr-y += syscall_table.h + +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) -sysnr_abi_unistd_nr := common,32,64 -$(kapi)/unistd_nr.h: $(syscall) $(systbl) FORCE - $(call if_changed,sysnr) +PHONY += all +all: $(uapisyshdr-y) 
$(kapisyshdr-y) + @: diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 5863787ab036..417ed16b3c63 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -3,473 +3,397 @@ # System call table for s390 # # Format: +# <nr> <abi> <syscall> <entry> # -# <nr> <abi> <syscall> <entry-64bit> <compat-entry> -# -# where <abi> can be common, 64, or 32 +# <abi> is always common. -1 common exit sys_exit sys_exit -2 common fork sys_fork sys_fork -3 common read sys_read compat_sys_s390_read -4 common write sys_write compat_sys_s390_write -5 common open sys_open compat_sys_open -6 common close sys_close sys_close -7 common restart_syscall sys_restart_syscall sys_restart_syscall -8 common creat sys_creat sys_creat -9 common link sys_link sys_link -10 common unlink sys_unlink sys_unlink -11 common execve sys_execve compat_sys_execve -12 common chdir sys_chdir sys_chdir -13 32 time - sys_time32 -14 common mknod sys_mknod sys_mknod -15 common chmod sys_chmod sys_chmod -16 32 lchown - sys_lchown16 -19 common lseek sys_lseek compat_sys_lseek -20 common getpid sys_getpid sys_getpid -21 common mount sys_mount sys_mount -22 common umount sys_oldumount sys_oldumount -23 32 setuid - sys_setuid16 -24 32 getuid - sys_getuid16 -25 32 stime - sys_stime32 -26 common ptrace sys_ptrace compat_sys_ptrace -27 common alarm sys_alarm sys_alarm -29 common pause sys_pause sys_pause -30 common utime sys_utime sys_utime32 -33 common access sys_access sys_access -34 common nice sys_nice sys_nice -36 common sync sys_sync sys_sync -37 common kill sys_kill sys_kill -38 common rename sys_rename sys_rename -39 common mkdir sys_mkdir sys_mkdir -40 common rmdir sys_rmdir sys_rmdir -41 common dup sys_dup sys_dup -42 common pipe sys_pipe sys_pipe -43 common times sys_times compat_sys_times -45 common brk sys_brk sys_brk -46 32 setgid - sys_setgid16 -47 32 getgid - sys_getgid16 -48 common signal sys_signal sys_signal -49 32 geteuid - sys_geteuid16 -50 32 getegid - sys_getegid16 -51 common acct sys_acct sys_acct -52 common umount2 sys_umount sys_umount -54 common ioctl sys_ioctl compat_sys_ioctl -55 common fcntl sys_fcntl compat_sys_fcntl -57 common setpgid sys_setpgid sys_setpgid -60 common umask sys_umask sys_umask -61 common chroot sys_chroot sys_chroot -62 common ustat sys_ustat compat_sys_ustat -63 common dup2 sys_dup2 sys_dup2 -64 common getppid sys_getppid sys_getppid -65 common getpgrp sys_getpgrp sys_getpgrp -66 common setsid sys_setsid sys_setsid -67 common sigaction sys_sigaction compat_sys_sigaction -70 32 setreuid - sys_setreuid16 -71 32 setregid - sys_setregid16 -72 common sigsuspend sys_sigsuspend sys_sigsuspend -73 common sigpending sys_sigpending compat_sys_sigpending -74 common sethostname sys_sethostname sys_sethostname -75 common setrlimit sys_setrlimit compat_sys_setrlimit -76 32 getrlimit - compat_sys_old_getrlimit -77 common getrusage sys_getrusage compat_sys_getrusage -78 common gettimeofday sys_gettimeofday compat_sys_gettimeofday -79 common settimeofday sys_settimeofday compat_sys_settimeofday -80 32 getgroups - sys_getgroups16 -81 32 setgroups - sys_setgroups16 -83 common symlink sys_symlink sys_symlink -85 common readlink sys_readlink sys_readlink -86 common uselib sys_uselib sys_uselib -87 common swapon sys_swapon sys_swapon -88 common reboot sys_reboot sys_reboot -89 common readdir - compat_sys_old_readdir -90 common mmap sys_old_mmap compat_sys_s390_old_mmap -91 common munmap sys_munmap sys_munmap -92 common truncate 
sys_truncate compat_sys_truncate -93 common ftruncate sys_ftruncate compat_sys_ftruncate -94 common fchmod sys_fchmod sys_fchmod -95 32 fchown - sys_fchown16 -96 common getpriority sys_getpriority sys_getpriority -97 common setpriority sys_setpriority sys_setpriority -99 common statfs sys_statfs compat_sys_statfs -100 common fstatfs sys_fstatfs compat_sys_fstatfs -101 32 ioperm - - -102 common socketcall sys_socketcall compat_sys_socketcall -103 common syslog sys_syslog sys_syslog -104 common setitimer sys_setitimer compat_sys_setitimer -105 common getitimer sys_getitimer compat_sys_getitimer -106 common stat sys_newstat compat_sys_newstat -107 common lstat sys_newlstat compat_sys_newlstat -108 common fstat sys_newfstat compat_sys_newfstat -110 common lookup_dcookie - - -111 common vhangup sys_vhangup sys_vhangup -112 common idle - - -114 common wait4 sys_wait4 compat_sys_wait4 -115 common swapoff sys_swapoff sys_swapoff -116 common sysinfo sys_sysinfo compat_sys_sysinfo -117 common ipc sys_s390_ipc compat_sys_s390_ipc -118 common fsync sys_fsync sys_fsync -119 common sigreturn sys_sigreturn compat_sys_sigreturn -120 common clone sys_clone sys_clone -121 common setdomainname sys_setdomainname sys_setdomainname -122 common uname sys_newuname sys_newuname -124 common adjtimex sys_adjtimex sys_adjtimex_time32 -125 common mprotect sys_mprotect sys_mprotect -126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask -127 common create_module - - -128 common init_module sys_init_module sys_init_module -129 common delete_module sys_delete_module sys_delete_module -130 common get_kernel_syms - - -131 common quotactl sys_quotactl sys_quotactl -132 common getpgid sys_getpgid sys_getpgid -133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_ni_syscall sys_ni_syscall -135 common sysfs sys_sysfs sys_sysfs -136 common personality sys_s390_personality sys_s390_personality -137 common afs_syscall - - -138 32 setfsuid - sys_setfsuid16 -139 32 setfsgid - sys_setfsgid16 -140 32 _llseek - sys_llseek -141 common getdents sys_getdents compat_sys_getdents -142 32 _newselect - compat_sys_select -142 64 select sys_select - -143 common flock sys_flock sys_flock -144 common msync sys_msync sys_msync -145 common readv sys_readv sys_readv -146 common writev sys_writev sys_writev -147 common getsid sys_getsid sys_getsid -148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl - - -150 common mlock sys_mlock sys_mlock -151 common munlock sys_munlock sys_munlock -152 common mlockall sys_mlockall sys_mlockall -153 common munlockall sys_munlockall sys_munlockall -154 common sched_setparam sys_sched_setparam sys_sched_setparam -155 common sched_getparam sys_sched_getparam sys_sched_getparam -156 common sched_setscheduler sys_sched_setscheduler sys_sched_setscheduler -157 common sched_getscheduler sys_sched_getscheduler sys_sched_getscheduler -158 common sched_yield sys_sched_yield sys_sched_yield -159 common sched_get_priority_max sys_sched_get_priority_max sys_sched_get_priority_max -160 common sched_get_priority_min sys_sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 -162 common nanosleep sys_nanosleep sys_nanosleep_time32 -163 common mremap sys_mremap sys_mremap -164 32 setresuid - sys_setresuid16 -165 32 getresuid - sys_getresuid16 -167 common query_module - - -168 common poll sys_poll sys_poll -169 common nfsservctl - - -170 32 setresgid - sys_setresgid16 -171 32 getresgid - sys_getresgid16 -172 
common prctl sys_prctl sys_prctl -173 common rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn -174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction -175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask -176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 -178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo -179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend -180 common pread64 sys_pread64 compat_sys_s390_pread64 -181 common pwrite64 sys_pwrite64 compat_sys_s390_pwrite64 -182 32 chown - sys_chown16 -183 common getcwd sys_getcwd sys_getcwd -184 common capget sys_capget sys_capget -185 common capset sys_capset sys_capset -186 common sigaltstack sys_sigaltstack compat_sys_sigaltstack -187 common sendfile sys_sendfile64 compat_sys_sendfile -188 common getpmsg - - -189 common putpmsg - - -190 common vfork sys_vfork sys_vfork -191 32 ugetrlimit - compat_sys_getrlimit -191 64 getrlimit sys_getrlimit - -192 32 mmap2 - compat_sys_s390_mmap2 -193 32 truncate64 - compat_sys_s390_truncate64 -194 32 ftruncate64 - compat_sys_s390_ftruncate64 -195 32 stat64 - compat_sys_s390_stat64 -196 32 lstat64 - compat_sys_s390_lstat64 -197 32 fstat64 - compat_sys_s390_fstat64 -198 32 lchown32 - sys_lchown -198 64 lchown sys_lchown - -199 32 getuid32 - sys_getuid -199 64 getuid sys_getuid - -200 32 getgid32 - sys_getgid -200 64 getgid sys_getgid - -201 32 geteuid32 - sys_geteuid -201 64 geteuid sys_geteuid - -202 32 getegid32 - sys_getegid -202 64 getegid sys_getegid - -203 32 setreuid32 - sys_setreuid -203 64 setreuid sys_setreuid - -204 32 setregid32 - sys_setregid -204 64 setregid sys_setregid - -205 32 getgroups32 - sys_getgroups -205 64 getgroups sys_getgroups - -206 32 setgroups32 - sys_setgroups -206 64 setgroups sys_setgroups - -207 32 fchown32 - sys_fchown -207 64 fchown sys_fchown - -208 32 setresuid32 - sys_setresuid -208 64 setresuid sys_setresuid - -209 32 getresuid32 - sys_getresuid -209 64 getresuid sys_getresuid - -210 32 setresgid32 - sys_setresgid -210 64 setresgid sys_setresgid - -211 32 getresgid32 - sys_getresgid -211 64 getresgid sys_getresgid - -212 32 chown32 - sys_chown -212 64 chown sys_chown - -213 32 setuid32 - sys_setuid -213 64 setuid sys_setuid - -214 32 setgid32 - sys_setgid -214 64 setgid sys_setgid - -215 32 setfsuid32 - sys_setfsuid -215 64 setfsuid sys_setfsuid - -216 32 setfsgid32 - sys_setfsgid -216 64 setfsgid sys_setfsgid - -217 common pivot_root sys_pivot_root sys_pivot_root -218 common mincore sys_mincore sys_mincore -219 common madvise sys_madvise sys_madvise -220 common getdents64 sys_getdents64 sys_getdents64 -221 32 fcntl64 - compat_sys_fcntl64 -222 common readahead sys_readahead compat_sys_s390_readahead -223 32 sendfile64 - compat_sys_sendfile64 -224 common setxattr sys_setxattr sys_setxattr -225 common lsetxattr sys_lsetxattr sys_lsetxattr -226 common fsetxattr sys_fsetxattr sys_fsetxattr -227 common getxattr sys_getxattr sys_getxattr -228 common lgetxattr sys_lgetxattr sys_lgetxattr -229 common fgetxattr sys_fgetxattr sys_fgetxattr -230 common listxattr sys_listxattr sys_listxattr -231 common llistxattr sys_llistxattr sys_llistxattr -232 common flistxattr sys_flistxattr sys_flistxattr -233 common removexattr sys_removexattr sys_removexattr -234 common lremovexattr sys_lremovexattr sys_lremovexattr -235 common fremovexattr sys_fremovexattr sys_fremovexattr -236 common gettid sys_gettid sys_gettid -237 
common tkill sys_tkill sys_tkill -238 common futex sys_futex sys_futex_time32 -239 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity -240 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity -241 common tgkill sys_tgkill sys_tgkill -243 common io_setup sys_io_setup compat_sys_io_setup -244 common io_destroy sys_io_destroy sys_io_destroy -245 common io_getevents sys_io_getevents sys_io_getevents_time32 -246 common io_submit sys_io_submit compat_sys_io_submit -247 common io_cancel sys_io_cancel sys_io_cancel -248 common exit_group sys_exit_group sys_exit_group -249 common epoll_create sys_epoll_create sys_epoll_create -250 common epoll_ctl sys_epoll_ctl sys_epoll_ctl -251 common epoll_wait sys_epoll_wait sys_epoll_wait -252 common set_tid_address sys_set_tid_address sys_set_tid_address -253 common fadvise64 sys_fadvise64_64 compat_sys_s390_fadvise64 -254 common timer_create sys_timer_create compat_sys_timer_create -255 common timer_settime sys_timer_settime sys_timer_settime32 -256 common timer_gettime sys_timer_gettime sys_timer_gettime32 -257 common timer_getoverrun sys_timer_getoverrun sys_timer_getoverrun -258 common timer_delete sys_timer_delete sys_timer_delete -259 common clock_settime sys_clock_settime sys_clock_settime32 -260 common clock_gettime sys_clock_gettime sys_clock_gettime32 -261 common clock_getres sys_clock_getres sys_clock_getres_time32 -262 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 -264 32 fadvise64_64 - compat_sys_s390_fadvise64_64 -265 common statfs64 sys_statfs64 compat_sys_statfs64 -266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind sys_mbind -269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy -271 common mq_open sys_mq_open compat_sys_mq_open -272 common mq_unlink sys_mq_unlink sys_mq_unlink -273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 -274 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 -275 common mq_notify sys_mq_notify compat_sys_mq_notify -276 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr -277 common kexec_load sys_kexec_load compat_sys_kexec_load -278 common add_key sys_add_key sys_add_key -279 common request_key sys_request_key sys_request_key -280 common keyctl sys_keyctl compat_sys_keyctl -281 common waitid sys_waitid compat_sys_waitid -282 common ioprio_set sys_ioprio_set sys_ioprio_set -283 common ioprio_get sys_ioprio_get sys_ioprio_get -284 common inotify_init sys_inotify_init sys_inotify_init -285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch -286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages sys_migrate_pages -288 common openat sys_openat compat_sys_openat -289 common mkdirat sys_mkdirat sys_mkdirat -290 common mknodat sys_mknodat sys_mknodat -291 common fchownat sys_fchownat sys_fchownat -292 common futimesat sys_futimesat sys_futimesat_time32 -293 32 fstatat64 - compat_sys_s390_fstatat64 -293 64 newfstatat sys_newfstatat - -294 common unlinkat sys_unlinkat sys_unlinkat -295 common renameat sys_renameat sys_renameat -296 common linkat sys_linkat sys_linkat -297 common symlinkat sys_symlinkat sys_symlinkat -298 common readlinkat sys_readlinkat sys_readlinkat -299 common fchmodat sys_fchmodat sys_fchmodat -300 common faccessat sys_faccessat sys_faccessat 
-301 common pselect6 sys_pselect6 compat_sys_pselect6_time32 -302 common ppoll sys_ppoll compat_sys_ppoll_time32 -303 common unshare sys_unshare sys_unshare -304 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -305 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -306 common splice sys_splice sys_splice -307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range -308 common tee sys_tee sys_tee -309 common vmsplice sys_vmsplice sys_vmsplice -310 common move_pages sys_move_pages sys_move_pages -311 common getcpu sys_getcpu sys_getcpu -312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -313 common utimes sys_utimes sys_utimes_time32 -314 common fallocate sys_fallocate compat_sys_s390_fallocate -315 common utimensat sys_utimensat sys_utimensat_time32 -316 common signalfd sys_signalfd compat_sys_signalfd -317 common timerfd - - -318 common eventfd sys_eventfd sys_eventfd -319 common timerfd_create sys_timerfd_create sys_timerfd_create -320 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 -321 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 -322 common signalfd4 sys_signalfd4 compat_sys_signalfd4 -323 common eventfd2 sys_eventfd2 sys_eventfd2 -324 common inotify_init1 sys_inotify_init1 sys_inotify_init1 -325 common pipe2 sys_pipe2 sys_pipe2 -326 common dup3 sys_dup3 sys_dup3 -327 common epoll_create1 sys_epoll_create1 sys_epoll_create1 -328 common preadv sys_preadv compat_sys_preadv -329 common pwritev sys_pwritev compat_sys_pwritev -330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -331 common perf_event_open sys_perf_event_open sys_perf_event_open -332 common fanotify_init sys_fanotify_init sys_fanotify_init -333 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark -334 common prlimit64 sys_prlimit64 sys_prlimit64 -335 common name_to_handle_at sys_name_to_handle_at sys_name_to_handle_at -336 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 -338 common syncfs sys_syncfs sys_syncfs -339 common setns sys_setns sys_setns -340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv -341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev -342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr -343 common kcmp sys_kcmp sys_kcmp -344 common finit_module sys_finit_module sys_finit_module -345 common sched_setattr sys_sched_setattr sys_sched_setattr -346 common sched_getattr sys_sched_getattr sys_sched_getattr -347 common renameat2 sys_renameat2 sys_renameat2 -348 common seccomp sys_seccomp sys_seccomp -349 common getrandom sys_getrandom sys_getrandom -350 common memfd_create sys_memfd_create sys_memfd_create -351 common bpf sys_bpf sys_bpf -352 common s390_pci_mmio_write sys_s390_pci_mmio_write sys_s390_pci_mmio_write -353 common s390_pci_mmio_read sys_s390_pci_mmio_read sys_s390_pci_mmio_read -354 common execveat sys_execveat compat_sys_execveat -355 common userfaultfd sys_userfaultfd sys_userfaultfd -356 common membarrier sys_membarrier sys_membarrier -357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 -358 common sendmmsg sys_sendmmsg compat_sys_sendmmsg -359 common socket sys_socket sys_socket -360 common socketpair sys_socketpair sys_socketpair -361 common bind sys_bind sys_bind -362 common connect sys_connect sys_connect -363 common listen sys_listen sys_listen -364 common accept4 sys_accept4 sys_accept4 
-365 common getsockopt sys_getsockopt sys_getsockopt -366 common setsockopt sys_setsockopt sys_setsockopt -367 common getsockname sys_getsockname sys_getsockname -368 common getpeername sys_getpeername sys_getpeername -369 common sendto sys_sendto sys_sendto -370 common sendmsg sys_sendmsg compat_sys_sendmsg -371 common recvfrom sys_recvfrom compat_sys_recvfrom -372 common recvmsg sys_recvmsg compat_sys_recvmsg -373 common shutdown sys_shutdown sys_shutdown -374 common mlock2 sys_mlock2 sys_mlock2 -375 common copy_file_range sys_copy_file_range sys_copy_file_range -376 common preadv2 sys_preadv2 compat_sys_preadv2 -377 common pwritev2 sys_pwritev2 compat_sys_pwritev2 -378 common s390_guarded_storage sys_s390_guarded_storage sys_s390_guarded_storage -379 common statx sys_statx sys_statx -380 common s390_sthyi sys_s390_sthyi sys_s390_sthyi -381 common kexec_file_load sys_kexec_file_load sys_kexec_file_load -382 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents -383 common rseq sys_rseq sys_rseq -384 common pkey_mprotect sys_pkey_mprotect sys_pkey_mprotect -385 common pkey_alloc sys_pkey_alloc sys_pkey_alloc -386 common pkey_free sys_pkey_free sys_pkey_free +1 common exit sys_exit +2 common fork sys_fork +3 common read sys_read +4 common write sys_write +5 common open sys_open +6 common close sys_close +7 common restart_syscall sys_restart_syscall +8 common creat sys_creat +9 common link sys_link +10 common unlink sys_unlink +11 common execve sys_execve +12 common chdir sys_chdir +14 common mknod sys_mknod +15 common chmod sys_chmod +19 common lseek sys_lseek +20 common getpid sys_getpid +21 common mount sys_mount +22 common umount sys_oldumount +26 common ptrace sys_ptrace +27 common alarm sys_alarm +29 common pause sys_pause +30 common utime sys_utime +33 common access sys_access +34 common nice sys_nice +36 common sync sys_sync +37 common kill sys_kill +38 common rename sys_rename +39 common mkdir sys_mkdir +40 common rmdir sys_rmdir +41 common dup sys_dup +42 common pipe sys_pipe +43 common times sys_times +45 common brk sys_brk +48 common signal sys_signal +51 common acct sys_acct +52 common umount2 sys_umount +54 common ioctl sys_ioctl +55 common fcntl sys_fcntl +57 common setpgid sys_setpgid +60 common umask sys_umask +61 common chroot sys_chroot +62 common ustat sys_ustat +63 common dup2 sys_dup2 +64 common getppid sys_getppid +65 common getpgrp sys_getpgrp +66 common setsid sys_setsid +67 common sigaction sys_sigaction +72 common sigsuspend sys_sigsuspend +73 common sigpending sys_sigpending +74 common sethostname sys_sethostname +75 common setrlimit sys_setrlimit +77 common getrusage sys_getrusage +78 common gettimeofday sys_gettimeofday +79 common settimeofday sys_settimeofday +83 common symlink sys_symlink +85 common readlink sys_readlink +86 common uselib sys_uselib +87 common swapon sys_swapon +88 common reboot sys_reboot +89 common readdir sys_ni_syscall +90 common mmap sys_old_mmap +91 common munmap sys_munmap +92 common truncate sys_truncate +93 common ftruncate sys_ftruncate +94 common fchmod sys_fchmod +96 common getpriority sys_getpriority +97 common setpriority sys_setpriority +99 common statfs sys_statfs +100 common fstatfs sys_fstatfs +102 common socketcall sys_socketcall +103 common syslog sys_syslog +104 common setitimer sys_setitimer +105 common getitimer sys_getitimer +106 common stat sys_newstat +107 common lstat sys_newlstat +108 common fstat sys_newfstat +110 common lookup_dcookie sys_ni_syscall +111 common vhangup sys_vhangup +112 common idle 
sys_ni_syscall +114 common wait4 sys_wait4 +115 common swapoff sys_swapoff +116 common sysinfo sys_sysinfo +117 common ipc sys_s390_ipc +118 common fsync sys_fsync +119 common sigreturn sys_sigreturn +120 common clone sys_clone +121 common setdomainname sys_setdomainname +122 common uname sys_newuname +124 common adjtimex sys_adjtimex +125 common mprotect sys_mprotect +126 common sigprocmask sys_sigprocmask +127 common create_module sys_ni_syscall +128 common init_module sys_init_module +129 common delete_module sys_delete_module +130 common get_kernel_syms sys_ni_syscall +131 common quotactl sys_quotactl +132 common getpgid sys_getpgid +133 common fchdir sys_fchdir +134 common bdflush sys_ni_syscall +135 common sysfs sys_sysfs +136 common personality sys_s390_personality +137 common afs_syscall sys_ni_syscall +141 common getdents sys_getdents +142 common select sys_select +143 common flock sys_flock +144 common msync sys_msync +145 common readv sys_readv +146 common writev sys_writev +147 common getsid sys_getsid +148 common fdatasync sys_fdatasync +149 common _sysctl sys_ni_syscall +150 common mlock sys_mlock +151 common munlock sys_munlock +152 common mlockall sys_mlockall +153 common munlockall sys_munlockall +154 common sched_setparam sys_sched_setparam +155 common sched_getparam sys_sched_getparam +156 common sched_setscheduler sys_sched_setscheduler +157 common sched_getscheduler sys_sched_getscheduler +158 common sched_yield sys_sched_yield +159 common sched_get_priority_max sys_sched_get_priority_max +160 common sched_get_priority_min sys_sched_get_priority_min +161 common sched_rr_get_interval sys_sched_rr_get_interval +162 common nanosleep sys_nanosleep +163 common mremap sys_mremap +167 common query_module sys_ni_syscall +168 common poll sys_poll +169 common nfsservctl sys_ni_syscall +172 common prctl sys_prctl +173 common rt_sigreturn sys_rt_sigreturn +174 common rt_sigaction sys_rt_sigaction +175 common rt_sigprocmask sys_rt_sigprocmask +176 common rt_sigpending sys_rt_sigpending +177 common rt_sigtimedwait sys_rt_sigtimedwait +178 common rt_sigqueueinfo sys_rt_sigqueueinfo +179 common rt_sigsuspend sys_rt_sigsuspend +180 common pread64 sys_pread64 +181 common pwrite64 sys_pwrite64 +183 common getcwd sys_getcwd +184 common capget sys_capget +185 common capset sys_capset +186 common sigaltstack sys_sigaltstack +187 common sendfile sys_sendfile64 +188 common getpmsg sys_ni_syscall +189 common putpmsg sys_ni_syscall +190 common vfork sys_vfork +191 common getrlimit sys_getrlimit +198 common lchown sys_lchown +199 common getuid sys_getuid +200 common getgid sys_getgid +201 common geteuid sys_geteuid +202 common getegid sys_getegid +203 common setreuid sys_setreuid +204 common setregid sys_setregid +205 common getgroups sys_getgroups +206 common setgroups sys_setgroups +207 common fchown sys_fchown +208 common setresuid sys_setresuid +209 common getresuid sys_getresuid +210 common setresgid sys_setresgid +211 common getresgid sys_getresgid +212 common chown sys_chown +213 common setuid sys_setuid +214 common setgid sys_setgid +215 common setfsuid sys_setfsuid +216 common setfsgid sys_setfsgid +217 common pivot_root sys_pivot_root +218 common mincore sys_mincore +219 common madvise sys_madvise +220 common getdents64 sys_getdents64 +222 common readahead sys_readahead +224 common setxattr sys_setxattr +225 common lsetxattr sys_lsetxattr +226 common fsetxattr sys_fsetxattr +227 common getxattr sys_getxattr +228 common lgetxattr sys_lgetxattr +229 common fgetxattr sys_fgetxattr +230 
common listxattr sys_listxattr +231 common llistxattr sys_llistxattr +232 common flistxattr sys_flistxattr +233 common removexattr sys_removexattr +234 common lremovexattr sys_lremovexattr +235 common fremovexattr sys_fremovexattr +236 common gettid sys_gettid +237 common tkill sys_tkill +238 common futex sys_futex +239 common sched_setaffinity sys_sched_setaffinity +240 common sched_getaffinity sys_sched_getaffinity +241 common tgkill sys_tgkill +243 common io_setup sys_io_setup +244 common io_destroy sys_io_destroy +245 common io_getevents sys_io_getevents +246 common io_submit sys_io_submit +247 common io_cancel sys_io_cancel +248 common exit_group sys_exit_group +249 common epoll_create sys_epoll_create +250 common epoll_ctl sys_epoll_ctl +251 common epoll_wait sys_epoll_wait +252 common set_tid_address sys_set_tid_address +253 common fadvise64 sys_fadvise64_64 +254 common timer_create sys_timer_create +255 common timer_settime sys_timer_settime +256 common timer_gettime sys_timer_gettime +257 common timer_getoverrun sys_timer_getoverrun +258 common timer_delete sys_timer_delete +259 common clock_settime sys_clock_settime +260 common clock_gettime sys_clock_gettime +261 common clock_getres sys_clock_getres +262 common clock_nanosleep sys_clock_nanosleep +265 common statfs64 sys_statfs64 +266 common fstatfs64 sys_fstatfs64 +267 common remap_file_pages sys_remap_file_pages +268 common mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy +271 common mq_open sys_mq_open +272 common mq_unlink sys_mq_unlink +273 common mq_timedsend sys_mq_timedsend +274 common mq_timedreceive sys_mq_timedreceive +275 common mq_notify sys_mq_notify +276 common mq_getsetattr sys_mq_getsetattr +277 common kexec_load sys_kexec_load +278 common add_key sys_add_key +279 common request_key sys_request_key +280 common keyctl sys_keyctl +281 common waitid sys_waitid +282 common ioprio_set sys_ioprio_set +283 common ioprio_get sys_ioprio_get +284 common inotify_init sys_inotify_init +285 common inotify_add_watch sys_inotify_add_watch +286 common inotify_rm_watch sys_inotify_rm_watch +287 common migrate_pages sys_migrate_pages +288 common openat sys_openat +289 common mkdirat sys_mkdirat +290 common mknodat sys_mknodat +291 common fchownat sys_fchownat +292 common futimesat sys_futimesat +293 common newfstatat sys_newfstatat +294 common unlinkat sys_unlinkat +295 common renameat sys_renameat +296 common linkat sys_linkat +297 common symlinkat sys_symlinkat +298 common readlinkat sys_readlinkat +299 common fchmodat sys_fchmodat +300 common faccessat sys_faccessat +301 common pselect6 sys_pselect6 +302 common ppoll sys_ppoll +303 common unshare sys_unshare +304 common set_robust_list sys_set_robust_list +305 common get_robust_list sys_get_robust_list +306 common splice sys_splice +307 common sync_file_range sys_sync_file_range +308 common tee sys_tee +309 common vmsplice sys_vmsplice +310 common move_pages sys_move_pages +311 common getcpu sys_getcpu +312 common epoll_pwait sys_epoll_pwait +313 common utimes sys_utimes +314 common fallocate sys_fallocate +315 common utimensat sys_utimensat +316 common signalfd sys_signalfd +317 common timerfd sys_ni_syscall +318 common eventfd sys_eventfd +319 common timerfd_create sys_timerfd_create +320 common timerfd_settime sys_timerfd_settime +321 common timerfd_gettime sys_timerfd_gettime +322 common signalfd4 sys_signalfd4 +323 common eventfd2 sys_eventfd2 +324 common inotify_init1 sys_inotify_init1 +325 common pipe2 sys_pipe2 
+326 common dup3 sys_dup3 +327 common epoll_create1 sys_epoll_create1 +328 common preadv sys_preadv +329 common pwritev sys_pwritev +330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +331 common perf_event_open sys_perf_event_open +332 common fanotify_init sys_fanotify_init +333 common fanotify_mark sys_fanotify_mark +334 common prlimit64 sys_prlimit64 +335 common name_to_handle_at sys_name_to_handle_at +336 common open_by_handle_at sys_open_by_handle_at +337 common clock_adjtime sys_clock_adjtime +338 common syncfs sys_syncfs +339 common setns sys_setns +340 common process_vm_readv sys_process_vm_readv +341 common process_vm_writev sys_process_vm_writev +342 common s390_runtime_instr sys_s390_runtime_instr +343 common kcmp sys_kcmp +344 common finit_module sys_finit_module +345 common sched_setattr sys_sched_setattr +346 common sched_getattr sys_sched_getattr +347 common renameat2 sys_renameat2 +348 common seccomp sys_seccomp +349 common getrandom sys_getrandom +350 common memfd_create sys_memfd_create +351 common bpf sys_bpf +352 common s390_pci_mmio_write sys_s390_pci_mmio_write +353 common s390_pci_mmio_read sys_s390_pci_mmio_read +354 common execveat sys_execveat +355 common userfaultfd sys_userfaultfd +356 common membarrier sys_membarrier +357 common recvmmsg sys_recvmmsg +358 common sendmmsg sys_sendmmsg +359 common socket sys_socket +360 common socketpair sys_socketpair +361 common bind sys_bind +362 common connect sys_connect +363 common listen sys_listen +364 common accept4 sys_accept4 +365 common getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt +367 common getsockname sys_getsockname +368 common getpeername sys_getpeername +369 common sendto sys_sendto +370 common sendmsg sys_sendmsg +371 common recvfrom sys_recvfrom +372 common recvmsg sys_recvmsg +373 common shutdown sys_shutdown +374 common mlock2 sys_mlock2 +375 common copy_file_range sys_copy_file_range +376 common preadv2 sys_preadv2 +377 common pwritev2 sys_pwritev2 +378 common s390_guarded_storage sys_s390_guarded_storage +379 common statx sys_statx +380 common s390_sthyi sys_s390_sthyi +381 common kexec_file_load sys_kexec_file_load +382 common io_pgetevents sys_io_pgetevents +383 common rseq sys_rseq +384 common pkey_mprotect sys_pkey_mprotect +385 common pkey_alloc sys_pkey_alloc +386 common pkey_free sys_pkey_free # room for arch specific syscalls -392 64 semtimedop sys_semtimedop - -393 common semget sys_semget sys_semget -394 common semctl sys_semctl compat_sys_semctl -395 common shmget sys_shmget sys_shmget -396 common shmctl sys_shmctl compat_sys_shmctl -397 common shmat sys_shmat compat_sys_shmat -398 common shmdt sys_shmdt sys_shmdt -399 common msgget sys_msgget sys_msgget -400 common msgsnd sys_msgsnd compat_sys_msgsnd -401 common msgrcv sys_msgrcv compat_sys_msgrcv -402 common msgctl sys_msgctl compat_sys_msgctl -403 32 clock_gettime64 - sys_clock_gettime -404 32 clock_settime64 - sys_clock_settime -405 32 clock_adjtime64 - sys_clock_adjtime -406 32 clock_getres_time64 - sys_clock_getres -407 32 clock_nanosleep_time64 - sys_clock_nanosleep -408 32 timer_gettime64 - sys_timer_gettime -409 32 timer_settime64 - sys_timer_settime -410 32 timerfd_gettime64 - sys_timerfd_gettime -411 32 timerfd_settime64 - sys_timerfd_settime -412 32 utimensat_time64 - sys_utimensat -413 32 pselect6_time64 - compat_sys_pselect6_time64 -414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64 -417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 -418 32 
mq_timedsend_time64 - sys_mq_timedsend -419 32 mq_timedreceive_time64 - sys_mq_timedreceive -420 32 semtimedop_time64 - sys_semtimedop -421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 -422 32 futex_time64 - sys_futex -423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval -424 common pidfd_send_signal sys_pidfd_send_signal sys_pidfd_send_signal -425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup -426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter -427 common io_uring_register sys_io_uring_register sys_io_uring_register -428 common open_tree sys_open_tree sys_open_tree -429 common move_mount sys_move_mount sys_move_mount -430 common fsopen sys_fsopen sys_fsopen -431 common fsconfig sys_fsconfig sys_fsconfig -432 common fsmount sys_fsmount sys_fsmount -433 common fspick sys_fspick sys_fspick -434 common pidfd_open sys_pidfd_open sys_pidfd_open -435 common clone3 sys_clone3 sys_clone3 -436 common close_range sys_close_range sys_close_range -437 common openat2 sys_openat2 sys_openat2 -438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd -439 common faccessat2 sys_faccessat2 sys_faccessat2 -440 common process_madvise sys_process_madvise sys_process_madvise -441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 -442 common mount_setattr sys_mount_setattr sys_mount_setattr -443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd -444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset -445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule -446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self -447 common memfd_secret sys_memfd_secret sys_memfd_secret -448 common process_mrelease sys_process_mrelease sys_process_mrelease -449 common futex_waitv sys_futex_waitv sys_futex_waitv -450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node -451 common cachestat sys_cachestat sys_cachestat -452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 -453 common map_shadow_stack sys_map_shadow_stack sys_map_shadow_stack -454 common futex_wake sys_futex_wake sys_futex_wake -455 common futex_wait sys_futex_wait sys_futex_wait -456 common futex_requeue sys_futex_requeue sys_futex_requeue -457 common statmount sys_statmount sys_statmount -458 common listmount sys_listmount sys_listmount -459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr -460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr -461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules -462 common mseal sys_mseal sys_mseal -463 common setxattrat sys_setxattrat sys_setxattrat -464 common getxattrat sys_getxattrat sys_getxattrat -465 common listxattrat sys_listxattrat sys_listxattrat -466 common removexattrat sys_removexattrat sys_removexattrat -467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr -468 common file_getattr sys_file_getattr sys_file_getattr -469 common file_setattr sys_file_setattr sys_file_setattr -470 common listns sys_listns sys_listns +392 common semtimedop sys_semtimedop +393 common semget sys_semget +394 common semctl sys_semctl +395 common shmget sys_shmget +396 common shmctl sys_shmctl +397 common shmat sys_shmat +398 common shmdt sys_shmdt +399 common msgget sys_msgget +400 common msgsnd sys_msgsnd +401 common msgrcv sys_msgrcv +402 common msgctl sys_msgctl +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter 
sys_io_uring_enter +427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 +436 common close_range sys_close_range +437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr +443 common quotactl_fd sys_quotactl_fd +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self +447 common memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 +453 common map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/s390/kernel/syscalls/syscalltbl b/arch/s390/kernel/syscalls/syscalltbl deleted file mode 100755 index fbac1732f874..000000000000 --- a/arch/s390/kernel/syscalls/syscalltbl +++ /dev/null @@ -1,232 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Generate system call table and header files -# -# Copyright IBM Corp. 2018 -# Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - -# -# File path to the system call table definition. -# You can set the path with the -i option. If omitted, -# system call table definitions are read from standard input. -# -SYSCALL_TBL="" - - -create_syscall_table_entries() -{ - local nr abi name entry64 entry32 _ignore - local temp=$(mktemp ${TMPDIR:-/tmp}/syscalltbl-common.XXXXXXXXX) - - ( - # - # Initialize with 0 to create an NI_SYSCALL for 0 - # - local prev_nr=0 prev_32=sys_ni_syscall prev_64=sys_ni_syscall - while read nr abi name entry64 entry32 _ignore; do - test x$entry32 = x- && entry32=sys_ni_syscall - test x$entry64 = x- && entry64=sys_ni_syscall - - if test $prev_nr -eq $nr; then - # - # Same syscall but different ABI, just update - # the respective entry point - # - case $abi in - 32) - prev_32=$entry32 - ;; - 64) - prev_64=$entry64 - ;; - esac - continue; - else - printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 - fi - - prev_nr=$nr - prev_64=$entry64 - prev_32=$entry32 - done - printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 - ) >> $temp - - # - # Check for duplicate syscall numbers - # - if ! 
cat $temp |cut -f1 |uniq -d 2>&1; then - echo "Error: generated system call table contains duplicate entries: $temp" >&2 - exit 1 - fi - - # - # Generate syscall table - # - prev_nr=0 - while read nr entry64 entry32; do - while test $prev_nr -lt $((nr - 1)); do - printf "NI_SYSCALL\n" - prev_nr=$((prev_nr + 1)) - done - if test x$entry64 = xsys_ni_syscall && - test x$entry32 = xsys_ni_syscall; then - printf "NI_SYSCALL\n" - else - printf "SYSCALL(%s,%s)\n" $entry64 $entry32 - fi - prev_nr=$nr - done < $temp - rm $temp -} - -generate_syscall_table() -{ - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 */ - /* - * Definitions for sys_call_table, each line represents an - * entry in the table in the form - * SYSCALL(64 bit syscall, 31 bit emulated syscall) - * - * This file is meant to be included from entry.S. - */ - - #define NI_SYSCALL SYSCALL(sys_ni_syscall,sys_ni_syscall) - -EoHEADER - grep -Ev '^(#|[[:blank:]]*$)' $SYSCALL_TBL \ - |sort -k1 -n \ - |create_syscall_table_entries -} - -create_header_defines() -{ - local nr abi name _ignore - - while read nr abi name _ignore; do - printf "#define __NR_%s %d\n" $name $nr - done -} - -normalize_fileguard() -{ - local fileguard="$1" - - echo "$1" |tr '[[:lower:]]' '[[:upper:]]' \ - |sed -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g' -} - -generate_syscall_header() -{ - local abis=$(echo "($1)" | tr ',' '|') - local filename="$2" - local fileguard suffix - - if test "$filename"; then - fileguard=$(normalize_fileguard "__UAPI_ASM_S390_$2") - else - case "$abis" in - *64*) suffix=64 ;; - *32*) suffix=32 ;; - esac - fileguard=$(normalize_fileguard "__UAPI_ASM_S390_SYSCALLS_$suffix") - fi - - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ - #ifndef ${fileguard} - #define ${fileguard} - -EoHEADER - - grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ - |sort -k1 -n \ - |create_header_defines - - cat <<-EoFOOTER - - #endif /* ${fileguard} */ -EoFOOTER -} - -__max_syscall_nr() -{ - local abis=$(echo "($1)" | tr ',' '|') - - grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ - |sed -ne 's/^\([[:digit:]]*\)[[:space:]].*/\1/p' \ - |sort -n \ - |tail -1 -} - - -generate_syscall_nr() -{ - local abis="$1" - local max_syscall_nr num_syscalls - - max_syscall_nr=$(__max_syscall_nr "$abis") - num_syscalls=$((max_syscall_nr + 1)) - - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ - #ifndef __ASM_S390_SYSCALLS_NR - #define __ASM_S390_SYSCALLS_NR - - #define NR_syscalls ${num_syscalls} - - #endif /* __ASM_S390_SYSCALLS_NR */ -EoHEADER -} - - -# -# Parse command line arguments -# -do_syscall_header="" -do_syscall_table="" -do_syscall_nr="" -output_file="" -abi_list="common,64" -filename="" -while getopts ":HNSXi:a:f:" arg; do - case $arg in - a) - abi_list="$OPTARG" - ;; - i) - SYSCALL_TBL="$OPTARG" - ;; - f) - filename=${OPTARG##*/} - ;; - H) - do_syscall_header=1 - ;; - N) - do_syscall_nr=1 - ;; - S) - do_syscall_table=1 - ;; - X) - set -x - ;; - :) - echo "Missing argument for -$OPTARG" >&2 - exit 1 - ;; - \?) 
- echo "Invalid option specified" >&2 - exit 1 - ;; - esac -done - -test "$do_syscall_header" && generate_syscall_header "$abi_list" "$filename" -test "$do_syscall_table" && generate_syscall_table -test "$do_syscall_nr" && generate_syscall_nr "$abi_list" - -exit 0 diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index 1ea84e942bd4..33ca3e47a0e6 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -526,7 +526,7 @@ static __init int stsi_init_debugfs(void) if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && cpu_has_topology()) { char link_to[10]; - sprintf(link_to, "15_1_%d", topology_mnest_limit()); + snprintf(link_to, sizeof(link_to), "15_1_%d", topology_mnest_limit()); debugfs_create_symlink("topology", stsi_root, link_to); } return 0; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 63517b85f4c9..bd0df61d1907 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -12,8 +12,7 @@ * Copyright (C) 1991, 1992, 1995 Linus Torvalds */ -#define KMSG_COMPONENT "time" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "time: " fmt #include <linux/kernel_stat.h> #include <linux/errno.h> diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 1594c80e9bc4..1913a5566ac2 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -3,8 +3,7 @@ * Copyright IBM Corp. 2007, 2011 */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt #include <linux/cpufeature.h> #include <linux/workqueue.h> diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 5b0633ea8d93..c624f3361e43 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -8,7 +8,6 @@ #include <linux/uaccess.h> #include <linux/uprobes.h> -#include <linux/compat.h> #include <linux/kdebug.h> #include <linux/sched/task_stack.h> @@ -29,7 +28,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) return -EINVAL; - if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) + if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) return -EINVAL; clear_thread_flag(TIF_PER_TRAP); auprobe->saved_per = psw_bits(regs->psw).per; @@ -161,11 +160,6 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, /* Instruction Emulation */ -static void adjust_psw_addr(psw_t *psw, unsigned long len) -{ - psw->addr = __rewind_psw(*psw, -len); -} - #define EMU_ILLEGAL_OP 1 #define EMU_SPECIFICATION 2 #define EMU_ADDRESSING 3 @@ -353,7 +347,7 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) } break; } - adjust_psw_addr(®s->psw, ilen); + regs->psw.addr = __forward_psw(regs->psw, ilen); switch (rc) { case EMU_ILLEGAL_OP: regs->int_code = ilen << 16 | 0x0001; @@ -373,8 +367,7 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { if ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) || - ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) && - !is_compat_task())) { + (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT)) { regs->psw.addr = __rewind_psw(regs->psw, UPROBE_SWBP_INSN_SIZE); do_report_trap(regs, SIGILL, ILL_ILLADR, NULL); return true; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 5d17609bcfe1..ed46950be86f 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -4,8 +4,7 @@ 
* * Copyright IBM Corp. 2019, 2024 */ -#define KMSG_COMPONENT "prot_virt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "prot_virt: " fmt #include <linux/export.h> #include <linux/kernel.h> diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 430feb1a5013..a27a90a199be 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -7,7 +7,6 @@ */ #include <linux/binfmts.h> -#include <linux/compat.h> #include <linux/elf.h> #include <linux/errno.h> #include <linux/init.h> @@ -23,8 +22,7 @@ #include <asm/alternative.h> #include <asm/vdso.h> -extern char vdso64_start[], vdso64_end[]; -extern char vdso32_start[], vdso32_end[]; +extern char vdso_start[], vdso_end[]; static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) @@ -33,12 +31,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static struct vm_special_mapping vdso64_mapping = { - .name = "[vdso]", - .mremap = vdso_mremap, -}; - -static struct vm_special_mapping vdso32_mapping = { +static struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .mremap = vdso_mremap, }; @@ -53,7 +46,6 @@ early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) { unsigned long vvar_start, vdso_text_start, vdso_text_len; - struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc; @@ -62,13 +54,7 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) if (mmap_write_lock_killable(mm)) return -EINTR; - if (is_compat_task()) { - vdso_text_len = vdso32_end - vdso32_start; - vdso_mapping = &vdso32_mapping; - } else { - vdso_text_len = vdso64_end - vdso64_start; - vdso_mapping = &vdso64_mapping; - } + vdso_text_len = vdso_end - vdso_start; vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0); rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) @@ -82,7 +68,7 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC|VM_SEALED_SYSMAP| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso_mapping); + &vdso_mapping); if (IS_ERR(vma)) { do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); @@ -122,13 +108,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned long len) unsigned long vdso_text_size(void) { - unsigned long size; - - if (is_compat_task()) - size = vdso32_end - vdso32_start; - else - size = vdso64_end - vdso64_start; - return PAGE_ALIGN(size); + return PAGE_ALIGN(vdso_end - vdso_start); } unsigned long vdso_size(void) @@ -166,7 +146,7 @@ static void vdso_apply_alternatives(void) struct alt_instr *start, *end; const struct elf64_hdr *hdr; - hdr = (struct elf64_hdr *)vdso64_start; + hdr = (struct elf64_hdr *)vdso_start; shdr = (void *)hdr + hdr->e_shoff; alt = find_section(hdr, shdr, ".altinstructions"); if (!alt) @@ -179,9 +159,7 @@ static void vdso_apply_alternatives(void) static int __init vdso_init(void) { vdso_apply_alternatives(); - vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); - if (IS_ENABLED(CONFIG_COMPAT)) - vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); + vdso_mapping.pages = vdso_setup_pages(vdso_start, vdso_end); return 0; } arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso/.gitignore index 5167384843b9..652e31d82582 100644 --- a/arch/s390/kernel/vdso32/.gitignore +++ 
b/arch/s390/kernel/vdso/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -vdso32.lds +vdso.lds diff --git a/arch/s390/kernel/vdso/Makefile b/arch/s390/kernel/vdso/Makefile new file mode 100644 index 000000000000..2fa12d4ac106 --- /dev/null +++ b/arch/s390/kernel/vdso/Makefile @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile.include +obj-vdso = vdso_user_wrapper.o note.o vgetrandom-chacha.o +obj-cvdso = vdso_generic.o getcpu.o vgetrandom.o +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) +CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vgetrandom.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vdso_generic.o = $(VDSO_CFLAGS_REMOVE) + +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif + +# Build rules + +targets := $(obj-vdso) $(obj-cvdso) vdso.so vdso.so.dbg +obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) +obj-cvdso := $(addprefix $(obj)/, $(obj-cvdso)) + +KBUILD_AFLAGS_VDSO := $(KBUILD_AFLAGS) -DBUILD_VDSO + +KBUILD_CFLAGS_VDSO := $(KBUILD_CFLAGS) -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_VDSO := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -munaligned-symbols,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO += -fPIC -fno-common -fno-builtin -fasynchronous-unwind-tables +KBUILD_CFLAGS_VDSO += -fno-stack-protector +ldflags-y := -shared -soname=linux-vdso.so.1 \ + --hash-style=both --build-id=sha1 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_VDSO) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_VDSO) + +obj-y += vdso_wrapper.o +targets += vdso.lds +CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +# Force dependency (incbin is bad) +$(obj)/vdso_wrapper.o : $(obj)/vdso.so + +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + +# link rule for the .so file, .lds has to be first +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) $(obj-cvdso) FORCE + $(call if_changed,vdso_and_check) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# assembly rules for the .S files +$(obj-vdso): %.o: %.S FORCE + $(call if_changed_dep,vdsoas) + +$(obj-cvdso): %.o: %.c FORCE + $(call if_changed_dep,vdsocc) + +# actual build commands +quiet_cmd_vdsoas = VDSOA $@ + cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdsocc = VDSOC $@ + cmd_vdsocc = $(CC) $(c_flags) -c -o $@ $< + +# Generate VDSO offsets using helper script +gen-vdsosym := $(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso/gen_vdso_offsets.sh index 37f05cb38dad..359982fb002d 100755 --- a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh +++ b/arch/s390/kernel/vdso/gen_vdso_offsets.sh @@ -12,4 +12,4 @@ # LC_ALL=C -sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' +sed -n 's/\([0-9a-f]*\) . 
__kernel_\(.*\)/\#define vdso_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso/getcpu.c index 5c5d4a848b76..5c5d4a848b76 100644 --- a/arch/s390/kernel/vdso64/getcpu.c +++ b/arch/s390/kernel/vdso/getcpu.c diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso/note.S index db19d0680a0a..db19d0680a0a 100644 --- a/arch/s390/kernel/vdso32/note.S +++ b/arch/s390/kernel/vdso/note.S diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso/vdso.h index 9e5397e7b590..8cff033dd854 100644 --- a/arch/s390/kernel/vdso64/vdso.h +++ b/arch/s390/kernel/vdso/vdso.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H -#define __ARCH_S390_KERNEL_VDSO64_VDSO_H +#ifndef __ARCH_S390_KERNEL_VDSO_VDSO_H +#define __ARCH_S390_KERNEL_VDSO_VDSO_H #include <vdso/datapage.h> @@ -12,4 +12,4 @@ int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); -#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ +#endif /* __ARCH_S390_KERNEL_VDSO_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso/vdso.lds.S index e4f6551ae898..7bec4de0e8e0 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso/vdso.lds.S @@ -7,6 +7,7 @@ #include <asm/vdso/vsyscall.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm-generic/vmlinux.lds.h> #include <vdso/datapage.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") @@ -59,47 +60,9 @@ SECTIONS _end = .; PROVIDE(end = .); - /* - * Stabs debugging sections are here too. - */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } + STABS_DEBUG + DWARF_DEBUG .comment 0 : { *(.comment) } - - /* - * DWARF debug sections. - * Symbols in the DWARF debugging sections are relative to the - * beginning of the section so we begin them at 0. 
- */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - /* DWARF 3 */ - .debug_pubtypes 0 : { *(.debug_pubtypes) } - .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } /DISCARD/ : { diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso/vdso_generic.c index a9aa75643c08..a9aa75643c08 100644 --- a/arch/s390/kernel/vdso64/vdso64_generic.c +++ b/arch/s390/kernel/vdso/vdso_generic.c diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso/vdso_user_wrapper.S index aa06c85bcbd3..aa06c85bcbd3 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso/vdso_user_wrapper.S diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso/vdso_wrapper.S index de2fb930471a..f69e62a14978 100644 --- a/arch/s390/kernel/vdso32/vdso32_wrapper.S +++ b/arch/s390/kernel/vdso/vdso_wrapper.S @@ -5,11 +5,11 @@ __PAGE_ALIGNED_DATA - .globl vdso32_start, vdso32_end + .globl vdso_start, vdso_end .balign PAGE_SIZE -vdso32_start: - .incbin "arch/s390/kernel/vdso32/vdso32.so" +vdso_start: + .incbin "arch/s390/kernel/vdso/vdso.so" .balign PAGE_SIZE -vdso32_end: +vdso_end: .previous diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso/vgetrandom-chacha.S index 09c034c2f853..09c034c2f853 100644 --- a/arch/s390/kernel/vdso64/vgetrandom-chacha.S +++ b/arch/s390/kernel/vdso/vgetrandom-chacha.S diff --git a/arch/s390/kernel/vdso64/vgetrandom.c b/arch/s390/kernel/vdso/vgetrandom.c index b5268b507fb5..b5268b507fb5 100644 --- a/arch/s390/kernel/vdso64/vgetrandom.c +++ b/arch/s390/kernel/vdso/vgetrandom.c diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile deleted file mode 100644 index 1e4ddd1a683f..000000000000 --- a/arch/s390/kernel/vdso32/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso - -# Include the generic Makefile to check the built vdso. 
-include $(srctree)/lib/vdso/Makefile.include -obj-vdso32 = vdso_user_wrapper-32.o note-32.o - -# Build rules - -targets := $(obj-vdso32) vdso32.so vdso32.so.dbg -obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) - -KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING - -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_32 += -m31 -s - -KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_32 := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin -fasynchronous-unwind-tables - -LDFLAGS_vdso32.so.dbg += -shared -soname=linux-vdso32.so.1 \ - --hash-style=both --build-id=sha1 -melf_s390 -T - -$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) - -obj-y += vdso32_wrapper.o -targets += vdso32.lds -CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) - -# Force dependency (incbin is bad) -$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so - -quiet_cmd_vdso_and_check = VDSO $@ - cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) - -$(obj)/vdso32.so.dbg: $(obj)/vdso32.lds $(obj-vdso32) FORCE - $(call if_changed,vdso_and_check) - -# strip rule for the .so file -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -$(obj-vdso32): %-32.o: %.S FORCE - $(call if_changed_dep,vdso32as) - -# actual build commands -quiet_cmd_vdso32as = VDSO32A $@ - cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< -quiet_cmd_vdso32cc = VDSO32C $@ - cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< - -# Generate VDSO offsets using helper script -gen-vdsosym := $(src)/gen_vdso_offsets.sh -quiet_cmd_vdsosym = VDSOSYM $@ - cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ - -include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE - $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh deleted file mode 100755 index 9c4f951e227d..000000000000 --- a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# -# Match symbols in the DSO that look like VDSO_*; produce a header file -# of constant offsets into the shared object. -# -# Doing this inside the Makefile will break the $(filter-out) function, -# causing Kbuild to rebuild the vdso-offsets header file every time. -# -# Inspired by arm64 version. -# - -LC_ALL=C -sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S deleted file mode 100644 index 9630d58c2080..000000000000 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This is the infamous ld script for the 64 bits vdso - * library - */ - -#include <asm/page.h> -#include <asm/vdso.h> -#include <vdso/datapage.h> - -OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") -OUTPUT_ARCH(s390:31-bit) - -SECTIONS -{ - VDSO_VVAR_SYMS - - . 
= SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - .note : { *(.note.*) } :text :note - - . = ALIGN(16); - .text : { - *(.text .stub .text.* .gnu.linkonce.t.*) - } :text - PROVIDE(__etext = .); - PROVIDE(_etext = .); - PROVIDE(etext = .); - - /* - * Other stuff is appended to the text segment: - */ - .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } - .rodata1 : { *(.rodata1) } - - .dynamic : { *(.dynamic) } :text :dynamic - - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } - - .rela.dyn ALIGN(8) : { *(.rela.dyn) } - .got ALIGN(8) : { *(.got .toc) } - .got.plt ALIGN(8) : { *(.got.plt) } - - _end = .; - PROVIDE(end = .); - - /* - * Stabs debugging sections are here too. - */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } - - /* - * DWARF debug sections. - * Symbols in the DWARF debugging sections are relative to the - * beginning of the section so we begin them at 0. - */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - /* DWARF 3 */ - .debug_pubtypes 0 : { *(.debug_pubtypes) } - .debug_ranges 0 : { *(.debug_ranges) } - .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - - /DISCARD/ : { - *(.note.GNU-stack) - *(.branch_lt) - *(.data .data.* .gnu.linkonce.d.* .sdata*) - *(.bss .sbss .dynbss .dynsbss) - } -} - -/* - * Very old versions of ld do not recognize this name token; use the constant. - */ -#define PT_GNU_EH_FRAME 0x6474e550 - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; -} - -/* - * This controls what symbols we export from the DSO. 
- */ -VERSION -{ - VDSO_VERSION_STRING { - global: - /* - * Has to be there for the kernel to find - */ - __kernel_compat_restart_syscall; - __kernel_compat_rt_sigreturn; - __kernel_compat_sigreturn; - local: *; - }; -} diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S deleted file mode 100644 index 2e645003fdaf..000000000000 --- a/arch/s390/kernel/vdso32/vdso_user_wrapper.S +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include <linux/linkage.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> - -.macro vdso_syscall func,syscall - .globl __kernel_compat_\func - .type __kernel_compat_\func,@function - __ALIGN -__kernel_compat_\func: - CFI_STARTPROC - svc \syscall - /* Make sure we notice when a syscall returns, which shouldn't happen */ - .word 0 - CFI_ENDPROC - .size __kernel_compat_\func,.-__kernel_compat_\func -.endm - -vdso_syscall restart_syscall,__NR_restart_syscall -vdso_syscall sigreturn,__NR_sigreturn -vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore deleted file mode 100644 index 4ec80685fecc..000000000000 --- a/arch/s390/kernel/vdso64/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -vdso64.lds diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile deleted file mode 100644 index d8f0df742809..000000000000 --- a/arch/s390/kernel/vdso64/Makefile +++ /dev/null @@ -1,79 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso - -# Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile.include -obj-vdso64 = vdso_user_wrapper.o note.o vgetrandom-chacha.o -obj-cvdso64 = vdso64_generic.o getcpu.o vgetrandom.o -VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) -CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) -CFLAGS_REMOVE_vgetrandom.o = $(VDSO_CFLAGS_REMOVE) -CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) - -ifneq ($(c-getrandom-y),) - CFLAGS_vgetrandom.o += -include $(c-getrandom-y) -endif - -# Build rules - -targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg -obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) -obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64)) - -KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING - -KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_64 += -m64 - -KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_64 := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 := $(filter-out -munaligned-symbols,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin -fasynchronous-unwind-tables -ldflags-y := -shared -soname=linux-vdso64.so.1 \ - --hash-style=both --build-id=sha1 -T - -$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) -$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) - -obj-y += vdso64_wrapper.o -targets += vdso64.lds -CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) - -# Force dependency (incbin is bad) -$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so - -quiet_cmd_vdso_and_check = VDSO $@ - cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) - -# link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(obj)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) 
FORCE - $(call if_changed,vdso_and_check) - -# strip rule for the .so file -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -# assembly rules for the .S files -$(obj-vdso64): %.o: %.S FORCE - $(call if_changed_dep,vdso64as) - -$(obj-cvdso64): %.o: %.c FORCE - $(call if_changed_dep,vdso64cc) - -# actual build commands -quiet_cmd_vdso64as = VDSO64A $@ - cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< -quiet_cmd_vdso64cc = VDSO64C $@ - cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< - -# Generate VDSO offsets using helper script -gen-vdsosym := $(src)/gen_vdso_offsets.sh -quiet_cmd_vdsosym = VDSOSYM $@ - cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ - -include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE - $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/note.S b/arch/s390/kernel/vdso64/note.S deleted file mode 100644 index db19d0680a0a..000000000000 --- a/arch/s390/kernel/vdso64/note.S +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include <linux/uts.h> -#include <linux/version.h> -#include <linux/elfnote.h> - -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END diff --git a/arch/s390/kernel/vdso64/vdso64_wrapper.S b/arch/s390/kernel/vdso64/vdso64_wrapper.S deleted file mode 100644 index 672184998623..000000000000 --- a/arch/s390/kernel/vdso64/vdso64_wrapper.S +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/init.h> -#include <linux/linkage.h> -#include <asm/page.h> - - __PAGE_ALIGNED_DATA - - .globl vdso64_start, vdso64_end - .balign PAGE_SIZE -vdso64_start: - .incbin "arch/s390/kernel/vdso64/vdso64.so" - .balign PAGE_SIZE -vdso64_end: - - .previous diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 8609126961dc..53bcbb91bb9b 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -150,6 +150,15 @@ SECTIONS *(.altinstr_replacement) } +#ifdef CONFIG_STACKPROTECTOR + . = ALIGN(8); + .stack_prot_table : { + __stack_prot_start = .; + KEEP(*(__stack_protector_loc)) + __stack_prot_end = .; + } +#endif + /* * Table with the patch locations to undo expolines */ @@ -257,6 +266,10 @@ SECTIONS QUAD(invalid_pg_dir) QUAD(__alt_instructions) QUAD(__alt_instructions_end) +#ifdef CONFIG_STACKPROTECTOR + QUAD(__stack_prot_start) + QUAD(__stack_prot_end) +#endif #ifdef CONFIG_KASAN QUAD(kasan_early_shadow_page) QUAD(kasan_early_shadow_pte) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c62a868cf2b6..f55574af98cc 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -7,8 +7,7 @@ * Author(s): Carsten Otte <cotte@de.ibm.com> */ -#define KMSG_COMPONENT "kvm-s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "kvm-s390: " fmt #include <linux/cpufeature.h> #include <linux/interrupt.h> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..ff3a185f156c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -10,8 +10,7 @@ * Jason J. 
Herne <jjherne@us.ibm.com> */ -#define KMSG_COMPONENT "kvm-s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "kvm-s390: " fmt #include <linux/compiler.h> #include <linux/export.h> diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 9a71b6e00948..0b14d894f38a 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -754,7 +754,7 @@ int is_valid_psw(psw_t *psw) int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) { psw_t *gpsw = &vcpu->arch.sie_block->gpsw; - psw_compat_t new_psw; + psw32_t new_psw; u64 addr; int rc; u8 ar; diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index e2a6eb92420f..eb7ef63fab1e 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -321,8 +321,8 @@ static int cmm_timeout_handler(const struct ctl_table *ctl, int write, cmm_set_timeout(nr, seconds); *ppos += *lenp; } else { - len = sprintf(buf, "%ld %ld\n", - cmm_timeout_pages, cmm_timeout_seconds); + len = scnprintf(buf, sizeof(buf), "%ld %ld\n", + cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; memcpy(buffer, buf, len); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 528d7c70979f..89badbe72ae7 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -51,7 +51,7 @@ struct pg_state { struct seq_file *__m = (m); \ \ if (__m) \ - seq_printf(__m, fmt); \ + seq_puts(__m, fmt); \ }) static void print_prot(struct seq_file *m, unsigned int pr, int level) diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index f7da53e212f5..6cc33c705de2 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2004 */ -#define KMSG_COMPONENT "extmem" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "extmem: " fmt #include <linux/kernel.h> #include <linux/string.h> @@ -598,14 +597,16 @@ segment_save(char *name) goto out; } - sprintf(cmd1, "DEFSEG %s", name); + snprintf(cmd1, sizeof(cmd1), "DEFSEG %s", name); for (i=0; i<seg->segcnt; i++) { - sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", - seg->range[i].start >> PAGE_SHIFT, - seg->range[i].end >> PAGE_SHIFT, - segtype_string[seg->range[i].start & 0xff]); + size_t len = strlen(cmd1); + + snprintf(cmd1 + len, sizeof(cmd1) - len, " %lX-%lX %s", + seg->range[i].start >> PAGE_SHIFT, + seg->range[i].end >> PAGE_SHIFT, + segtype_string[seg->range[i].start & 0xff]); } - sprintf(cmd2, "SAVESEG %s", name); + snprintf(cmd2, sizeof(cmd2), "SAVESEG %s", name); response = 0; cpcmd(cmd1, NULL, 0, &response); if (response) { diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e1ad05bfd28a..e2e13778c36a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -23,7 +23,6 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> -#include <linux/compat.h> #include <linux/smp.h> #include <linux/kdebug.h> #include <linux/init.h> @@ -133,8 +132,17 @@ static void dump_fault_info(struct pt_regs *regs) union teid teid = { .val = regs->int_parm_long }; unsigned long asce; - pr_alert("Failing address: %016lx TEID: %016lx\n", + pr_alert("Failing address: %016lx TEID: %016lx", get_fault_address(regs), teid.val); + if (test_facility(131)) + pr_cont(" ESOP-2"); + else if (machine_has_esop()) + pr_cont(" ESOP-1"); + else + pr_cont(" SOP"); + if (test_facility(75)) + pr_cont(" FSI"); + pr_cont("\n"); pr_alert("Fault in "); switch (teid.as) { case PSW_BITS_AS_HOME: @@ -365,23 +373,20 @@ void do_protection_exception(struct pt_regs *regs) * The exception to this rule are aborted transactions, for 
these * the PSW already points to the correct location. */ - if (!(regs->int_code & 0x200)) + if (!(regs->int_code & 0x200)) { regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); + set_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED); + } /* - * Check for low-address protection. This needs to be treated - * as a special case because the translation exception code - * field is not guaranteed to contain valid data in this case. + * If bit 61 of the TEID is not set, the remainder of the + * TEID is unpredictable. Special handling is required. */ if (unlikely(!teid.b61)) { if (user_mode(regs)) { - /* Low-address protection in user mode: cannot happen */ dump_fault_info(regs); - die(regs, "Low-address protection"); + die(regs, "Unexpected TEID"); } - /* - * Low-address protection in kernel mode means - * NULL pointer write access in kernel mode. - */ + /* Assume low-address protection in kernel mode. */ return handle_fault_error_nolock(regs, 0); } if (unlikely(cpu_has_nx() && teid.b56)) { diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 8ff6bba107e8..603d9e5febb5 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -138,10 +138,7 @@ EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { - if (cpu_has_idte()) - __tlb_flush_idte(gmap->asce); - else - __tlb_flush_global(); + __tlb_flush_idte(gmap->asce); } static void gmap_radix_tree_free(struct radix_tree_root *root) @@ -1988,10 +1985,8 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, if (machine_has_tlb_guest()) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); - else if (cpu_has_idte()) - __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else - __pmdp_csp(pmdp); + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); set_pmd(pmdp, new); } @@ -2012,7 +2007,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, _SEGMENT_ENTRY_GMAP_UC | _SEGMENT_ENTRY)); if (purge) - __pmdp_csp(pmdp); + __pmdp_cspg(pmdp); set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } spin_unlock(&gmap->guest_table_lock); @@ -2033,17 +2028,6 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); /** - * gmap_pmdp_csp - csp all affected guest pmd entries - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) -{ - gmap_pmdp_clear(mm, vmaddr, 1); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_csp); - -/** * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry * @mm: pointer to the process mm_struct * @vmaddr: virtual address in the process address space @@ -2066,7 +2050,7 @@ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) if (machine_has_tlb_guest()) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); - else if (cpu_has_idte()) + else __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } @@ -2099,10 +2083,8 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) if (machine_has_tlb_guest()) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); - else if (cpu_has_idte()) - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); else - __pmdp_csp(pmdp); + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 72e8fa136af5..d42e61c7594e 100644 ---
a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -6,8 +6,7 @@ * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "hugetlb" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hugetlb: " fmt #include <linux/cpufeature.h> #include <linux/mm.h> diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 197c1d9497a7..2a222a7e14f4 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -15,7 +15,6 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/random.h> -#include <linux/compat.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <asm/elf.h> diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 348e759840e7..3042647c9dbf 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -78,10 +78,8 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, } table = (unsigned long *)((unsigned long)old & mask); crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val); - } else if (cpu_has_idte()) { - cspg(old, *old, new); } else { - csp((unsigned int *)old + 1, *old, new); + cspg(old, *old, new); } } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 626fca116cd7..7df23528c01b 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -164,6 +164,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) { struct ptdesc *ptdesc = virt_to_ptdesc(table); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); pagetable_dtor_free(ptdesc); } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 05974304d622..7ae77df276b5 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -360,14 +360,10 @@ static inline void pmdp_idte_global(struct mm_struct *mm, mm->context.asce, IDTE_GLOBAL); if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); - } else if (cpu_has_idte()) { + } else { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); - } else { - __pmdp_csp(pmdp); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_csp(mm, addr); } } @@ -487,14 +483,8 @@ static inline void pudp_idte_global(struct mm_struct *mm, if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - else if (cpu_has_idte()) - __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); else - /* - * Invalid bit position is the same for pmd and pud, so we can - * reuse _pmd_csp() here - */ - __pmdp_csp((pmd_t *) pudp); + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); } static inline pud_t pudp_flush_direct(struct mm_struct *mm, diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index f48ef361bc83..d96587b84e81 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,6 +4,7 @@ */ #include <linux/memory_hotplug.h> +#include <linux/bootmem_info.h> #include <linux/cpufeature.h> #include <linux/memblock.h> #include <linux/pfn.h> @@ -39,15 +40,21 @@ static void __ref *vmem_alloc_pages(unsigned int order) static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { + unsigned int nr_pages = 1 << order; + struct page *page; + if (altmap) { vmem_altmap_free(altmap, 1 << order); return; } - /* We don't expect boot memory to be removed ever. 
*/ - if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) - return; - free_pages(addr, order); + page = virt_to_page((void *)addr); + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_bootmem_page(page++); + } else { + free_pages(addr, order); + } } void *vmem_crst_alloc(unsigned long val) @@ -79,10 +86,6 @@ pte_t __ref *vmem_pte_alloc(void) static void vmem_pte_free(unsigned long *table) { - /* We don't expect boot memory to be removed ever. */ - if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(table)))) - return; page_table_free(&init_mm, table); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index cf461d76e9da..3238c178bed8 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -15,8 +15,7 @@ * Michael Holzheu <holzheu@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "bpf_jit" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "bpf_jit: " fmt #include <linux/netdevice.h> #include <linux/filter.h> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index c82c577db2bc..93d2c9c780fc 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -16,8 +16,7 @@ * Thomas Klein */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/slab.h> diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 45a1c36c5a54..72adc8f6e94f 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -7,8 +7,7 @@ * */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/slab.h> @@ -45,8 +44,10 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev) if (!zdev_enabled(zdev)) { rc = zpci_enable_device(zdev); - if (rc) + if (rc) { + pr_err("Enabling PCI function %08x failed\n", zdev->fid); return rc; + } } if (!zdev->has_resources) { diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 241f7251c873..177aa0214547 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -6,10 +6,8 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt -#include <linux/compat.h> #include <linux/kernel.h> #include <linux/miscdevice.h> #include <linux/slab.h> @@ -651,7 +649,7 @@ static long clp_misc_ioctl(struct file *filp, unsigned int cmd, if (cmd != CLP_SYNC) return -EINVAL; - argp = is_compat_task() ? 
compat_ptr(arg) : (void __user *) arg; + argp = (void __user *)arg; if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; if (req.r != 0) @@ -669,7 +667,6 @@ static const struct file_operations clp_misc_fops = { .open = nonseekable_open, .release = clp_misc_release, .unlocked_ioctl = clp_misc_ioctl, - .compat_ioctl = clp_misc_ioctl, }; static struct miscdevice clp_misc_device = { diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 38014206c16b..c7ed7bf254b5 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -6,8 +6,7 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/seq_file.h> diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 27db1e72c623..839bd91c056e 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -6,8 +6,7 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/pci.h> diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c index 191e56a623f6..13050ce5c3e9 100644 --- a/arch/s390/pci/pci_iov.c +++ b/arch/s390/pci/pci_iov.c @@ -7,8 +7,7 @@ * */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/pci.h> diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index e73be96ce5fe..2a06df8c2498 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/irq.h> diff --git a/arch/s390/pci/pci_report.c b/arch/s390/pci/pci_report.c index 1b494e5ecc4d..7030f7052926 100644 --- a/arch/s390/pci/pci_report.c +++ b/arch/s390/pci/pci_report.c @@ -7,8 +7,7 @@ * */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/sprintf.h> diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index 12060870e2aa..c2444a23e26c 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -6,8 +6,7 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/stat.h> diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index d5c68ade71ab..2d28a569f793 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -29,6 +29,7 @@ static struct facility_def facility_defs[] = { .bits = (int[]){ 0, /* N3 instructions */ 1, /* z/Arch mode installed */ + 3, /* dat-enhancement 1 */ 18, /* long displacement facility */ 21, /* extended-immediate facility */ 25, /* store clock fast */ diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures index 250c10627ab3..733d5aff2456 100644 --- a/arch/x86/Kconfig.cpufeatures +++ b/arch/x86/Kconfig.cpufeatures @@ -124,6 +124,10 @@ config X86_DISABLED_FEATURE_PCID def_bool y depends on !X86_64 +config X86_DISABLED_FEATURE_LASS + def_bool y + depends on X86_32 + config X86_DISABLED_FEATURE_PKU def_bool y depends on !X86_INTEL_MEMORY_PROTECTION_KEYS diff --git 
a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 48d3076b6053..3fd2423d3cf8 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2 Architecture: x86_64 using: - AVX2 (Advanced Vector Extensions 2) -config CRYPTO_POLYVAL_CLMUL_NI - tristate "Hash functions: POLYVAL (CLMUL-NI)" - depends on 64BIT - select CRYPTO_POLYVAL - help - POLYVAL hash function for HCTR2 - - Architecture: x86_64 using: - - CLMUL-NI (carry-less multiplication new instructions) - config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" depends on 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2d30d5d36145..5f2fb4f148fe 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -46,15 +46,13 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ aes-gcm-aesni-x86_64.o \ - aes-xts-avx-x86_64.o \ - aes-gcm-avx10-x86_64.o + aes-gcm-vaes-avx2.o \ + aes-gcm-vaes-avx512.o \ + aes-xts-avx-x86_64.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o -obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o -polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o - obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S index 45940e2883a0..7c8a8a32bd3c 100644 --- a/arch/x86/crypto/aes-gcm-aesni-x86_64.S +++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S @@ -61,15 +61,15 @@ // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) // -// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is +// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is // more thoroughly commented. This file has the following notable changes: // // - The vector length is fixed at 128-bit, i.e. xmm registers. This means // there is only one AES block (and GHASH block) per register. // -// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of -// 32. We work around this by being much more careful about using -// registers, relying heavily on loads to load values as they are needed. +// - Without AVX512, only 16 SIMD registers are available instead of 32. We +// work around this by being much more careful about using registers, +// relying heavily on loads to load values as they are needed. // // - Masking is not available either. We work around this by implementing // partial block loads and stores using overlapping scalar loads and stores @@ -90,8 +90,8 @@ // multiplication instead of schoolbook multiplication. This saves one // pclmulqdq instruction per block, at the cost of one 64-bit load, one // pshufd, and 0.25 pxors per block. (This is without the three-argument -// XOR support that would be provided by AVX512 / AVX10, which would be -// more beneficial to schoolbook than Karatsuba.) +// XOR support that would be provided by AVX512, which would be more +// beneficial to schoolbook than Karatsuba.) 
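To make the Karatsuba-versus-schoolbook comparison concrete, a minimal C model is sketched below. clmul64() is a slow bitwise stand-in for one pclmulqdq instruction and clmul128_karatsuba() is illustrative only, not a kernel API; the point is that the middle term costs one clmul instead of two because it folds out of LO and HI.

#include <stdint.h>

/* Bitwise 64x64 -> 128-bit carry-less multiply; a slow, illustrative
 * stand-in for a single pclmulqdq instruction. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	uint64_t l = 0, h = 0;
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i)
				h ^= a >> (64 - i);
		}
	}
	*lo = l;
	*hi = h;
}

/* Karatsuba needs three clmuls for a 128x128-bit carry-less product
 * because the middle term folds out of the other two:
 *   MI = (a_L ^ a_H) * (b_L ^ b_H) ^ LO ^ HI
 * Schoolbook needs four clmuls but fewer xors and shuffles. */
static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
			       uint64_t out[4])
{
	uint64_t lo0, lo1, hi0, hi1, mi0, mi1;

	clmul64(a[0], b[0], &lo0, &lo1);		/* LO = a_L * b_L */
	clmul64(a[1], b[1], &hi0, &hi1);		/* HI = a_H * b_H */
	clmul64(a[0] ^ a[1], b[0] ^ b[1], &mi0, &mi1);
	mi0 ^= lo0 ^ hi0;				/* finalize MI */
	mi1 ^= lo1 ^ hi1;
	out[0] = lo0;					/* 256-bit result: */
	out[1] = lo1 ^ mi0;				/* LO ^ (MI << 64)  */
	out[2] = hi0 ^ mi1;				/* ^ (HI << 128)    */
	out[3] = hi1;
}

This folding is also why the new VAES/AVX2 file below caches the XOR of the two halves of each key power (h_powers_xored): the (a_L ^ a_H) operand is constant per key power and can be computed once at key setup.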
// // As a rough approximation, we can assume that Karatsuba multiplication is // faster than schoolbook multiplication in this context if one pshufd and diff --git a/arch/x86/crypto/aes-gcm-vaes-avx2.S b/arch/x86/crypto/aes-gcm-vaes-avx2.S new file mode 100644 index 000000000000..93c9504a488f --- /dev/null +++ b/arch/x86/crypto/aes-gcm-vaes-avx2.S @@ -0,0 +1,1146 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-GCM implementation for x86_64 CPUs that support the following CPU +// features: VAES && VPCLMULQDQ && AVX2 +// +// Copyright 2025 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// ----------------------------------------------------------------------------- +// +// This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512. +// This means it can only use 16 vector registers instead of 32, the maximum +// vector length is 32 bytes, and some instructions such as vpternlogd and +// masked loads/stores are unavailable. However, it is able to run on CPUs that +// have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs), +// various Intel client CPUs such as Alder Lake, and Intel Sierra Forest. +// +// This implementation also uses Karatsuba multiplication instead of schoolbook +// multiplication for GHASH in its main loop. This does not help much on Intel, +// but it improves performance by ~5% on AMD Zen 3. 
Other factors weighing +// slightly in favor of Karatsuba multiplication in this implementation are the +// lower maximum vector length (which means there are fewer key powers, so we +// can cache the halves of each key power XOR'd together and still use less +// memory than the AVX512 implementation), and the unavailability of the +// vpternlogd instruction (which helped schoolbook a bit more than Karatsuba). + +#include <linux/linkage.h> + +.section .rodata +.p2align 4 + + // The below three 16-byte values must be in the order that they are, as + // they are really two 32-byte tables and a 16-byte value that overlap: + // + // - The first 32-byte table begins at .Lselect_high_bytes_table. + // For 0 <= len <= 16, the 16-byte value at + // '.Lselect_high_bytes_table + len' selects the high 'len' bytes of + // another 16-byte value when AND'ed with it. + // + // - The second 32-byte table begins at .Lrshift_and_bswap_table. + // For 0 <= len <= 16, the 16-byte value at + // '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the + // following operation: right-shift by '16 - len' bytes (shifting in + // zeroes), then reflect all 16 bytes. + // + // - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects + // all 16 bytes. +.Lselect_high_bytes_table: + .octa 0 +.Lrshift_and_bswap_table: + .octa 0xffffffffffffffffffffffffffffffff +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + + // Sixteen 0x0f bytes. By XOR'ing an entry of .Lrshift_and_bswap_table + // with this, we get a mask that left-shifts by '16 - len' bytes. +.Lfifteens: + .octa 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f + + // This is the GHASH reducing polynomial without its constant term, i.e. + // x^128 + x^7 + x^2 + x, represented using the backwards mapping + // between bits and polynomial coefficients. + // + // Alternatively, it can be interpreted as the naturally-ordered + // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + // "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .octa 0xc2000000000000000000000000000001 + + // Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + + // Values needed to prepare the initial vector of counter blocks. +.Lctr_pattern: + .octa 0 + .octa 1 + + // The number of AES blocks per vector, as a 128-bit value. +.Linc_2blocks: + .octa 2 + +// Offsets in struct aes_gcm_key_vaes_avx2 +#define OFFSETOF_AESKEYLEN 480 +#define OFFSETOF_H_POWERS 512 +#define NUM_H_POWERS 8 +#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) +#define OFFSETOF_H_POWERS_XORED OFFSETOFEND_H_POWERS + +.text + +// Do one step of GHASH-multiplying the 128-bit lanes of \a by the 128-bit lanes +// of \b and storing the reduced products in \dst. Uses schoolbook +// multiplication. 
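Before the multiplication macros, a scalar model of the overlapping .rodata tables above may help. The 48 bytes form two 32-byte sliding-window tables plus the reflection mask; pshufb() below is a hypothetical one-lane C model of the vpshufb instruction, and the whole sketch is illustrative only.

#include <stdint.h>

/* Memory image of the three overlapping 16-byte .rodata values. */
static const uint8_t tables[48] = {
	/* .Lselect_high_bytes_table */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* .Lrshift_and_bswap_table */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* .Lbswap_mask (little-endian bytes of the .octa value) */
	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
};

/* One-lane scalar model of vpshufb: a mask byte with the high bit set
 * selects zero, otherwise its low four bits index the source block. */
static void pshufb(uint8_t dst[16], const uint8_t src[16],
		   const uint8_t mask[16])
{
	for (int j = 0; j < 16; j++)
		dst[j] = (mask[j] & 0x80) ? 0 : src[mask[j] & 0x0f];
}

/*
 * For 0 <= len <= 16:
 *  - AND'ing with 'tables + len' keeps the high 'len' bytes of a block,
 *    because that window holds 16 - len zeroes followed by len 0xff bytes;
 *  - 'tables + 16 + len' as a pshufb mask right-shifts by 16 - len bytes
 *    and reflects the result, because that window holds 16 - len 0xff
 *    bytes followed by the descending indices 15, 14, ...
 */

The 17-to-31-byte tail paths later in the file index .Lrshift_and_bswap_table with the residual length to build exactly these masks.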
+.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 +.if \i == 0 + vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L + vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H +.elseif \i == 1 + vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L +.elseif \i == 2 + vpxor \t2, \t1, \t1 // MI = MI_0 + MI_1 +.elseif \i == 3 + vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) +.elseif \i == 4 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO +.elseif \i == 5 + vpxor \t0, \t1, \t1 // Fold LO into MI (part 1) + vpxor \t2, \t1, \t1 // Fold LO into MI (part 2) +.elseif \i == 6 + vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H +.elseif \i == 7 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI +.elseif \i == 9 + vpxor \t1, \dst, \dst // Fold MI into HI (part 1) + vpxor \t0, \dst, \dst // Fold MI into HI (part 2) +.endif +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store +// the reduced products in \dst. See _ghash_mul_step for full explanation. +.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 +.endr +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the +// *unreduced* products to \lo, \mi, and \hi. +.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0 + vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L + vpxor \t0, \lo, \lo + vpclmulqdq $0x01, \a, \b, \t0 // a_L * b_H + vpxor \t0, \mi, \mi + vpclmulqdq $0x10, \a, \b, \t0 // a_H * b_L + vpxor \t0, \mi, \mi + vpclmulqdq $0x11, \a, \b, \t0 // a_H * b_H + vpxor \t0, \hi, \hi +.endm + +// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit +// reduced products in \hi. See _ghash_mul_step for explanation of reduction. +.macro _ghash_reduce lo, mi, hi, gfpoly, t0 + vpclmulqdq $0x01, \lo, \gfpoly, \t0 + vpshufd $0x4e, \lo, \lo + vpxor \lo, \mi, \mi + vpxor \t0, \mi, \mi + vpclmulqdq $0x01, \mi, \gfpoly, \t0 + vpshufd $0x4e, \mi, \mi + vpxor \mi, \hi, \hi + vpxor \t0, \hi, \hi +.endm + +// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it +// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. +.macro _ghash_square a, dst, gfpoly, t0, t1 + vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L + vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H + vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) + vpshufd $0x4e, \t0, \t0 // Swap halves of LO + vpxor \t0, \t1, \t1 // Fold LO into MI + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) + vpshufd $0x4e, \t1, \t1 // Swap halves of MI + vpxor \t1, \dst, \dst // Fold MI into HI (part 1) + vpxor \t0, \dst, \dst // Fold MI into HI (part 2) +.endm + +// void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); +// +// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and +// initialize |key->h_powers| and |key->h_powers_xored|. +// +// We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to +// store the 64-bit halves of the key powers XOR'd together (for Karatsuba +// multiplication) in the order 8,6,7,5,4,2,3,1. 
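The 8,6,7,5,4,2,3,1 order mentioned above falls straight out of the vpunpcklqdq/vpunpckhqdq pairing on the [H^8,H^7]/[H^6,H^5] and [H^4,H^3]/[H^2,H^1] register pairs; as a fixed permutation in C it looks like the sketch below (compute_h_powers_xored() and struct ghash_key_power are illustrative, not kernel definitions).

#include <stdint.h>

struct ghash_key_power { uint64_t lo, hi; };	/* one 16-byte key power */

/* h_powers[0..7] holds H^8 down to H^1, matching the layout described
 * above.  Each output word is the XOR of the two 64-bit halves of one
 * key power, emitted in the interleaved order 8,6,7,5,4,2,3,1. */
static void compute_h_powers_xored(const struct ghash_key_power h_powers[8],
				   uint64_t h_powers_xored[8])
{
	static const int order[8] = { 0, 2, 1, 3, 4, 6, 5, 7 };

	for (int i = 0; i < 8; i++) {
		const struct ghash_key_power *h = &h_powers[order[i]];

		h_powers_xored[i] = h->lo ^ h->hi;
	}
}

Caching these halves leaves only one vpunpckhqdq and one vpxor on the data side of each Karatsuba middle-term multiplication in the main loops.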
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx2) + + // Function arguments + .set KEY, %rdi + + // Additional local variables + .set POWERS_PTR, %rsi + .set RNDKEYLAST_PTR, %rdx + .set TMP0, %ymm0 + .set TMP0_XMM, %xmm0 + .set TMP1, %ymm1 + .set TMP1_XMM, %xmm1 + .set TMP2, %ymm2 + .set TMP2_XMM, %xmm2 + .set H_CUR, %ymm3 + .set H_CUR_XMM, %xmm3 + .set H_CUR2, %ymm4 + .set H_INC, %ymm5 + .set H_INC_XMM, %xmm5 + .set GFPOLY, %ymm6 + .set GFPOLY_XMM, %xmm6 + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + vmovdqu (KEY), H_CUR_XMM // Zero-th round key XOR all-zeroes block + lea 16(KEY), %rax +1: + vaesenc (%rax), H_CUR_XMM, H_CUR_XMM + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + vaesenclast (RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM + + // Reflect the bytes of the raw hash subkey. + vpshufb .Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM + + // Finish preprocessing the byte-reflected hash subkey by multiplying it + // by x^-1 ("standard" interpretation of polynomial coefficients) or + // equivalently x^1 (natural interpretation). This gets the key into a + // format that avoids having to bit-reflect the data blocks later. + vpshufd $0xd3, H_CUR_XMM, TMP0_XMM + vpsrad $31, TMP0_XMM, TMP0_XMM + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM + vpand .Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM + vpxor TMP0_XMM, H_CUR_XMM, H_CUR_XMM + + // Load the gfpoly constant. + vbroadcasti128 .Lgfpoly(%rip), GFPOLY + + // Square H^1 to get H^2. + _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM + + // Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. + vinserti128 $1, H_CUR_XMM, H_INC, H_CUR + vinserti128 $1, H_INC_XMM, H_INC, H_INC + + // Compute H_CUR2 = [H^4, H^3]. + _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 + + // Store [H^2, H^1] and [H^4, H^3]. + vmovdqu H_CUR, OFFSETOF_H_POWERS+3*32(KEY) + vmovdqu H_CUR2, OFFSETOF_H_POWERS+2*32(KEY) + + // For Karatsuba multiplication: compute and store the two 64-bit halves + // of each key power XOR'd together. Order is 4,2,3,1. + vpunpcklqdq H_CUR, H_CUR2, TMP0 + vpunpckhqdq H_CUR, H_CUR2, TMP1 + vpxor TMP1, TMP0, TMP0 + vmovdqu TMP0, OFFSETOF_H_POWERS_XORED+32(KEY) + + // Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. + _ghash_mul H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2 + _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 + vmovdqu H_CUR, OFFSETOF_H_POWERS+1*32(KEY) + vmovdqu H_CUR2, OFFSETOF_H_POWERS+0*32(KEY) + + // Again, compute and store the two 64-bit halves of each key power + // XOR'd together. Order is 8,6,7,5. + vpunpcklqdq H_CUR, H_CUR2, TMP0 + vpunpckhqdq H_CUR, H_CUR2, TMP1 + vpxor TMP1, TMP0, TMP0 + vmovdqu TMP0, OFFSETOF_H_POWERS_XORED(KEY) + + vzeroupper + RET +SYM_FUNC_END(aes_gcm_precompute_vaes_avx2) + +// Do one step of the GHASH update of four vectors of data blocks. +// \i: the step to do, 0 through 9 +// \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) +// KEY: pointer to struct aes_gcm_key_vaes_avx2 +// BSWAP_MASK: mask for reflecting the bytes of blocks +// H_POW[2-1]_XORED: cached values from KEY->h_powers_xored +// TMP[0-2]: temporary registers. TMP[1-2] must be preserved across steps. 
+// LO, MI: working state for this macro that must be preserved across steps +// GHASH_ACC: the GHASH accumulator (input/output) +.macro _ghash_step_4x i, ghashdata_ptr + .set HI, GHASH_ACC # alias + .set HI_XMM, GHASH_ACC_XMM +.if \i == 0 + // First vector + vmovdqu 0*32(\ghashdata_ptr), TMP1 + vpshufb BSWAP_MASK, TMP1, TMP1 + vmovdqu OFFSETOF_H_POWERS+0*32(KEY), TMP2 + vpxor GHASH_ACC, TMP1, TMP1 + vpclmulqdq $0x00, TMP2, TMP1, LO + vpclmulqdq $0x11, TMP2, TMP1, HI + vpunpckhqdq TMP1, TMP1, TMP0 + vpxor TMP1, TMP0, TMP0 + vpclmulqdq $0x00, H_POW2_XORED, TMP0, MI +.elseif \i == 1 +.elseif \i == 2 + // Second vector + vmovdqu 1*32(\ghashdata_ptr), TMP1 + vpshufb BSWAP_MASK, TMP1, TMP1 + vmovdqu OFFSETOF_H_POWERS+1*32(KEY), TMP2 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 + vpxor TMP0, LO, LO + vpclmulqdq $0x11, TMP2, TMP1, TMP0 + vpxor TMP0, HI, HI + vpunpckhqdq TMP1, TMP1, TMP0 + vpxor TMP1, TMP0, TMP0 + vpclmulqdq $0x10, H_POW2_XORED, TMP0, TMP0 + vpxor TMP0, MI, MI +.elseif \i == 3 + // Third vector + vmovdqu 2*32(\ghashdata_ptr), TMP1 + vpshufb BSWAP_MASK, TMP1, TMP1 + vmovdqu OFFSETOF_H_POWERS+2*32(KEY), TMP2 +.elseif \i == 4 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 + vpxor TMP0, LO, LO + vpclmulqdq $0x11, TMP2, TMP1, TMP0 + vpxor TMP0, HI, HI +.elseif \i == 5 + vpunpckhqdq TMP1, TMP1, TMP0 + vpxor TMP1, TMP0, TMP0 + vpclmulqdq $0x00, H_POW1_XORED, TMP0, TMP0 + vpxor TMP0, MI, MI + + // Fourth vector + vmovdqu 3*32(\ghashdata_ptr), TMP1 + vpshufb BSWAP_MASK, TMP1, TMP1 +.elseif \i == 6 + vmovdqu OFFSETOF_H_POWERS+3*32(KEY), TMP2 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 + vpxor TMP0, LO, LO + vpclmulqdq $0x11, TMP2, TMP1, TMP0 + vpxor TMP0, HI, HI + vpunpckhqdq TMP1, TMP1, TMP0 + vpxor TMP1, TMP0, TMP0 + vpclmulqdq $0x10, H_POW1_XORED, TMP0, TMP0 + vpxor TMP0, MI, MI +.elseif \i == 7 + // Finalize 'mi' following Karatsuba multiplication. + vpxor LO, MI, MI + vpxor HI, MI, MI + + // Fold lo into mi. + vbroadcasti128 .Lgfpoly(%rip), TMP2 + vpclmulqdq $0x01, LO, TMP2, TMP0 + vpshufd $0x4e, LO, LO + vpxor LO, MI, MI + vpxor TMP0, MI, MI +.elseif \i == 8 + // Fold mi into hi. + vpclmulqdq $0x01, MI, TMP2, TMP0 + vpshufd $0x4e, MI, MI + vpxor MI, HI, HI + vpxor TMP0, HI, HI +.elseif \i == 9 + vextracti128 $1, HI, TMP0_XMM + vpxor TMP0_XMM, HI_XMM, GHASH_ACC_XMM +.endif +.endm + +// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full +// explanation. +.macro _ghash_4x ghashdata_ptr +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_step_4x \i, \ghashdata_ptr +.endr +.endm + +// Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 16 bytes. + vmovq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + vpinsrq $1, %rax, \dst, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. 
+ add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + vmovq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _store_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 16 bytes. + vpextrq $1, \src, %rax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes + vmovq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + vpextrd $1, \src, %eax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes + vmovd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + vpextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + vpextrb $1, \src, 1(\dst) + je .Ldone\@ + vpextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, +// u8 ghash_acc[16], +// const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all +// zeroes. |aadlen| must be a multiple of 16, except on the last call where it +// can be any length. The caller must do any buffering needed to ensure this. +// +// This handles large amounts of AAD efficiently, while also keeping overhead +// low for small amounts which is the common case. TLS and IPsec use less than +// one block of AAD, but (uncommonly) other use cases may use much more. +SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2) + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx // Must be %ecx for _load_partial_block + .set AADLEN64, %rcx // Zero-extend AADLEN before using! + + // Additional local variables. + // %rax and %r8 are used as temporary registers. + .set TMP0, %ymm0 + .set TMP0_XMM, %xmm0 + .set TMP1, %ymm1 + .set TMP1_XMM, %xmm1 + .set TMP2, %ymm2 + .set TMP2_XMM, %xmm2 + .set LO, %ymm3 + .set LO_XMM, %xmm3 + .set MI, %ymm4 + .set MI_XMM, %xmm4 + .set GHASH_ACC, %ymm5 + .set GHASH_ACC_XMM, %xmm5 + .set BSWAP_MASK, %ymm6 + .set BSWAP_MASK_XMM, %xmm6 + .set GFPOLY, %ymm7 + .set GFPOLY_XMM, %xmm7 + .set H_POW2_XORED, %ymm8 + .set H_POW1_XORED, %ymm9 + + // Load the bswap_mask and gfpoly constants. Since AADLEN is usually + // small, usually only 128-bit vectors will be used. So as an + // optimization, don't broadcast these constants to both 128-bit lanes + // quite yet. + vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM + vmovdqu .Lgfpoly(%rip), GFPOLY_XMM + + // Load the GHASH accumulator. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + + // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. + test AADLEN, AADLEN + jz .Laad_done + cmp $16, AADLEN + jle .Laad_lastblock + + // AADLEN > 16, so we'll operate on full vectors. Broadcast bswap_mask + // and gfpoly to both 128-bit lanes. + vinserti128 $1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK + vinserti128 $1, GFPOLY_XMM, GFPOLY, GFPOLY + + // If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time. 
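(Returning to the partial-block helpers above: the key trick is two overlapping scalar accesses plus a shift that discards the doubly-covered bytes, so no byte outside the buffer is ever touched. Below is a C model of the 9 <= len <= 16 load path; load_partial_9to16() is hypothetical and, like the asm, assumes a little-endian CPU.)

#include <stdint.h>
#include <string.h>

/* Model of the 9 <= len <= 16 case of _load_partial_block: the second
 * 8-byte load starts at src + len - 8, so it overlaps the first one;
 * shifting right by 8 * (16 - len) bits drops the overlap and leaves
 * the tail zero-extended, matching the asm's neg/shl/shr sequence. */
static void load_partial_9to16(const uint8_t *src, unsigned int len,
			       uint64_t out[2])
{
	uint64_t first, last;

	memcpy(&first, src, 8);			/* bytes 0 .. 7 */
	memcpy(&last, src + len - 8, 8);	/* bytes len-8 .. len-1 */
	last >>= 8 * (16 - len);		/* discard overlapping bytes */
	out[0] = first;				/* low half of the xmm */
	out[1] = last;				/* high half, zero-padded */
}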
+ add $-128, AADLEN // 128 is 4 bytes, -128 is 1 byte + jl .Laad_loop_4x_done + vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED + vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED +.Laad_loop_4x: + _ghash_4x AAD + sub $-128, AAD + add $-128, AADLEN + jge .Laad_loop_4x +.Laad_loop_4x_done: + + // If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time. + add $96, AADLEN + jl .Laad_loop_1x_done +.Laad_loop_1x: + vmovdqu (AAD), TMP0 + vpshufb BSWAP_MASK, TMP0, TMP0 + vpxor TMP0, GHASH_ACC, GHASH_ACC + vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 + _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO + vextracti128 $1, GHASH_ACC, TMP0_XMM + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + add $32, AAD + sub $32, AADLEN + jge .Laad_loop_1x +.Laad_loop_1x_done: + add $32, AADLEN + // Now 0 <= AADLEN < 32. + + jz .Laad_done + cmp $16, AADLEN + jle .Laad_lastblock + + // Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD. + mov AADLEN, AADLEN // Zero-extend AADLEN to AADLEN64. + vmovdqu (AAD), TMP0_XMM + vmovdqu -16(AAD, AADLEN64), TMP1_XMM + vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + lea .Lrshift_and_bswap_table(%rip), %rax + vpshufb -16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM + vinserti128 $1, TMP1_XMM, GHASH_ACC, GHASH_ACC + vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 + _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO + vextracti128 $1, GHASH_ACC, TMP0_XMM + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + jmp .Laad_done + +.Laad_lastblock: + // Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD. + _load_partial_block AAD, TMP0_XMM, %r8, %r8d + vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + vmovdqu OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM + _ghash_mul TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ + TMP1_XMM, TMP2_XMM, LO_XMM + +.Laad_done: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper + RET +SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2) + +// Do one non-last round of AES encryption on the blocks in the given AESDATA +// vectors using the round key that has been broadcast to all 128-bit lanes of +// \round_key. +.macro _vaesenc round_key, vecs:vararg +.irp i, \vecs + vaesenc \round_key, AESDATA\i, AESDATA\i +.endr +.endm + +// Generate counter blocks in the given AESDATA vectors, then do the zero-th AES +// round on them. Clobbers TMP0. +.macro _ctr_begin vecs:vararg + vbroadcasti128 .Linc_2blocks(%rip), TMP0 +.irp i, \vecs + vpshufb BSWAP_MASK, LE_CTR, AESDATA\i + vpaddd TMP0, LE_CTR, LE_CTR +.endr +.irp i, \vecs + vpxor RNDKEY0, AESDATA\i, AESDATA\i +.endr +.endm + +// Generate and encrypt counter blocks in the given AESDATA vectors, excluding +// the last AES round. Clobbers %rax and TMP0. +.macro _aesenc_loop vecs:vararg + _ctr_begin \vecs + lea 16(KEY), %rax +.Laesenc_loop\@: + vbroadcasti128 (%rax), TMP0 + _vaesenc TMP0, \vecs + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne .Laesenc_loop\@ +.endm + +// Finalize the keystream blocks in the given AESDATA vectors by doing the last +// AES round, then XOR those keystream blocks with the corresponding data. +// Reduce latency by doing the XOR before the vaesenclast, utilizing the +// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). Clobbers TMP0. 
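The XOR-folding identity used here is worth spelling out: the last AES round has no MixColumns, so

    vaesenclast(key, a) = ShiftRows(SubBytes(a)) ^ key

and XOR'ing any value b into the result is the same as XOR'ing it into the round key first:

    vaesenclast(key, a) ^ b = ShiftRows(SubBytes(a)) ^ (key ^ b)
                            = vaesenclast(key ^ b, a)

The key ^ b term depends only on a memory load, so it can be computed while the preceding vaesenc rounds are still in flight, shortening the dependency chain after the final round.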
+.macro _aesenclast_and_xor vecs:vararg +.irp i, \vecs + vpxor \i*32(SRC), RNDKEYLAST, TMP0 + vaesenclast TMP0, AESDATA\i, AESDATA\i +.endr +.irp i, \vecs + vmovdqu AESDATA\i, \i*32(DST) +.endr +.endm + +// void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). The function computes the +// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, +// and writes the resulting encrypted or decrypted data to |dst|. It also +// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext +// bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. This +// function loads the counter from |le_ctr| and increments the loaded counter as +// needed, but it does *not* store the updated counter back to |le_ctr|. The +// caller must update |le_ctr| if any more data segments follow. Internally, +// only the low 32-bit word of the counter is incremented, following the GCM +// standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set LE_CTR_PTR32, %esi + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx // Assumed to be %rcx. + // See .Ltail_xor_and_ghash_1to16bytes + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + + // Additional local variables + + // %rax is used as a temporary register. LE_CTR_PTR is also available + // as a temporary register after the counter is loaded. + + // AES key length in bytes + .set AESKEYLEN, %r10d + .set AESKEYLEN64, %r10 + + // Pointer to the last AES round key for the chosen AES variant + .set RNDKEYLAST_PTR, %r11 + + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + // using vpshufb, copied to all 128-bit lanes. + .set BSWAP_MASK, %ymm0 + .set BSWAP_MASK_XMM, %xmm0 + + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + // only the lowest 128-bit lane can be nonzero. When not fully reduced, + // more than one lane may be used, and they need to be XOR'd together. + .set GHASH_ACC, %ymm1 + .set GHASH_ACC_XMM, %xmm1 + + // TMP[0-2] are temporary registers. + .set TMP0, %ymm2 + .set TMP0_XMM, %xmm2 + .set TMP1, %ymm3 + .set TMP1_XMM, %xmm3 + .set TMP2, %ymm4 + .set TMP2_XMM, %xmm4 + + // LO and MI are used to accumulate unreduced GHASH products. + .set LO, %ymm5 + .set LO_XMM, %xmm5 + .set MI, %ymm6 + .set MI_XMM, %xmm6 + + // H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored. The + // descending numbering reflects the order of the key powers. + .set H_POW2_XORED, %ymm7 + .set H_POW2_XORED_XMM, %xmm7 + .set H_POW1_XORED, %ymm8 + + // RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. + .set RNDKEY0, %ymm9 + .set RNDKEYLAST, %ymm10 + + // LE_CTR contains the next set of little-endian counter blocks. + .set LE_CTR, %ymm11 + + // AESDATA[0-3] hold the counter blocks that are being encrypted by AES. 
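A small aside before the register assignments continue: keeping the counter little-endian makes the standard's inc32() behavior automatic. vpaddd adds independent 32-bit lanes with no carry between them, so adding to dword 0 of LE_CTR wraps modulo 2^32 exactly as GCM requires, and the vpshufb with BSWAP_MASK then yields the big-endian counter block. A hypothetical C equivalent of the increment:

#include <stdint.h>

/* Sketch of the LE_CTR update: GCM's inc32() increments only the 32-bit
 * block counter, modulo 2^32, with no carry into the rest of the block.
 * A per-dword add gives exactly that.  le_ctr_add() is illustrative. */
static void le_ctr_add(uint32_t le_ctr[4], uint32_t nblocks)
{
	le_ctr[0] += nblocks;	/* wraps mod 2^32; words 1..3 untouched */
}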
+ .set AESDATA0, %ymm12 + .set AESDATA0_XMM, %xmm12 + .set AESDATA1, %ymm13 + .set AESDATA1_XMM, %xmm13 + .set AESDATA2, %ymm14 + .set AESDATA3, %ymm15 + +.if \enc + .set GHASHDATA_PTR, DST +.else + .set GHASHDATA_PTR, SRC +.endif + + vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK + + // Load the GHASH accumulator and the starting counter. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + vbroadcasti128 (LE_CTR_PTR), LE_CTR + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Make RNDKEYLAST_PTR point to the last AES round key. This is the + // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + // respectively. Then load the zero-th and last round keys. + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + vbroadcasti128 (KEY), RNDKEY0 + vbroadcasti128 (RNDKEYLAST_PTR), RNDKEYLAST + + // Finish initializing LE_CTR by adding 1 to the second block. + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR + + // If there are at least 128 bytes of data, then continue into the loop + // that processes 128 bytes of data at a time. Otherwise skip it. + add $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte + jl .Lcrypt_loop_4x_done\@ + + vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED + vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED + + // Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. + +.if \enc + // Encrypt the first 4 vectors of plaintext blocks. + _aesenc_loop 0,1,2,3 + _aesenclast_and_xor 0,1,2,3 + sub $-128, SRC // 128 is 4 bytes, -128 is 1 byte + add $-128, DATALEN + jl .Lghash_last_ciphertext_4x\@ +.endif + +.align 16 +.Lcrypt_loop_4x\@: + + // Start the AES encryption of the counter blocks. + _ctr_begin 0,1,2,3 + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vbroadcasti128 -13*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 + vbroadcasti128 -12*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 +192: + vbroadcasti128 -11*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 + vbroadcasti128 -10*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 +128: + + // Finish the AES encryption of the counter blocks in AESDATA[0-3], + // interleaved with the GHASH update of the ciphertext blocks. +.irp i, 9,8,7,6,5,4,3,2,1 + _ghash_step_4x (9 - \i), GHASHDATA_PTR + vbroadcasti128 -\i*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 +.endr + _ghash_step_4x 9, GHASHDATA_PTR +.if \enc + sub $-128, DST // 128 is 4 bytes, -128 is 1 byte +.endif + _aesenclast_and_xor 0,1,2,3 + sub $-128, SRC +.if !\enc + sub $-128, DST +.endif + add $-128, DATALEN + jge .Lcrypt_loop_4x\@ + +.if \enc +.Lghash_last_ciphertext_4x\@: + // Update GHASH with the last set of ciphertext blocks. + _ghash_4x DST + sub $-128, DST +.endif + +.Lcrypt_loop_4x_done\@: + + // Undo the extra subtraction by 128 and check whether data remains. + sub $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte + jz .Ldone\@ + + // The data length isn't a multiple of 128 bytes. Process the remaining + // data of length 1 <= DATALEN < 128. + // + // Since there are enough key powers available for all remaining data, + // there is no need to do a GHASH reduction after each iteration. + // Instead, multiply each remaining block by its own key power, and only + // do a GHASH reduction at the very end. + + // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N + // is the number of blocks that remain. + .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. 
+ .set POWERS_PTR32, LE_CTR_PTR32 + mov DATALEN, %eax + neg %rax + and $~15, %rax // -round_up(DATALEN, 16) + lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR + + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. + .set HI, H_POW2_XORED // H_POW2_XORED is free to be reused. + .set HI_XMM, H_POW2_XORED_XMM + vpxor LO_XMM, LO_XMM, LO_XMM + vpxor MI_XMM, MI_XMM, MI_XMM + vpxor HI_XMM, HI_XMM, HI_XMM + + // 1 <= DATALEN < 128. Generate 2 or 4 more vectors of keystream blocks + // excluding the last AES round, depending on the remaining DATALEN. + cmp $64, DATALEN + jg .Ltail_gen_4_keystream_vecs\@ + _aesenc_loop 0,1 + cmp $32, DATALEN + jge .Ltail_xor_and_ghash_full_vec_loop\@ + jmp .Ltail_xor_and_ghash_partial_vec\@ +.Ltail_gen_4_keystream_vecs\@: + _aesenc_loop 0,1,2,3 + + // XOR the remaining data and accumulate the unreduced GHASH products + // for DATALEN >= 32, starting with one full 32-byte vector at a time. +.Ltail_xor_and_ghash_full_vec_loop\@: +.if \enc + _aesenclast_and_xor 0 + vpshufb BSWAP_MASK, AESDATA0, AESDATA0 +.else + vmovdqu (SRC), TMP1 + vpxor TMP1, RNDKEYLAST, TMP0 + vaesenclast TMP0, AESDATA0, AESDATA0 + vmovdqu AESDATA0, (DST) + vpshufb BSWAP_MASK, TMP1, AESDATA0 +.endif + // The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0. + vpxor GHASH_ACC, AESDATA0, AESDATA0 + vmovdqu (POWERS_PTR), TMP2 + _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 + vmovdqa AESDATA1, AESDATA0 + vmovdqa AESDATA2, AESDATA1 + vmovdqa AESDATA3, AESDATA2 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + add $32, SRC + add $32, DST + add $32, POWERS_PTR + sub $32, DATALEN + cmp $32, DATALEN + jge .Ltail_xor_and_ghash_full_vec_loop\@ + test DATALEN, DATALEN + jz .Ltail_ghash_reduce\@ + +.Ltail_xor_and_ghash_partial_vec\@: + // XOR the remaining data and accumulate the unreduced GHASH products, + // for 1 <= DATALEN < 32. + vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 + cmp $16, DATALEN + jle .Ltail_xor_and_ghash_1to16bytes\@ + + // Handle 17 <= DATALEN < 32. + + // Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes + // (shifting in zeroes), then reflect all 16 bytes. + lea .Lrshift_and_bswap_table(%rip), %rax + vmovdqu -16(%rax, DATALEN64), TMP2_XMM + + // Move the second keystream block to its own register and left-align it + vextracti128 $1, AESDATA0, AESDATA1_XMM + vpxor .Lfifteens(%rip), TMP2_XMM, TMP0_XMM + vpshufb TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM + + // Using overlapping loads and stores, XOR the source data with the + // keystream and write the destination data. Then prepare the GHASH + // input data: the full ciphertext block and the zero-padded partial + // ciphertext block, both byte-reflected, in AESDATA0. 
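
The overlapping-access pattern described above is easier to see in scalar form: one 16-byte access at offset 0 plus one at offset DATALEN - 16 covers any 17- to 31-byte buffer, and the doubly-covered middle bytes are written twice with identical values, so the result is correct even in place. A rough C equivalent of just that load/XOR/store shape (a sketch under those assumptions, not the patch's code; the real code additionally byte-reflects the blocks for GHASH):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* XOR an n-byte buffer (17 <= n <= 31) with keystream using only two
     * 16-byte chunks: one at offset 0 and one at offset n - 16. */
    static void xor_tail_17_to_31(uint8_t *dst, const uint8_t *src,
                                  const uint8_t *keystream, size_t n)
    {
        uint8_t lo[16], hi[16];

        /* Load both (possibly overlapping) chunks before storing... */
        memcpy(lo, src, 16);
        memcpy(hi, src + n - 16, 16);
        for (int i = 0; i < 16; i++) {
            lo[i] ^= keystream[i];
            hi[i] ^= keystream[n - 16 + i];
        }
        /* ...then store both; the overlap receives identical bytes. */
        memcpy(dst + n - 16, hi, 16);
        memcpy(dst, lo, 16);
    }

    int main(void)
    {
        uint8_t buf[31], ks[31], ref[31];

        for (int i = 0; i < 31; i++) {
            buf[i] = (uint8_t)(3 * i + 1);
            ks[i] = (uint8_t)(17 * i + 5);
            ref[i] = buf[i] ^ ks[i];
        }
        xor_tail_17_to_31(buf, buf, ks, 31); /* in place */
        printf("%s\n", memcmp(buf, ref, 31) ? "MISMATCH" : "ok");
        return 0;
    }
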
+.if \enc + vpxor -16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM + vpxor (SRC), AESDATA0_XMM, AESDATA0_XMM + vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) + vmovdqu AESDATA0_XMM, (DST) + vpshufb TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM + vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM +.else + vmovdqu -16(SRC, DATALEN64), TMP1_XMM + vmovdqu (SRC), TMP0_XMM + vpxor TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM + vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM + vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) + vmovdqu AESDATA0_XMM, (DST) + vpshufb TMP2_XMM, TMP1_XMM, AESDATA1_XMM + vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM +.endif + vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM + vinserti128 $1, AESDATA1_XMM, AESDATA0, AESDATA0 + vmovdqu (POWERS_PTR), TMP2 + jmp .Ltail_ghash_last_vec\@ + +.Ltail_xor_and_ghash_1to16bytes\@: + // Handle 1 <= DATALEN <= 16. Carefully load and store the + // possibly-partial block, which we mustn't access out of bounds. + vmovdqu (POWERS_PTR), TMP2_XMM + mov SRC, KEY // Free up %rcx, assuming SRC == %rcx + mov DATALEN, %ecx + _load_partial_block KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32 + vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM + mov DATALEN, %ecx + _store_partial_block AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32 +.if \enc + lea .Lselect_high_bytes_table(%rip), %rax + vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM + vpand (%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM +.else + vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM +.endif + vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM + +.Ltail_ghash_last_vec\@: + // Accumulate the unreduced GHASH products for the last 1-2 blocks. The + // GHASH input data is in AESDATA0. If only one block remains, then the + // second block in AESDATA0 is zero and does not affect the result. + _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 + +.Ltail_ghash_reduce\@: + // Finally, do the GHASH reduction. + vbroadcasti128 .Lgfpoly(%rip), TMP0 + _ghash_reduce LO, MI, HI, TMP0, TMP1 + vextracti128 $1, HI, GHASH_ACC_XMM + vpxor HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper + RET +.endm + +// void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, +// const u32 le_ctr[4], const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. 
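
The constant-time comparison that the decryption path implements with a byte-select mask and vptest can be sketched in C as an accumulate-then-test loop; this is only an illustration of the idea, not the kernel's actual helper:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Compare the first taglen bytes of the computed and received tags
     * without data-dependent branches: OR together the XOR of every
     * compared byte pair and test the accumulator once at the end. */
    static bool tags_equal(const uint8_t computed[16],
                           const uint8_t received[16], size_t taglen)
    {
        uint8_t acc = 0;

        for (size_t i = 0; i < taglen; i++)
            acc |= computed[i] ^ received[i];
        return acc == 0;
    }

    int main(void)
    {
        uint8_t a[16] = { 1, 2, 3, 4, 5 }, b[16] = { 1, 2, 3, 9 };

        printf("%d %d\n", tags_equal(a, b, 3), tags_equal(a, b, 4));
        return 0;
    }
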
+.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + .set TAGLEN64, %r10 + + // Additional local variables. + // %rax and %xmm0-%xmm3 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set GFPOLY, %xmm4 + .set BSWAP_MASK, %xmm5 + .set LE_CTR, %xmm6 + .set GHASH_ACC, %xmm7 + .set H_POW1, %xmm8 + + // Load some constants. + vmovdqa .Lgfpoly(%rip), GFPOLY + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. + // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. + vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR + + // Build the lengths block and XOR it with the GHASH accumulator. + // Although the lengths block is defined as the AAD length followed by + // the en/decrypted data length, both in big-endian byte order, a byte + // reflection of the full block is needed because of the way we compute + // GHASH (see _ghash_mul_step). By using little-endian values in the + // opposite order, we avoid having to reflect any bytes here. + vmovq TOTAL_DATALEN, %xmm0 + vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 + vpsllq $3, %xmm0, %xmm0 // Bytes to bits + vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC + + // Load the first hash key power (H^1), which is stored last. + vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1 + + // Load TAGLEN if decrypting. +.if !\enc + movl 8(%rsp), TAGLEN +.endif + + // Make %rax point to the last AES round key for the chosen AES variant. + lea 6*16(KEY,AESKEYLEN64,4), %rax + + // Start the AES encryption of the counter block by swapping the counter + // block to big-endian and XOR-ing it with the zero-th AES round key. + vpshufb BSWAP_MASK, LE_CTR, %xmm0 + vpxor (KEY), %xmm0, %xmm0 + + // Complete the AES encryption and multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vaesenc -13*16(%rax), %xmm0, %xmm0 + vaesenc -12*16(%rax), %xmm0, %xmm0 +192: + vaesenc -11*16(%rax), %xmm0, %xmm0 + vaesenc -10*16(%rax), %xmm0, %xmm0 +128: +.irp i, 0,1,2,3,4,5,6,7,8 + _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %xmm1, %xmm2, %xmm3 + vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 +.endr + _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %xmm1, %xmm2, %xmm3 + + // Undo the byte reflection of the GHASH accumulator. + vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC + + // Do the last AES round and XOR the resulting keystream block with the + // GHASH accumulator to produce the full computed authentication tag. + // + // Reduce latency by taking advantage of the property vaesenclast(key, + // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last + // round key, instead of XOR'ing the final AES output with GHASH_ACC. + // + // enc_final then returns the computed auth tag, while dec_final + // compares it with the transmitted one and returns a bool. To compare + // the tags, dec_final XORs them together and uses vptest to check + // whether the result is all-zeroes. This should be constant-time. + // dec_final applies the vaesenclast optimization to this additional + // value XOR'd too. 
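
For reference, the lengths block that the vmovq/vpinsrq/vpsllq sequence assembles is defined by GCM as the 64-bit big-endian AAD bit count followed by the 64-bit big-endian data bit count. A plain C construction in spec byte order (illustrative sketch; as the comment above explains, the assembly instead keeps the block byte-reflected so no reflection is needed at this point):

    #include <stdint.h>
    #include <stdio.h>

    /* Lengths block per the GCM spec: big-endian AAD bits, then
     * big-endian data bits. */
    static void gcm_lengths_block(uint8_t out[16], uint64_t aad_bytes,
                                  uint64_t data_bytes)
    {
        uint64_t aad_bits = aad_bytes * 8, data_bits = data_bytes * 8;

        for (int i = 0; i < 8; i++) {
            out[i] = (uint8_t)(aad_bits >> (56 - 8 * i));
            out[8 + i] = (uint8_t)(data_bits >> (56 - 8 * i));
        }
    }

    int main(void)
    {
        uint8_t lens[16];

        gcm_lengths_block(lens, 20, 300); /* 160 and 2400 bits */
        for (int i = 0; i < 16; i++)
            printf("%02x", lens[i]);
        printf("\n");
        return 0;
    }
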
+.if \enc
+ vpxor (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, GHASH_ACC
+ vmovdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ vpxor (TAG), GHASH_ACC, GHASH_ACC
+ vpxor (%rax), GHASH_ACC, GHASH_ACC
+ vaesenclast GHASH_ACC, %xmm0, %xmm0
+ lea .Lselect_high_bytes_table(%rip), %rax
+ vmovdqu (%rax, TAGLEN64), %xmm1
+ vpshufb BSWAP_MASK, %xmm1, %xmm1 // select low bytes, not high
+ xor %eax, %eax
+ vptest %xmm1, %xmm0
+ sete %al
+.endif
+ // No need for vzeroupper here, since only xmm registers were used.
+ RET
+.endm
+
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2)
+
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2)
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S
index 02ee11083d4f..06b71314d65c 100644
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
-// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+// AES-GCM implementation for x86_64 CPUs that support the following CPU
+// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
//
// Copyright 2024 Google LLC
//
@@ -45,41 +46,6 @@
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-//
-// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
-// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
-// either AVX512 or AVX10. Some of the functions, notably the encryption and
-// decryption update functions which are the most performance-critical, are
-// provided in two variants generated from a macro: one using 256-bit vectors
-// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
-// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
-//
-// The functions that use 512-bit vectors are intended for CPUs that support
-// 512-bit vectors *and* where using them doesn't cause significant
-// downclocking. They require the following CPU features:
-//
-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
-//
-// The other functions require the following CPU features:
-//
-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
-//
-// All functions use the "System V" ABI. The Windows ABI is not supported.
-//
-// Note that we use "avx10" in the names of the functions as a shorthand to
-// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's
-// introduction of AVX512 and then its replacement by AVX10, there doesn't seem
-// to be a simple way to name things that makes sense on all CPUs.
-//
-// Note that the macros that support both 256-bit and 512-bit vectors could
-// fairly easily be changed to support 128-bit too.
However, this would *not* -// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, -// because the code heavily uses several features of these extensions other than -// the vector length: the increase in the number of SIMD registers from 16 to -// 32, masking support, and new instructions such as vpternlogd (which can do a -// three-argument XOR). These features are very useful for AES-GCM. #include <linux/linkage.h> @@ -104,16 +70,14 @@ .Lgfpoly_and_internal_carrybit: .octa 0xc2000000000000010000000000000001 - // The below constants are used for incrementing the counter blocks. - // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. - // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and - // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. + // Values needed to prepare the initial vector of counter blocks. .Lctr_pattern: .octa 0 .octa 1 -.Linc_2blocks: .octa 2 .octa 3 + + // The number of AES blocks per vector, as a 128-bit value. .Linc_4blocks: .octa 4 @@ -130,29 +94,13 @@ // Offset to end of hash key powers array in the key struct. // // This is immediately followed by three zeroized padding blocks, which are -// included so that partial vectors can be handled more easily. E.g. if VL=64 -// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most -// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. +// included so that partial vectors can be handled more easily. E.g. if two +// blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most padding +// blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. #define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) .text -// Set the vector length in bytes. This sets the VL variable and defines -// register aliases V0-V31 that map to the ymm or zmm registers. -.macro _set_veclen vl - .set VL, \vl -.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 -.if VL == 32 - .set V\i, %ymm\i -.elseif VL == 64 - .set V\i, %zmm\i -.else - .error "Unsupported vector length" -.endif -.endr -.endm - // The _ghash_mul_step macro does one step of GHASH multiplication of the // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the // reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the @@ -312,39 +260,44 @@ vpternlogd $0x96, \t0, \mi, \hi .endm -// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); -// -// Given the expanded AES key |key->aes_key|, this function derives the GHASH -// subkey and initializes |key->ghash_key_powers| with powers of it. -// -// The number of key powers initialized is NUM_H_POWERS, and they are stored in -// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key -// powers themselves are also initialized. +// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it +// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. 
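
The vanishing middle term is just characteristic-2 arithmetic: carryless multiplication is commutative, so the two cross products of the schoolbook expansion are equal, and equal terms cancel under XOR. A toy demonstration with a 32-bit software carryless multiply (sketch only; clmul32 is a stand-in helper written out below, not an intrinsic):

    #include <stdint.h>
    #include <stdio.h>

    /* Software carryless multiply of two 32-bit values (64-bit result). */
    static uint64_t clmul32(uint32_t a, uint32_t b)
    {
        uint64_t r = 0;

        for (int i = 0; i < 32; i++)
            if ((b >> i) & 1)
                r ^= (uint64_t)a << i;
        return r;
    }

    int main(void)
    {
        uint32_t a_lo = 0x12345678, a_hi = 0x9abcdef0;

        /* The two cross products of a Karatsuba-style squaring... */
        uint64_t mi = clmul32(a_lo, a_hi) ^ clmul32(a_hi, a_lo);

        /* ...XOR to zero, so _ghash_square never computes them. */
        printf("mi = %#llx\n", (unsigned long long)mi);
        return 0;
    }
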
+.macro _ghash_square a, dst, gfpoly, t0, t1 + vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L + vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H + vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) + vpshufd $0x4e, \t0, \t0 // Swap halves of LO + vpxord \t0, \t1, \t1 // Fold LO into MI + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) + vpshufd $0x4e, \t1, \t1 // Swap halves of MI + vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI +.endm + +// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); // -// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked -// with the desired length. In the VL=32 case, the function computes twice as -// many key powers than are actually used by the VL=32 GCM update functions. -// This is done to keep the key format the same regardless of vector length. -.macro _aes_gcm_precompute +// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and +// initialize |key->h_powers| and |key->padding|. +SYM_FUNC_START(aes_gcm_precompute_vaes_avx512) // Function arguments .set KEY, %rdi - // Additional local variables. V0-V2 and %rax are used as temporaries. + // Additional local variables. + // %zmm[0-2] and %rax are used as temporaries. .set POWERS_PTR, %rsi .set RNDKEYLAST_PTR, %rdx - .set H_CUR, V3 + .set H_CUR, %zmm3 .set H_CUR_YMM, %ymm3 .set H_CUR_XMM, %xmm3 - .set H_INC, V4 + .set H_INC, %zmm4 .set H_INC_YMM, %ymm4 .set H_INC_XMM, %xmm4 - .set GFPOLY, V5 + .set GFPOLY, %zmm5 .set GFPOLY_YMM, %ymm5 .set GFPOLY_XMM, %xmm5 // Get pointer to lowest set of key powers (located at end of array). - lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR + lea OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR // Encrypt an all-zeroes block to get the raw hash subkey. movl OFFSETOF_AESKEYLEN(KEY), %eax @@ -363,8 +316,8 @@ // Zeroize the padding blocks. vpxor %xmm0, %xmm0, %xmm0 - vmovdqu %ymm0, VL(POWERS_PTR) - vmovdqu %xmm0, VL+2*16(POWERS_PTR) + vmovdqu %ymm0, 64(POWERS_PTR) + vmovdqu %xmm0, 64+2*16(POWERS_PTR) // Finish preprocessing the first key power, H^1. Since this GHASH // implementation operates directly on values with the backwards bit @@ -397,54 +350,44 @@ // special needs to be done to make this happen, though: H^1 * H^1 would // end up with two factors of x^-1, but the multiplication consumes one. // So the product H^2 ends up with the desired one factor of x^-1. - _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ - %xmm0, %xmm1, %xmm2 + _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1 // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM -.if VL == 64 // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ %ymm0, %ymm1, %ymm2 vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR vshufi64x2 $0, H_INC, H_INC, H_INC -.endif // Store the lowest set of key powers. vmovdqu8 H_CUR, (POWERS_PTR) - // Compute and store the remaining key powers. With VL=32, repeatedly - // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. - // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by + // Compute and store the remaining key powers. + // Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. 
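
The loop below implements the following scalar recurrence, vectorized four powers at a time: seed the last four entries with H^4..H^1, then produce each earlier group of four by multiplying the previous group by H^4. This sketch substitutes GF(2^8) for the bit-reflected GF(2^128) GHASH field purely to stay self-contained; the shape of the computation, including the three iterations matching 'mov $3, %eax', is the point, not the field:

    #include <stdint.h>
    #include <stdio.h>

    /* Multiplication in GF(2^8) mod x^8+x^4+x^3+x+1, standing in for
     * the 128-bit GHASH field to keep the example self-contained. */
    static uint8_t gf_mul(uint8_t a, uint8_t b)
    {
        uint8_t r = 0;

        while (b) {
            if (b & 1)
                r ^= a;
            a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
            b >>= 1;
        }
        return r;
    }

    int main(void)
    {
        uint8_t h = 0x53, powers[16], h4;

        /* Seed the "lowest vector": [H^4, H^3, H^2, H^1] at the end. */
        powers[15] = h;
        powers[14] = gf_mul(h, h);
        powers[13] = gf_mul(powers[14], h);
        powers[12] = gf_mul(powers[13], h);
        h4 = powers[12];

        /* Each earlier group of 4 is the previous group times H^4,
         * matching the loop's whole-vector multiply by [H^4 x 4]. */
        for (int grp = 2; grp >= 0; grp--)
            for (int i = 0; i < 4; i++)
                powers[grp * 4 + i] =
                    gf_mul(powers[(grp + 1) * 4 + i], h4);

        /* powers[] now holds H^16 ... H^1 in descending order. */
        for (int i = 0; i < 16; i++)
            printf("H^%-2d = 0x%02x\n", 16 - i, powers[i]);
        return 0;
    }
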
- mov $(NUM_H_POWERS*16/VL) - 1, %eax -.Lprecompute_next\@: - sub $VL, POWERS_PTR - _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 + mov $3, %eax +.Lprecompute_next: + sub $64, POWERS_PTR + _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2 vmovdqu8 H_CUR, (POWERS_PTR) dec %eax - jnz .Lprecompute_next\@ + jnz .Lprecompute_next vzeroupper // This is needed after using ymm or zmm registers. RET -.endm +SYM_FUNC_END(aes_gcm_precompute_vaes_avx512) // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store // the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. .macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm vextracti32x4 $1, \src, \t0_xmm -.if VL == 32 - vpxord \t0_xmm, \src_xmm, \dst_xmm -.elseif VL == 64 vextracti32x4 $2, \src, \t1_xmm vextracti32x4 $3, \src, \t2_xmm vpxord \t0_xmm, \src_xmm, \dst_xmm vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm -.else - .error "Unsupported vector length" -.endif .endm // Do one step of the GHASH update of the data blocks given in the vector @@ -458,25 +401,21 @@ // // The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + // H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the -// operations are vectorized operations on vectors of 16-byte blocks. E.g., -// with VL=32 there are 2 blocks per vector and the vectorized terms correspond -// to the following non-vectorized terms: -// -// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) -// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 -// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 -// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 +// operations are vectorized operations on 512-bit vectors of 128-bit blocks. +// The vectorized terms correspond to the following non-vectorized terms: // -// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. +// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM), +// H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0) +// H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7 +// H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11 +// H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15 // // More concretely, this code does: // - Do vectorized "schoolbook" multiplications to compute the intermediate // 256-bit product of each block and its corresponding hash key power. -// There are 4*VL/16 of these intermediate products. -// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves -// VL/16 256-bit intermediate values. +// - Sum (XOR) the intermediate 256-bit products across vectors. // - Do a vectorized reduction of these 256-bit intermediate values to -// 128-bits each. This leaves VL/16 128-bit intermediate values. +// 128-bits each. // - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. // // See _ghash_mul_step for the full explanation of the operations performed for @@ -532,85 +471,224 @@ .endif .endm -// Do one non-last round of AES encryption on the counter blocks in V0-V3 using -// the round key that has been broadcast to all 128-bit lanes of \round_key. +// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full +// explanation. 
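
This four-vector formulation is valid because GHASH's serial Horner recurrence acc = (acc ^ blk) * H unrolls, over 16 blocks, into H^16*(blk0 ^ acc) ^ H^15*blk1 ^ ... ^ H^1*blk15, which makes all 16 multiplications independent. A quick equivalence check in the same GF(2^8) stand-in field (sketch; gf_mul and gf_pow are toy helpers, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* GF(2^8) multiply (AES polynomial), a stand-in for the GHASH field. */
    static uint8_t gf_mul(uint8_t a, uint8_t b)
    {
        uint8_t r = 0;

        while (b) {
            if (b & 1)
                r ^= a;
            a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
            b >>= 1;
        }
        return r;
    }

    static uint8_t gf_pow(uint8_t h, int n)
    {
        uint8_t r = 1;

        while (n--)
            r = gf_mul(r, h);
        return r;
    }

    int main(void)
    {
        uint8_t h = 0x53, acc = 0x7e, blk[16], horner, parallel;

        for (int i = 0; i < 16; i++)
            blk[i] = (uint8_t)(31 * i + 7);

        /* Horner form: acc = (acc ^ blk_i) * H, one block at a time. */
        horner = acc;
        for (int i = 0; i < 16; i++)
            horner = gf_mul(horner ^ blk[i], h);

        /* Unrolled form used by _ghash_4x: the accumulator folds into
         * the first block only; block i gets weight H^(16-i). */
        parallel = gf_mul(acc ^ blk[0], gf_pow(h, 16));
        for (int i = 1; i < 16; i++)
            parallel ^= gf_mul(blk[i], gf_pow(h, 16 - i));

        printf("%s\n", horner == parallel ? "match" : "MISMATCH");
        return 0;
    }
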
+.macro _ghash_4x
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_step_4x \i
+.endr
+.endm
+
+// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+//
+// This handles large amounts of AAD efficiently, while also keeping overhead
+// low for small amounts, which is the common case. TLS and IPsec use less than
+// one block of AAD, but (uncommonly) other use cases may use much more.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512)
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ .set AADLEN64, %rcx // Zero-extend AADLEN before using!
+
+ // Additional local variables.
+ // %rax and %k1 are used as temporary registers.
+ .set GHASHDATA0, %zmm0
+ .set GHASHDATA0_XMM, %xmm0
+ .set GHASHDATA1, %zmm1
+ .set GHASHDATA1_XMM, %xmm1
+ .set GHASHDATA2, %zmm2
+ .set GHASHDATA2_XMM, %xmm2
+ .set GHASHDATA3, %zmm3
+ .set BSWAP_MASK, %zmm4
+ .set BSWAP_MASK_XMM, %xmm4
+ .set GHASH_ACC, %zmm5
+ .set GHASH_ACC_XMM, %xmm5
+ .set H_POW4, %zmm6
+ .set H_POW3, %zmm7
+ .set H_POW2, %zmm8
+ .set H_POW1, %zmm9
+ .set H_POW1_XMM, %xmm9
+ .set GFPOLY, %zmm10
+ .set GFPOLY_XMM, %xmm10
+ .set GHASHTMP0, %zmm11
+ .set GHASHTMP1, %zmm12
+ .set GHASHTMP2, %zmm13
+
+ // Load the GHASH accumulator.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+
+ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
+ cmp $16, AADLEN
+ jg .Laad_more_than_16bytes
+ test AADLEN, AADLEN
+ jz .Laad_done
+
+ // Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD.
+ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM
+ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM
+ mov $-1, %eax
+ bzhi AADLEN, %eax, %eax
+ kmovd %eax, %k1
+ vmovdqu8 (AAD), GHASHDATA0_XMM{%k1}{z}
+ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM
+ vpshufb BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM
+ vpxor GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ _ghash_mul H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+ jmp .Laad_done
+
+.Laad_more_than_16bytes:
+ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time.
+ sub $256, AADLEN
+ jl .Laad_loop_4x_done
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
+ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
+ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
+.Laad_loop_4x:
+ vmovdqu8 0*64(AAD), GHASHDATA0
+ vmovdqu8 1*64(AAD), GHASHDATA1
+ vmovdqu8 2*64(AAD), GHASHDATA2
+ vmovdqu8 3*64(AAD), GHASHDATA3
+ _ghash_4x
+ add $256, AAD
+ sub $256, AADLEN
+ jge .Laad_loop_4x
+.Laad_loop_4x_done:
+
+ // If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time.
+ add $192, AADLEN + jl .Laad_loop_1x_done + vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 +.Laad_loop_1x: + vmovdqu8 (AAD), GHASHDATA0 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 + vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + GHASHDATA0, GHASHDATA1, GHASHDATA2 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM + add $64, AAD + sub $64, AADLEN + jge .Laad_loop_1x +.Laad_loop_1x_done: + + // Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD. + add $64, AADLEN + jz .Laad_done + mov $-1, %rax + bzhi AADLEN64, %rax, %rax + kmovq %rax, %k1 + vmovdqu8 (AAD), GHASHDATA0{%k1}{z} + neg AADLEN64 + and $~15, AADLEN64 // -round_up(AADLEN, 16) + vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 + vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + GHASHDATA0, GHASHDATA1, GHASHDATA2 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM + +.Laad_done: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512) + +// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the +// round key that has been broadcast to all 128-bit lanes of \round_key. .macro _vaesenc_4x round_key - vaesenc \round_key, V0, V0 - vaesenc \round_key, V1, V1 - vaesenc \round_key, V2, V2 - vaesenc \round_key, V3, V3 + vaesenc \round_key, %zmm0, %zmm0 + vaesenc \round_key, %zmm1, %zmm1 + vaesenc \round_key, %zmm2, %zmm2 + vaesenc \round_key, %zmm3, %zmm3 .endm // Start the AES encryption of four vectors of counter blocks. .macro _ctr_begin_4x // Increment LE_CTR four times to generate four vectors of little-endian - // counter blocks, swap each to big-endian, and store them in V0-V3. - vpshufb BSWAP_MASK, LE_CTR, V0 + // counter blocks, swap each to big-endian, and store them in %zmm[0-3]. + vpshufb BSWAP_MASK, LE_CTR, %zmm0 vpaddd LE_CTR_INC, LE_CTR, LE_CTR - vpshufb BSWAP_MASK, LE_CTR, V1 + vpshufb BSWAP_MASK, LE_CTR, %zmm1 vpaddd LE_CTR_INC, LE_CTR, LE_CTR - vpshufb BSWAP_MASK, LE_CTR, V2 + vpshufb BSWAP_MASK, LE_CTR, %zmm2 vpaddd LE_CTR_INC, LE_CTR, LE_CTR - vpshufb BSWAP_MASK, LE_CTR, V3 + vpshufb BSWAP_MASK, LE_CTR, %zmm3 vpaddd LE_CTR_INC, LE_CTR, LE_CTR // AES "round zero": XOR in the zero-th round key. - vpxord RNDKEY0, V0, V0 - vpxord RNDKEY0, V1, V1 - vpxord RNDKEY0, V2, V2 - vpxord RNDKEY0, V3, V3 + vpxord RNDKEY0, %zmm0, %zmm0 + vpxord RNDKEY0, %zmm1, %zmm1 + vpxord RNDKEY0, %zmm2, %zmm2 + vpxord RNDKEY0, %zmm3, %zmm3 .endm -// Do the last AES round for four vectors of counter blocks V0-V3, XOR source -// data with the resulting keystream, and write the result to DST and +// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR +// source data with the resulting keystream, and write the result to DST and // GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) .macro _aesenclast_and_xor_4x // XOR the source data with the last round key, saving the result in // GHASHDATA[0-3]. This reduces latency by taking advantage of the // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). 
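
The byte masks that the AAD paths above build with bzhi map directly onto the BMI2 intrinsic _bzhi_u64: an index below 64 yields that many low one-bits, while an index of 64 or more leaves the all-ones source unchanged, which is exactly the "all ones if a full vector remains" behavior the masked loads rely on (a later comment also notes the length must fit in 8 bits, since bzhi only examines the low byte of its index). A small sketch assuming a BMI2-capable host (build with gcc -mbmi2):

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Per-byte load/store masks as built before kmovq into %k1. */
        for (unsigned int len = 60; len <= 68; len += 4) {
            uint64_t mask = _bzhi_u64(~0ULL, len);

            /* len < 64: len one-bits; len >= 64: all 64 bits set. */
            printf("len=%2u mask=%016llx\n", len,
                   (unsigned long long)mask);
        }
        return 0;
    }
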
- vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 - vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 - vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 - vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 + vpxord 0*64(SRC), RNDKEYLAST, GHASHDATA0 + vpxord 1*64(SRC), RNDKEYLAST, GHASHDATA1 + vpxord 2*64(SRC), RNDKEYLAST, GHASHDATA2 + vpxord 3*64(SRC), RNDKEYLAST, GHASHDATA3 // Do the last AES round. This handles the XOR with the source data // too, as per the optimization described above. - vaesenclast GHASHDATA0, V0, GHASHDATA0 - vaesenclast GHASHDATA1, V1, GHASHDATA1 - vaesenclast GHASHDATA2, V2, GHASHDATA2 - vaesenclast GHASHDATA3, V3, GHASHDATA3 + vaesenclast GHASHDATA0, %zmm0, GHASHDATA0 + vaesenclast GHASHDATA1, %zmm1, GHASHDATA1 + vaesenclast GHASHDATA2, %zmm2, GHASHDATA2 + vaesenclast GHASHDATA3, %zmm3, GHASHDATA3 // Store the en/decrypted data to DST. - vmovdqu8 GHASHDATA0, 0*VL(DST) - vmovdqu8 GHASHDATA1, 1*VL(DST) - vmovdqu8 GHASHDATA2, 2*VL(DST) - vmovdqu8 GHASHDATA3, 3*VL(DST) + vmovdqu8 GHASHDATA0, 0*64(DST) + vmovdqu8 GHASHDATA1, 1*64(DST) + vmovdqu8 GHASHDATA2, 2*64(DST) + vmovdqu8 GHASHDATA3, 3*64(DST) .endm -// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, -// const u32 le_ctr[4], u8 ghash_acc[16], -// const u8 *src, u8 *dst, int datalen); +// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); // // This macro generates a GCM encryption or decryption update function with the -// above prototype (with \enc selecting which one). This macro supports both -// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. -// -// This function computes the next portion of the CTR keystream, XOR's it with -// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted -// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the -// next |datalen| ciphertext bytes. +// above prototype (with \enc selecting which one). The function computes the +// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, +// and writes the resulting encrypted or decrypted data to |dst|. It also +// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext +// bytes. // // |datalen| must be a multiple of 16, except on the last call where it can be // any length. The caller must do any buffering needed to ensure this. Both // in-place and out-of-place en/decryption are supported. // -// |le_ctr| must give the current counter in little-endian format. For a new -// message, the low word of the counter must be 2. This function loads the -// counter from |le_ctr| and increments the loaded counter as needed, but it -// does *not* store the updated counter back to |le_ctr|. The caller must -// update |le_ctr| if any more data segments follow. Internally, only the low -// 32-bit word of the counter is incremented, following the GCM standard. +// |le_ctr| must give the current counter in little-endian format. This +// function loads the counter from |le_ctr| and increments the loaded counter as +// needed, but it does *not* store the updated counter back to |le_ctr|. The +// caller must update |le_ctr| if any more data segments follow. Internally, +// only the low 32-bit word of the counter is incremented, following the GCM +// standard. 
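
One way to picture the caller's side of this contract (a hypothetical sketch; the kernel's real glue lives in aesni-intel_glue.c and differs in detail): after each non-final segment, which must be a multiple of 16 bytes, the caller advances the saved counter by the number of blocks consumed, touching only the low word so the wrap-around stays within 32 bits as GCM requires:

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative caller-side helper (sketch only): advance the saved
     * little-endian counter past one non-final segment. Only le_ctr[0]
     * changes; uint32_t arithmetic gives the mod-2^32 wrap with no
     * carry into le_ctr[1..3]. */
    static void le_ctr_advance(uint32_t le_ctr[4], size_t seglen)
    {
        le_ctr[0] += (uint32_t)(seglen / 16);
    }
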
.macro _aes_gcm_update enc // Function arguments @@ -634,69 +712,69 @@ // Pointer to the last AES round key for the chosen AES variant .set RNDKEYLAST_PTR, %r11 - // In the main loop, V0-V3 are used as AES input and output. Elsewhere - // they are used as temporary registers. + // In the main loop, %zmm[0-3] are used as AES input and output. + // Elsewhere they are used as temporary registers. // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. - .set GHASHDATA0, V4 + .set GHASHDATA0, %zmm4 .set GHASHDATA0_XMM, %xmm4 - .set GHASHDATA1, V5 + .set GHASHDATA1, %zmm5 .set GHASHDATA1_XMM, %xmm5 - .set GHASHDATA2, V6 + .set GHASHDATA2, %zmm6 .set GHASHDATA2_XMM, %xmm6 - .set GHASHDATA3, V7 + .set GHASHDATA3, %zmm7 // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values // using vpshufb, copied to all 128-bit lanes. - .set BSWAP_MASK, V8 + .set BSWAP_MASK, %zmm8 // RNDKEY temporarily holds the next AES round key. - .set RNDKEY, V9 + .set RNDKEY, %zmm9 // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, // only the lowest 128-bit lane can be nonzero. When not fully reduced, // more than one lane may be used, and they need to be XOR'd together. - .set GHASH_ACC, V10 + .set GHASH_ACC, %zmm10 .set GHASH_ACC_XMM, %xmm10 // LE_CTR_INC is the vector of 32-bit words that need to be added to a // vector of little-endian counter blocks to advance it forwards. - .set LE_CTR_INC, V11 + .set LE_CTR_INC, %zmm11 // LE_CTR contains the next set of little-endian counter blocks. - .set LE_CTR, V12 + .set LE_CTR, %zmm12 // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. - .set RNDKEY0, V13 - .set RNDKEYLAST, V14 - .set RNDKEY_M9, V15 - .set RNDKEY_M8, V16 - .set RNDKEY_M7, V17 - .set RNDKEY_M6, V18 - .set RNDKEY_M5, V19 - .set RNDKEY_M4, V20 - .set RNDKEY_M3, V21 - .set RNDKEY_M2, V22 - .set RNDKEY_M1, V23 + .set RNDKEY0, %zmm13 + .set RNDKEYLAST, %zmm14 + .set RNDKEY_M9, %zmm15 + .set RNDKEY_M8, %zmm16 + .set RNDKEY_M7, %zmm17 + .set RNDKEY_M6, %zmm18 + .set RNDKEY_M5, %zmm19 + .set RNDKEY_M4, %zmm20 + .set RNDKEY_M3, %zmm21 + .set RNDKEY_M2, %zmm22 + .set RNDKEY_M1, %zmm23 // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These // cannot coincide with anything used for AES encryption, since for // performance reasons GHASH and AES encryption are interleaved. - .set GHASHTMP0, V24 - .set GHASHTMP1, V25 - .set GHASHTMP2, V26 + .set GHASHTMP0, %zmm24 + .set GHASHTMP1, %zmm25 + .set GHASHTMP2, %zmm26 - // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The + // H_POW[4-1] contain the powers of the hash key H^16...H^1. The // descending numbering reflects the order of the key powers. - .set H_POW4, V27 - .set H_POW3, V28 - .set H_POW2, V29 - .set H_POW1, V30 + .set H_POW4, %zmm27 + .set H_POW3, %zmm28 + .set H_POW2, %zmm29 + .set H_POW1, %zmm30 // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. - .set GFPOLY, V31 + .set GFPOLY, %zmm31 // Load some constants. vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK @@ -719,29 +797,23 @@ // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR - // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. -.if VL == 32 - vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC -.elseif VL == 64 + // Load 4 into all 128-bit lanes of LE_CTR_INC. 
vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC -.else - .error "Unsupported vector length" -.endif - // If there are at least 4*VL bytes of data, then continue into the loop - // that processes 4*VL bytes of data at a time. Otherwise skip it. + // If there are at least 256 bytes of data, then continue into the loop + // that processes 256 bytes of data at a time. Otherwise skip it. // - // Pre-subtracting 4*VL from DATALEN saves an instruction from the main + // Pre-subtracting 256 from DATALEN saves an instruction from the main // loop and also ensures that at least one write always occurs to // DATALEN, zero-extending it and allowing DATALEN64 to be used later. - add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 + sub $256, DATALEN jl .Lcrypt_loop_4x_done\@ // Load powers of the hash key. - vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 - vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 - vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 - vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 + vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 + vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 + vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 + vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 // Main loop: en/decrypt and hash 4 vectors at a time. // @@ -770,9 +842,9 @@ cmp %rax, RNDKEYLAST_PTR jne 1b _aesenclast_and_xor_4x - sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 - sub $-4*VL, DST - add $-4*VL, DATALEN + add $256, SRC + add $256, DST + sub $256, DATALEN jl .Lghash_last_ciphertext_4x\@ .endif @@ -786,10 +858,10 @@ // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. .if !\enc - vmovdqu8 0*VL(SRC), GHASHDATA0 - vmovdqu8 1*VL(SRC), GHASHDATA1 - vmovdqu8 2*VL(SRC), GHASHDATA2 - vmovdqu8 3*VL(SRC), GHASHDATA3 + vmovdqu8 0*64(SRC), GHASHDATA0 + vmovdqu8 1*64(SRC), GHASHDATA1 + vmovdqu8 2*64(SRC), GHASHDATA2 + vmovdqu8 3*64(SRC), GHASHDATA3 .endif // Start the AES encryption of the counter blocks. @@ -809,44 +881,44 @@ _vaesenc_4x RNDKEY 128: - // Finish the AES encryption of the counter blocks in V0-V3, interleaved - // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. + // Finish the AES encryption of the counter blocks in %zmm[0-3], + // interleaved with the GHASH update of the ciphertext blocks in + // GHASHDATA[0-3]. .irp i, 9,8,7,6,5,4,3,2,1 _ghash_step_4x (9 - \i) _vaesenc_4x RNDKEY_M\i .endr _ghash_step_4x 9 _aesenclast_and_xor_4x - sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 - sub $-4*VL, DST - add $-4*VL, DATALEN + add $256, SRC + add $256, DST + sub $256, DATALEN jge .Lcrypt_loop_4x\@ .if \enc .Lghash_last_ciphertext_4x\@: // Update GHASH with the last set of ciphertext blocks. -.irp i, 0,1,2,3,4,5,6,7,8,9 - _ghash_step_4x \i -.endr + _ghash_4x .endif .Lcrypt_loop_4x_done\@: - // Undo the extra subtraction by 4*VL and check whether data remains. - sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 + // Undo the extra subtraction by 256 and check whether data remains. + add $256, DATALEN jz .Ldone\@ - // The data length isn't a multiple of 4*VL. Process the remaining data - // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. - // Going one vector at a time may seem inefficient compared to having - // separate code paths for each possible number of vectors remaining. 
- // However, using a loop keeps the code size down, and it performs
- // surprisingly well; modern CPUs will start executing the next iteration
- // before the previous one finishes and also predict the number of loop
- // iterations. For a similar reason, we roll up the AES rounds.
+ // The data length isn't a multiple of 256 bytes. Process the remaining
+ // data of length 1 <= DATALEN < 256, up to one 64-byte vector at a
+ // time. Going one vector at a time may seem inefficient compared to
+ // having separate code paths for each possible number of vectors
+ // remaining. However, using a loop keeps the code size down, and it
+ // performs surprisingly well; modern CPUs will start executing the next
+ // iteration before the previous one finishes and also predict the
+ // number of loop iterations. For a similar reason, we roll up the AES
+ // rounds.
+ //
- // On the last iteration, the remaining length may be less than VL.
- // Handle this using masking.
+ // On the last iteration, the remaining length may be less than 64
+ // bytes. Handle this using masking.
//
// Since there are enough key powers available for all remaining data,
// there is no need to do a GHASH reduction after each iteration.
@@ -875,65 +947,60 @@
.Lcrypt_loop_1x\@:
// Select the appropriate mask for this iteration: all 1's if
- // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the
+ // DATALEN >= 64, otherwise DATALEN 1's. Do this branchlessly using the
// bzhi instruction from BMI2. (This relies on DATALEN <= 255.)
-.if VL < 64
- mov $-1, %eax
- bzhi DATALEN, %eax, %eax
- kmovd %eax, %k1
-.else
mov $-1, %rax
bzhi DATALEN64, %rax, %rax
kmovq %rax, %k1
-.endif
// Encrypt a vector of counter blocks. This does not need to be masked.
- vpshufb BSWAP_MASK, LE_CTR, V0
+ vpshufb BSWAP_MASK, LE_CTR, %zmm0
vpaddd LE_CTR_INC, LE_CTR, LE_CTR
- vpxord RNDKEY0, V0, V0
+ vpxord RNDKEY0, %zmm0, %zmm0
lea 16(KEY), %rax
1:
vbroadcasti32x4 (%rax), RNDKEY
- vaesenc RNDKEY, V0, V0
+ vaesenc RNDKEY, %zmm0, %zmm0
add $16, %rax
cmp %rax, RNDKEYLAST_PTR
jne 1b
- vaesenclast RNDKEYLAST, V0, V0
+ vaesenclast RNDKEYLAST, %zmm0, %zmm0
// XOR the data with the appropriate number of keystream bytes.
- vmovdqu8 (SRC), V1{%k1}{z}
- vpxord V1, V0, V0
- vmovdqu8 V0, (DST){%k1}
+ vmovdqu8 (SRC), %zmm1{%k1}{z}
+ vpxord %zmm1, %zmm0, %zmm0
+ vmovdqu8 %zmm0, (DST){%k1}
// Update GHASH with the ciphertext block(s), without reducing.
//
- // In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
- // (If decrypting, it's done by the above masked load. If encrypting,
- // it's done by the below masked register-to-register move.) Note that
- // if DATALEN <= VL - 16, there will be additional padding beyond the
- // padding of the last block specified by GHASH itself; i.e., there may
- // be whole block(s) that get processed by the GHASH multiplication and
- // reduction instructions but should not actually be included in the
+ // In the case of DATALEN < 64, the ciphertext is zero-padded to 64
+ // bytes. (If decrypting, it's done by the above masked load. If
+ // encrypting, it's done by the below masked register-to-register move.)
+ // Note that if DATALEN <= 48, there will be additional padding beyond
+ // the padding of the last block specified by GHASH itself; i.e., there
+ // may be whole block(s) that get processed by the GHASH multiplication
+ // and reduction instructions but should not actually be included in the
+ // GHASH.
However, any such blocks are all-zeroes, and the values that // they're multiplied with are also all-zeroes. Therefore they just add // 0 * 0 = 0 to the final GHASH result, which makes no difference. vmovdqu8 (POWERS_PTR), H_POW1 .if \enc - vmovdqu8 V0, V1{%k1}{z} + vmovdqu8 %zmm0, %zmm1{%k1}{z} .endif - vpshufb BSWAP_MASK, V1, V0 - vpxord GHASH_ACC, V0, V0 - _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 + vpshufb BSWAP_MASK, %zmm1, %zmm0 + vpxord GHASH_ACC, %zmm0, %zmm0 + _ghash_mul_noreduce H_POW1, %zmm0, LO, MI, HI, \ + GHASHDATA3, %zmm1, %zmm2, %zmm3 vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM - add $VL, POWERS_PTR - add $VL, SRC - add $VL, DST - sub $VL, DATALEN + add $64, POWERS_PTR + add $64, SRC + add $64, DST + sub $64, DATALEN jg .Lcrypt_loop_1x\@ // Finally, do the GHASH reduction. - _ghash_reduce LO, MI, HI, GFPOLY, V0 + _ghash_reduce LO, MI, HI, GFPOLY, %zmm0 _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 .Ldone\@: @@ -944,14 +1011,14 @@ RET .endm -// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, -// const u32 le_ctr[4], u8 ghash_acc[16], -// u64 total_aadlen, u64 total_datalen); -// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, -// const u32 le_ctr[4], -// const u8 ghash_acc[16], -// u64 total_aadlen, u64 total_datalen, -// const u8 tag[16], int taglen); +// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, +// const u32 le_ctr[4], +// const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); // // This macro generates one of the above two functions (with \enc selecting // which one). Both functions finish computing the GCM authentication tag by @@ -1081,119 +1148,16 @@ RET .endm -_set_veclen 32 -SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) - _aes_gcm_precompute -SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) -SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) - _aes_gcm_update 1 -SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) -SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) - _aes_gcm_update 0 -SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) - -_set_veclen 64 -SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) - _aes_gcm_precompute -SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) -SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) +SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512) _aes_gcm_update 1 -SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) -SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) +SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512) +SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512) _aes_gcm_update 0 -SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) - -// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, -// u8 ghash_acc[16], -// const u8 *aad, int aadlen); -// -// This function processes the AAD (Additional Authenticated Data) in GCM. -// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the -// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been -// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| -// must be a multiple of 16, except on the last call where it can be any length. -// The caller must do any buffering needed to ensure this. -// -// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. 
-// Therefore, for AAD processing we currently only provide this implementation -// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This -// keeps the code size down, and it enables some micro-optimizations, e.g. using -// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. -// To optimize for large amounts of AAD, we could implement a 4x-wide loop and -// provide a version using 512-bit vectors, but that doesn't seem to be useful. -SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) - - // Function arguments - .set KEY, %rdi - .set GHASH_ACC_PTR, %rsi - .set AAD, %rdx - .set AADLEN, %ecx - .set AADLEN64, %rcx // Zero-extend AADLEN before using! - - // Additional local variables. - // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. - .set BSWAP_MASK, %ymm4 - .set GFPOLY, %ymm5 - .set GHASH_ACC, %ymm6 - .set GHASH_ACC_XMM, %xmm6 - .set H_POW1, %ymm7 - - // Load some constants. - vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK - vbroadcasti128 .Lgfpoly(%rip), GFPOLY - - // Load the GHASH accumulator. - vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM - - // Update GHASH with 32 bytes of AAD at a time. - // - // Pre-subtracting 32 from AADLEN saves an instruction from the loop and - // also ensures that at least one write always occurs to AADLEN, - // zero-extending it and allowing AADLEN64 to be used later. - sub $32, AADLEN - jl .Laad_loop_1x_done - vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] -.Laad_loop_1x: - vmovdqu (AAD), %ymm0 - vpshufb BSWAP_MASK, %ymm0, %ymm0 - vpxor %ymm0, GHASH_ACC, GHASH_ACC - _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ - %ymm0, %ymm1, %ymm2 - vextracti128 $1, GHASH_ACC, %xmm0 - vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM - add $32, AAD - sub $32, AADLEN - jge .Laad_loop_1x -.Laad_loop_1x_done: - add $32, AADLEN - jz .Laad_done - - // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. - mov $-1, %eax - bzhi AADLEN, %eax, %eax - kmovd %eax, %k1 - vmovdqu8 (AAD), %ymm0{%k1}{z} - neg AADLEN64 - and $~15, AADLEN64 // -round_up(AADLEN, 16) - vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 - vpshufb BSWAP_MASK, %ymm0, %ymm0 - vpxor %ymm0, GHASH_ACC, GHASH_ACC - _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ - %ymm0, %ymm1, %ymm2 - vextracti128 $1, GHASH_ACC, %xmm0 - vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM - -.Laad_done: - // Store the updated GHASH accumulator back to memory. - vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) - - vzeroupper // This is needed after using ymm or zmm registers. 
- RET -SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) +SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512) -SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) +SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512) _aes_gcm_final 1 -SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) -SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) +SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512) +SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512) _aes_gcm_final 0 -SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) +SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index d953ac470aae..bb6e2c47ffc6 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -874,8 +874,38 @@ struct aes_gcm_key_aesni { #define AES_GCM_KEY_AESNI_SIZE \ (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) -/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ -struct aes_gcm_key_avx10 { +/* Key struct used by the VAES + AVX2 implementation of AES-GCM */ +struct aes_gcm_key_vaes_avx2 { + /* + * Common part of the key. The assembly code prefers 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 32-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^8 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. + * The assembly code prefers 32-byte alignment for this. + */ + u64 h_powers[8][2] __aligned(32); + + /* + * Each entry in this array contains the two halves of an entry of + * h_powers XOR'd together, in the following order: + * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7. + * This is used for Karatsuba multiplication. + */ + u64 h_powers_xored[8]; +}; + +#define AES_GCM_KEY_VAES_AVX2(key) \ + container_of((key), struct aes_gcm_key_vaes_avx2, base) +#define AES_GCM_KEY_VAES_AVX2_SIZE \ + (sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1))) + +/* Key struct used by the VAES + AVX512 implementation of AES-GCM */ +struct aes_gcm_key_vaes_avx512 { /* * Common part of the key. 
The assembly code prefers 16-byte alignment * for the round keys; we get this by them being located at the start of @@ -895,10 +925,10 @@ struct aes_gcm_key_avx10 { /* Three padding blocks required by the assembly code */ u64 padding[3][2]; }; -#define AES_GCM_KEY_AVX10(key) \ - container_of((key), struct aes_gcm_key_avx10, base) -#define AES_GCM_KEY_AVX10_SIZE \ - (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) +#define AES_GCM_KEY_VAES_AVX512(key) \ + container_of((key), struct aes_gcm_key_vaes_avx512, base) +#define AES_GCM_KEY_VAES_AVX512_SIZE \ + (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1))) /* * These flags are passed to the AES-GCM helper functions to specify the @@ -910,14 +940,16 @@ struct aes_gcm_key_avx10 { #define FLAG_RFC4106 BIT(0) #define FLAG_ENC BIT(1) #define FLAG_AVX BIT(2) -#define FLAG_AVX10_256 BIT(3) -#define FLAG_AVX10_512 BIT(4) +#define FLAG_VAES_AVX2 BIT(3) +#define FLAG_VAES_AVX512 BIT(4) static inline struct aes_gcm_key * aes_gcm_key_get(struct crypto_aead *tfm, int flags) { - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + if (flags & FLAG_VAES_AVX512) return PTR_ALIGN(crypto_aead_ctx(tfm), 64); + else if (flags & FLAG_VAES_AVX2) + return PTR_ALIGN(crypto_aead_ctx(tfm), 32); else return PTR_ALIGN(crypto_aead_ctx(tfm), 16); } @@ -927,26 +959,16 @@ aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); asmlinkage void aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); asmlinkage void -aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); +aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); asmlinkage void -aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); +aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) { - /* - * To make things a bit easier on the assembly side, the AVX10 - * implementations use the same key format. Therefore, a single - * function using 256-bit vectors would suffice here. However, it's - * straightforward to provide a 512-bit one because of how the assembly - * code is structured, and it works nicely because the total size of the - * key powers is a multiple of 512 bits. So we take advantage of that. - * - * A similar situation applies to the AES-NI implementations. 
- */ - if (flags & FLAG_AVX10_512) - aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); - else if (flags & FLAG_AVX10_256) - aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); + if (flags & FLAG_VAES_AVX512) + aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key)); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key)); else if (flags & FLAG_AVX) aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); else @@ -960,15 +982,21 @@ asmlinkage void aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, u8 ghash_acc[16], const u8 *aad, int aadlen); asmlinkage void -aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, - u8 ghash_acc[16], const u8 *aad, int aadlen); +aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], const u8 *aad, int aadlen, int flags) { - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) - aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, - aad, aadlen); + if (flags & FLAG_VAES_AVX512) + aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + ghash_acc, aad, aadlen); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + ghash_acc, aad, aadlen); else if (flags & FLAG_AVX) aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, aad, aadlen); @@ -986,13 +1014,13 @@ aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void -aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - const u8 *src, u8 *dst, int datalen); +aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); asmlinkage void -aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - const u8 *src, u8 *dst, int datalen); +aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, @@ -1003,13 +1031,13 @@ aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void -aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - const u8 *src, u8 *dst, int datalen); +aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); asmlinkage void -aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - const u8 *src, u8 *dst, int datalen); +aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline void @@ -1018,14 +1046,14 @@ aes_gcm_update(const struct aes_gcm_key *key, const u8 *src, u8 *dst, int datalen, int flags) { if (flags & FLAG_ENC) { - 
if (flags & FLAG_AVX10_512) - aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - src, dst, datalen); - else if (flags & FLAG_AVX10_256) - aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - src, dst, datalen); + if (flags & FLAG_VAES_AVX512) + aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + le_ctr, ghash_acc, + src, dst, datalen); else if (flags & FLAG_AVX) aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, @@ -1034,14 +1062,14 @@ aes_gcm_update(const struct aes_gcm_key *key, aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, src, dst, datalen); } else { - if (flags & FLAG_AVX10_512) - aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - src, dst, datalen); - else if (flags & FLAG_AVX10_256) - aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - src, dst, datalen); + if (flags & FLAG_VAES_AVX512) + aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + le_ctr, ghash_acc, + src, dst, datalen); else if (flags & FLAG_AVX) aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, @@ -1062,9 +1090,13 @@ aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen); asmlinkage void -aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - u64 total_aadlen, u64 total_datalen); +aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void +aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline void @@ -1072,10 +1104,14 @@ aes_gcm_enc_final(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, int flags) { - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) - aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - total_aadlen, total_datalen); + if (flags & FLAG_VAES_AVX512) + aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); else if (flags & FLAG_AVX) aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, @@ -1097,10 +1133,15 @@ aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, u64 total_aadlen, u64 total_datalen, const u8 tag[16], int taglen); asmlinkage bool __must_check -aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], const u8 ghash_acc[16], - u64 total_aadlen, u64 total_datalen, - const u8 tag[16], int taglen); +aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_vaes_avx512(const struct 
aes_gcm_key_vaes_avx512 *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline bool __must_check @@ -1108,11 +1149,16 @@ aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, u8 tag[16], int taglen, int flags) { - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) - return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - total_aadlen, total_datalen, - tag, taglen); + if (flags & FLAG_VAES_AVX512) + return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else if (flags & FLAG_VAES_AVX2) + return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); else if (flags & FLAG_AVX) return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, @@ -1195,10 +1241,14 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768); if (likely(crypto_simd_usable())) { err = aes_check_keylen(keylen); @@ -1231,8 +1281,9 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, gf128mul_lle(&h, (const be128 *)x_to_the_minus1); /* Compute the needed key powers */ - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { - struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); + if (flags & FLAG_VAES_AVX512) { + struct aes_gcm_key_vaes_avx512 *k = + AES_GCM_KEY_VAES_AVX512(key); for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { k->h_powers[i][0] = be64_to_cpu(h.b); @@ -1240,6 +1291,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, gf128mul_lle(&h, &h1); } memset(k->padding, 0, sizeof(k->padding)); + } else if (flags & FLAG_VAES_AVX2) { + struct aes_gcm_key_vaes_avx2 *k = + AES_GCM_KEY_VAES_AVX2(key); + static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 }; + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + gf128mul_lle(&h, &h1); + } + for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) { + int j = indices[i]; + + k->h_powers_xored[i] = k->h_powers[j][0] ^ + k->h_powers[j][1]; + } } else { struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); 
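[Annotation: the h_powers_xored table filled in above exists because the VAES+AVX2 assembly multiplies in GF(2^128) using Karatsuba, which replaces the four 64x64-bit carry-less multiplies of the schoolbook method with three by reusing (lo ^ hi) of each operand. A self-contained sketch verifying that identity with a software carry-less multiply; clmul64_lo() is a hypothetical helper, and real code keeps the full 128-bit products and reduces them modulo the GHASH polynomial:]

#include <stdint.h>
#include <stdio.h>

/* Low 64 bits of a carry-less (polynomial) multiply over GF(2). */
static uint64_t clmul64_lo(uint64_t a, uint64_t b)
{
	uint64_t r = 0;

	for (int i = 0; i < 64; i++)
		if (b & (1ULL << i))
			r ^= a << i;
	return r;
}

int main(void)
{
	uint64_t a0 = 0x0123456789abcdefULL, a1 = 0xfedcba9876543210ULL;
	uint64_t b0 = 0x0f1e2d3c4b5a6978ULL, b1 = 0x8796a5b4c3d2e1f0ULL;

	/* Karatsuba: derive the cross terms from a single multiply of
	 * the XORed halves -- (b0 ^ b1) is exactly what h_powers_xored
	 * precomputes for the key operand.
	 */
	uint64_t mid_k = clmul64_lo(a0 ^ a1, b0 ^ b1) ^
			 clmul64_lo(a0, b0) ^ clmul64_lo(a1, b1);

	/* Schoolbook cross terms for comparison. */
	uint64_t mid_s = clmul64_lo(a0, b1) ^ clmul64_lo(a1, b0);

	printf("%s\n", mid_k == mid_s ? "identity holds" : "mismatch");
	return 0;
}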
@@ -1508,15 +1575,15 @@ DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", AES_GCM_KEY_AESNI_SIZE, 500); -/* aes_gcm_algs_vaes_avx10_256 */ -DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, - "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", - AES_GCM_KEY_AVX10_SIZE, 700); +/* aes_gcm_algs_vaes_avx2 */ +DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2, + "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2", + AES_GCM_KEY_VAES_AVX2_SIZE, 600); -/* aes_gcm_algs_vaes_avx10_512 */ -DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, - "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", - AES_GCM_KEY_AVX10_SIZE, 800); +/* aes_gcm_algs_vaes_avx512 */ +DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512, + "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512", + AES_GCM_KEY_VAES_AVX512_SIZE, 800); static int __init register_avx_algs(void) { @@ -1548,6 +1615,10 @@ static int __init register_avx_algs(void) ARRAY_SIZE(skcipher_algs_vaes_avx2)); if (err) return err; + err = crypto_register_aeads(aes_gcm_algs_vaes_avx2, + ARRAY_SIZE(aes_gcm_algs_vaes_avx2)); + if (err) + return err; if (!boot_cpu_has(X86_FEATURE_AVX512BW) || !boot_cpu_has(X86_FEATURE_AVX512VL) || @@ -1556,26 +1627,21 @@ static int __init register_avx_algs(void) XFEATURE_MASK_AVX512, NULL)) return 0; - err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256)); - if (err) - return err; - if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { int i; for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) skcipher_algs_vaes_avx512[i].base.cra_priority = 1; - for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) - aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++) + aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1; } err = crypto_register_skciphers(skcipher_algs_vaes_avx512, ARRAY_SIZE(skcipher_algs_vaes_avx512)); if (err) return err; - err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512)); + err = crypto_register_aeads(aes_gcm_algs_vaes_avx512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx512)); if (err) return err; @@ -1595,8 +1661,8 @@ static void unregister_avx_algs(void) unregister_aeads(aes_gcm_algs_aesni_avx); unregister_skciphers(skcipher_algs_vaes_avx2); unregister_skciphers(skcipher_algs_vaes_avx512); - unregister_aeads(aes_gcm_algs_vaes_avx10_256); - unregister_aeads(aes_gcm_algs_vaes_avx10_512); + unregister_aeads(aes_gcm_algs_vaes_avx2); + unregister_aeads(aes_gcm_algs_vaes_avx512); } #else /* CONFIG_X86_64 */ static struct aead_alg aes_gcm_algs_aesni[0]; diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c deleted file mode 100644 index 6b466867f91a..000000000000 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ /dev/null @@ -1,180 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Glue code for POLYVAL using PCMULQDQ-NI - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * Copyright 2021 Google LLC - */ - -/* - * Glue code based on ghash-clmulni-intel_glue.c. - * - * This implementation of POLYVAL uses montgomery multiplication - * accelerated by PCLMULQDQ-NI to implement the finite field - * operations. 
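[Annotation: the registration changes in the hunk above lean on the crypto core's priority mechanism: the VAES+AVX2 AEADs register at priority 600, the VAES+AVX512 ones at 800 (demoted to 1 on X86_FEATURE_PREFER_YMM parts), and callers need no changes because allocation by cra_name always resolves to the highest-priority provider. A sketch of the caller side; pick_best_gcm() is a hypothetical wrapper:]

#include <crypto/aead.h>

static struct crypto_aead *pick_best_gcm(void)
{
	/* Resolves "gcm(aes)" to the highest-priority registered
	 * implementation, e.g. generic-gcm-vaes-avx512 (800) on capable
	 * CPUs, falling back through generic-gcm-vaes-avx2 (600) to the
	 * AES-NI variants.
	 */
	return crypto_alloc_aead("gcm(aes)", 0, 0);
}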
- */ - -#include <asm/cpu_device_id.h> -#include <asm/fpu/api.h> -#include <crypto/internal/hash.h> -#include <crypto/polyval.h> -#include <crypto/utils.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#define POLYVAL_ALIGN 16 -#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) -#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) -#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA) -#define NUM_KEY_POWERS 8 - -struct polyval_tfm_ctx { - /* - * These powers must be in the order h^8, ..., h^1. - */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR; -}; - -struct polyval_desc_ctx { - u8 buffer[POLYVAL_BLOCK_SIZE]; -}; - -asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator); -asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); - -static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) -{ - return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN); -} - -static void internal_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator) -{ - kernel_fpu_begin(); - clmul_polyval_update(keys, in, nblocks, accumulator); - kernel_fpu_end(); -} - -static void internal_polyval_mul(u8 *op1, const u8 *op2) -{ - kernel_fpu_begin(); - clmul_polyval_mul(op1, op2); - kernel_fpu_end(); -} - -static int polyval_x86_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm); - int i; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); - - for (i = NUM_KEY_POWERS-2; i >= 0; i--) { - memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); - internal_polyval_mul(tctx->key_powers[i], - tctx->key_powers[i+1]); - } - - return 0; -} - -static int polyval_x86_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_x86_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - unsigned int nblocks; - - do { - /* Allow rescheduling every 4K bytes. 
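[Annotation: the 4 KiB cap in the deleted update loop is the standard pattern for bounding FPU-section latency: kernel_fpu_begin() disables preemption, so long inputs are processed in bounded chunks with a preemption point between them. The shape of that loop, restated as a sketch; names follow the code being removed (struct polyval_tfm_ctx was private to this file), illustrative only:]

#include <asm/fpu/api.h>
#include <crypto/polyval.h>

static void polyval_update_chunked(const struct polyval_tfm_ctx *keys,
				   const u8 *in, unsigned int srclen,
				   u8 *accumulator)
{
	while (srclen >= POLYVAL_BLOCK_SIZE) {
		/* At most 4 KiB per kernel_fpu_begin() section. */
		unsigned int nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;

		kernel_fpu_begin();
		clmul_polyval_update(keys, in, nblocks, accumulator);
		kernel_fpu_end();	/* preemption point between chunks */

		in += nblocks * POLYVAL_BLOCK_SIZE;
		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
	}
}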
*/ - nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; - internal_polyval_update(tctx, src, nblocks, dctx->buffer); - srclen -= nblocks * POLYVAL_BLOCK_SIZE; - src += nblocks * POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_x86_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - - if (len) { - crypto_xor(dctx->buffer, src, len); - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); - - return 0; -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_x86_init, - .update = polyval_x86_update, - .finup = polyval_x86_finup, - .setkey = polyval_x86_setkey, - .descsize = sizeof(struct polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-clmulni", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = POLYVAL_CTX_SIZE, - .cra_module = THIS_MODULE, - }, -}; - -__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); - -static int __init polyval_clmulni_mod_init(void) -{ - if (!x86_match_cpu(pcmul_cpu_id)) - return -ENODEV; - - if (!boot_cpu_has(X86_FEATURE_AVX)) - return -ENODEV; - - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_clmulni_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_init(polyval_clmulni_mod_init); -module_exit(polyval_clmulni_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-clmulni"); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index fc5f32d4da6e..4b1a6ade1700 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,7 +30,7 @@ enum cpuid_leafs CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, - CPUID_8000_0007_EBX, + CPUID_LNX_6, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 9e54fc0e7ed3..d90ce601917c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -314,6 +314,7 @@ #define X86_FEATURE_SM4 (12*32+ 2) /* SM4 instructions */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_LASS (12*32+ 6) /* "lass" Linear Address Space Separation */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ #define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */ #define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */ @@ -407,9 +408,12 @@ #define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ #define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ -/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ +/* + * Linux-defined word for use with scattered/synthetic bits. 
+ */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ + #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 4f84d421d1cf..20a3baae9568 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -23,18 +23,55 @@ #else /* __ASSEMBLER__ */ +/* + * The CLAC/STAC instructions toggle the enforcement of + * X86_FEATURE_SMAP along with X86_FEATURE_LASS. + * + * SMAP enforcement is based on the _PAGE_BIT_USER bit in the page + * tables. The kernel is not allowed to touch pages with that bit set + * unless the AC bit is set. + * + * Use stac()/clac() when accessing userspace (_PAGE_USER) mappings, + * regardless of location. + * + * Note: a barrier is implicit in alternative(). + */ + static __always_inline void clac(void) { - /* Note: a barrier is implicit in alternative() */ alternative("", "clac", X86_FEATURE_SMAP); } static __always_inline void stac(void) { - /* Note: a barrier is implicit in alternative() */ alternative("", "stac", X86_FEATURE_SMAP); } +/* + * LASS enforcement is based on bit 63 of the virtual address. The + * kernel is not allowed to touch memory in the lower half of the + * virtual address space. + * + * Use lass_stac()/lass_clac() to toggle the AC bit for kernel data + * accesses (!_PAGE_USER) that are blocked by LASS, but not by SMAP. + * + * Even with the AC bit set, LASS will continue to block instruction + * fetches from the user half of the address space. To allow those, + * clear CR4.LASS to disable the LASS mechanism entirely. + * + * Note: a barrier is implicit in alternative(). 
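[Annotation: lass_stac()/lass_clac() open an AC=1 window in which LASS is relaxed for kernel data accesses; objtool forbids function calls inside such a window, which is why the text-poking code further down in this series switches to the rep-string __inline_memcpy()/__inline_memset() primitives. The canonical usage shape, as a sketch; write_low_half() and its arguments are hypothetical:]

static void write_low_half(void *poke_addr, const void *src, size_t len)
{
	lass_stac();				/* open the AC=1 window */
	__inline_memcpy(poke_addr, src, len);	/* no function calls here */
	lass_clac();				/* close the window */
}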
+ */ + +static __always_inline void lass_clac(void) +{ + alternative("", "clac", X86_FEATURE_LASS); +} + +static __always_inline void lass_stac(void) +{ + alternative("", "stac", X86_FEATURE_LASS); +} + static __always_inline unsigned long smap_save(void) { unsigned long flags; diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h index c3c2c1914d65..9cb5aae7fba9 100644 --- a/arch/x86/include/asm/string.h +++ b/arch/x86/include/asm/string.h @@ -1,6 +1,32 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_STRING_H +#define _ASM_X86_STRING_H + #ifdef CONFIG_X86_32 # include <asm/string_32.h> #else # include <asm/string_64.h> #endif + +static __always_inline void *__inline_memcpy(void *to, const void *from, size_t len) +{ + void *ret = to; + + asm volatile("rep movsb" + : "+D" (to), "+S" (from), "+c" (len) + : : "memory"); + return ret; +} + +static __always_inline void *__inline_memset(void *s, int v, size_t n) +{ + void *ret = s; + + asm volatile("rep stosb" + : "+D" (s), "+c" (n) + : "a" ((uint8_t)v) + : "memory"); + return ret; +} + +#endif /* _ASM_X86_STRING_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index f1a4adc78272..81d0c8bf1137 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -136,6 +136,8 @@ #define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT) #define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */ #define X86_CR4_CET _BITUL(X86_CR4_CET_BIT) +#define X86_CR4_LASS_BIT 27 /* enable Linear Address Space Separation support */ +#define X86_CR4_LASS _BITUL(X86_CR4_LASS_BIT) #define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */ #define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e377b06e70e3..74f4c659f9c9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2453,16 +2453,30 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, __ro_after_init struct mm_struct *text_poke_mm; __ro_after_init unsigned long text_poke_mm_addr; +/* + * Text poking creates and uses a mapping in the lower half of the + * address space. Relax LASS enforcement when accessing the poking + * address. + * + * objtool enforces a strict policy of "no function calls within AC=1 + * regions". Adhere to the policy by using inline versions of + * memcpy()/memset() that will never result in a function call. + */ + static void text_poke_memcpy(void *dst, const void *src, size_t len) { - memcpy(dst, src, len); + lass_stac(); + __inline_memcpy(dst, src, len); + lass_clac(); } static void text_poke_memset(void *dst, const void *src, size_t len) { int c = *(const int *)src; - memset(dst, c, len); + lass_stac(); + __inline_memset(dst, c, len); + lass_clac(); } typedef void text_poke_f(void *dst, const void *src, size_t len); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9aae990ed7c7..e7ab22fce3b5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -406,6 +406,28 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +static __always_inline void setup_lass(struct cpuinfo_x86 *c) +{ + if (!cpu_feature_enabled(X86_FEATURE_LASS)) + return; + + /* + * Legacy vsyscall page access causes a #GP when LASS is active. + * Disable LASS because the #GP handler doesn't support vsyscall + * emulation. 
+ * + * Also disable LASS when running under EFI, as some runtime and + * boot services rely on 1:1 mappings in the lower half. + */ + if (IS_ENABLED(CONFIG_X86_VSYSCALL_EMULATION) || + IS_ENABLED(CONFIG_EFI)) { + setup_clear_cpu_cap(X86_FEATURE_LASS); + return; + } + + cr4_set_bits(X86_CR4_LASS); +} + /* These bits should not change their value after CPU init is finished. */ static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; @@ -1026,12 +1048,8 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_0001_EDX] = edx; } - if (c->extended_cpuid_level >= 0x80000007) { - cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - - c->x86_capability[CPUID_8000_0007_EBX] = ebx; - c->x86_power = edx; - } + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); if (c->extended_cpuid_level >= 0x80000008) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); @@ -2016,10 +2034,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); - /* Set up SMEP/SMAP/UMIP */ setup_smep(c); setup_smap(c); setup_umip(c); + setup_lass(c); /* Enable FSGSBASE instructions if available. */ if (cpu_has(c, X86_FEATURE_FSGSBASE)) { diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index a40f5545e25b..146f6f8b0650 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -91,6 +91,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, { X86_FEATURE_FRED, X86_FEATURE_LKGS }, { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, + { X86_FEATURE_LASS, X86_FEATURE_SMAP }, {} }; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 0524ac0260fc..cde4b6cd3471 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -45,6 +45,9 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 }, { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, + { X86_FEATURE_OVERFLOW_RECOV, CPUID_EBX, 0, 0x80000007, 0 }, + { X86_FEATURE_SUCCOR, CPUID_EBX, 1, 0x80000007, 0 }, + { X86_FEATURE_SMCA, CPUID_EBX, 3, 0x80000007, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 11e20bb13aca..4ffba68dc57b 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -95,9 +95,12 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* Leave CR4 in %r13 to enable the right paging mode later. */ movq %cr4, %r13 - /* Disable global pages immediately to ensure this mapping is RWX */ + /* + * Disable global pages immediately to ensure this mapping is RWX. + * Disable LASS before jumping to the identity mapped page. + */ movq %r13, %r12 - andq $~(X86_CR4_PGE), %r12 + andq $~(X86_CR4_PGE | X86_CR4_LASS), %r12 movq %r12, %cr4 /* Save %rsp and CRs. 
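[Annotation: the relocate_kernel change above clears PGE and LASS together before jumping to the identity-mapped page, since that page lives in the lower half of the address space that LASS blocks. A C rendering of the same bit manipulation; illustrative only, as the real code must run in assembly at that point:]

#include <asm/special_insns.h>
#include <asm/processor-flags.h>

static void relax_cr4_for_identity_map(void)
{
	unsigned long cr4 = native_read_cr4();

	/* Global pages off so the mapping can be made RWX; LASS off so
	 * the lower-half identity mapping is reachable at all.
	 */
	native_write_cr4(cr4 & ~(X86_CR4_PGE | X86_CR4_LASS));
}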
*/ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cb324cc1fd99..bcf1dedc1d00 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -732,13 +732,23 @@ DEFINE_IDTENTRY(exc_bounds) enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, - GP_CANONICAL + GP_CANONICAL, + GP_LASS_VIOLATION, + GP_NULL_POINTER, +}; + +static const char * const kernel_gp_hint_help[] = { + [GP_NON_CANONICAL] = "probably for non-canonical address", + [GP_CANONICAL] = "maybe for address", + [GP_LASS_VIOLATION] = "probably LASS violation for address", + [GP_NULL_POINTER] = "kernel NULL pointer dereference", }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure - * out whether any part of the access to that address was non-canonical. + * out whether any part of the access to that address was non-canonical or + * across privilege levels. */ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) @@ -760,14 +770,28 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, return GP_NO_HINT; #ifdef CONFIG_X86_64 + /* Operand is in the kernel half */ + if (*addr >= ~__VIRTUAL_MASK) + return GP_CANONICAL; + + /* The last byte of the operand is not in the user canonical half */ + if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) + return GP_NON_CANONICAL; + /* - * Check that: - * - the operand is not in the kernel half - * - the last byte of the operand is not in the user canonical half + * A NULL pointer dereference usually causes a #PF. However, it + * can result in a #GP when LASS is active. Provide the same + * hint in the rare case that the condition is hit without LASS. */ - if (*addr < ~__VIRTUAL_MASK && - *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) - return GP_NON_CANONICAL; + if (*addr < PAGE_SIZE) + return GP_NULL_POINTER; + + /* + * Assume that LASS caused the exception, because the address is + * canonical and in the user half. + */ + if (cpu_feature_enabled(X86_FEATURE_LASS)) + return GP_LASS_VIOLATION; #endif return GP_CANONICAL; @@ -930,9 +954,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) if (hint != GP_NO_HINT) snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", - (hint == GP_NON_CANONICAL) ? 
"probably for non-canonical address" - : "maybe for address", - gp_addr); + kernel_gp_hint_help[hint], gp_addr); /* * KASAN is interested only in the non-canonical case, clear it diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 743ab25ba787..81b4a7acf72e 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -78,7 +78,6 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, - [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, diff --git a/crypto/Kconfig b/crypto/Kconfig index a04595f9d0ca..bf8b8a60a0c0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -696,7 +696,7 @@ config CRYPTO_ECB config CRYPTO_HCTR2 tristate "HCTR2" select CRYPTO_XCTR - select CRYPTO_POLYVAL + select CRYPTO_LIB_POLYVAL select CRYPTO_MANAGER help HCTR2 length-preserving encryption mode @@ -881,6 +881,7 @@ menu "Hashes, digests, and MACs" config CRYPTO_BLAKE2B tristate "BLAKE2b" select CRYPTO_HASH + select CRYPTO_LIB_BLAKE2B help BLAKE2b cryptographic hash function (RFC 7693) @@ -947,16 +948,6 @@ config CRYPTO_MICHAEL_MIC This algorithm is required for TKIP, but it should not be used for other purposes because of the weakness of the algorithm. -config CRYPTO_POLYVAL - tristate - select CRYPTO_HASH - select CRYPTO_LIB_GF128MUL - help - POLYVAL hash function for HCTR2 - - This is used in HCTR2. It is not a general-purpose - cryptographic hash function. - config CRYPTO_RMD160 tristate "RIPEMD-160" select CRYPTO_HASH @@ -1005,6 +996,7 @@ config CRYPTO_SHA512 config CRYPTO_SHA3 tristate "SHA-3" select CRYPTO_HASH + select CRYPTO_LIB_SHA3 help SHA-3 secure hash algorithms (FIPS 202, ISO/IEC 10118-3) diff --git a/crypto/Makefile b/crypto/Makefile index e430e6e99b6a..093c56a45d3f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -78,13 +78,12 @@ obj-$(CONFIG_CRYPTO_RMD160) += rmd160.o obj-$(CONFIG_CRYPTO_SHA1) += sha1.o obj-$(CONFIG_CRYPTO_SHA256) += sha256.o obj-$(CONFIG_CRYPTO_SHA512) += sha512.o -obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o +obj-$(CONFIG_CRYPTO_SHA3) += sha3.o obj-$(CONFIG_CRYPTO_SM3_GENERIC) += sm3_generic.o obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o obj-$(CONFIG_CRYPTO_WP512) += wp512.o CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149 -obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b_generic.o -CFLAGS_blake2b_generic.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 +obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b.o obj-$(CONFIG_CRYPTO_ECB) += ecb.o obj-$(CONFIG_CRYPTO_CBC) += cbc.o obj-$(CONFIG_CRYPTO_PCBC) += pcbc.o @@ -173,7 +172,6 @@ jitterentropy_rng-y := jitterentropy.o jitterentropy-kcapi.o obj-$(CONFIG_CRYPTO_JITTERENTROPY_TESTINTERFACE) += jitterentropy-testing.o obj-$(CONFIG_CRYPTO_BENCHMARK) += tcrypt.o obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o -obj-$(CONFIG_CRYPTO_POLYVAL) += polyval-generic.o obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c index 9ee50549e823..b41807e63bd3 100644 --- a/crypto/aegis128-neon.c +++ b/crypto/aegis128-neon.c @@ -4,7 +4,7 @@ */ #include <asm/cpufeature.h> -#include <asm/neon.h> +#include <asm/simd.h> #include "aegis.h" 
#include "aegis-neon.h" @@ -24,32 +24,28 @@ void crypto_aegis128_init_simd(struct aegis_state *state, const union aegis_block *key, const u8 *iv) { - kernel_neon_begin(); - crypto_aegis128_init_neon(state, key, iv); - kernel_neon_end(); + scoped_ksimd() + crypto_aegis128_init_neon(state, key, iv); } void crypto_aegis128_update_simd(struct aegis_state *state, const void *msg) { - kernel_neon_begin(); - crypto_aegis128_update_neon(state, msg); - kernel_neon_end(); + scoped_ksimd() + crypto_aegis128_update_neon(state, msg); } void crypto_aegis128_encrypt_chunk_simd(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { - kernel_neon_begin(); - crypto_aegis128_encrypt_chunk_neon(state, dst, src, size); - kernel_neon_end(); + scoped_ksimd() + crypto_aegis128_encrypt_chunk_neon(state, dst, src, size); } void crypto_aegis128_decrypt_chunk_simd(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { - kernel_neon_begin(); - crypto_aegis128_decrypt_chunk_neon(state, dst, src, size); - kernel_neon_end(); + scoped_ksimd() + crypto_aegis128_decrypt_chunk_neon(state, dst, src, size); } int crypto_aegis128_final_simd(struct aegis_state *state, @@ -58,12 +54,7 @@ int crypto_aegis128_final_simd(struct aegis_state *state, unsigned int cryptlen, unsigned int authsize) { - int ret; - - kernel_neon_begin(); - ret = crypto_aegis128_final_neon(state, tag_xor, assoclen, cryptlen, - authsize); - kernel_neon_end(); - - return ret; + scoped_ksimd() + return crypto_aegis128_final_neon(state, tag_xor, assoclen, + cryptlen, authsize); } diff --git a/crypto/blake2b.c b/crypto/blake2b.c new file mode 100644 index 000000000000..67a6dae43a54 --- /dev/null +++ b/crypto/blake2b.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Crypto API support for BLAKE2b + * + * Copyright 2025 Google LLC + */ +#include <crypto/blake2b.h> +#include <crypto/internal/hash.h> +#include <linux/kernel.h> +#include <linux/module.h> + +struct blake2b_tfm_ctx { + unsigned int keylen; + u8 key[BLAKE2B_KEY_SIZE]; +}; + +static int crypto_blake2b_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm); + + if (keylen > BLAKE2B_KEY_SIZE) + return -EINVAL; + memcpy(tctx->key, key, keylen); + tctx->keylen = keylen; + return 0; +} + +#define BLAKE2B_CTX(desc) ((struct blake2b_ctx *)shash_desc_ctx(desc)) + +static int crypto_blake2b_init(struct shash_desc *desc) +{ + const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + unsigned int digestsize = crypto_shash_digestsize(desc->tfm); + + blake2b_init_key(BLAKE2B_CTX(desc), digestsize, + tctx->key, tctx->keylen); + return 0; +} + +static int crypto_blake2b_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + blake2b_update(BLAKE2B_CTX(desc), data, len); + return 0; +} + +static int crypto_blake2b_final(struct shash_desc *desc, u8 *out) +{ + blake2b_final(BLAKE2B_CTX(desc), out); + return 0; +} + +static int crypto_blake2b_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + unsigned int digestsize = crypto_shash_digestsize(desc->tfm); + + blake2b(tctx->key, tctx->keylen, data, len, out, digestsize); + return 0; +} + +#define BLAKE2B_ALG(name, digest_size) \ + { \ + .base.cra_name = name, \ + .base.cra_driver_name = name "-lib", \ + .base.cra_priority = 300, \ + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ + .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ + 
.base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ + .base.cra_module = THIS_MODULE, \ + .digestsize = digest_size, \ + .setkey = crypto_blake2b_setkey, \ + .init = crypto_blake2b_init, \ + .update = crypto_blake2b_update, \ + .final = crypto_blake2b_final, \ + .digest = crypto_blake2b_digest, \ + .descsize = sizeof(struct blake2b_ctx), \ + } + +static struct shash_alg algs[] = { + BLAKE2B_ALG("blake2b-160", BLAKE2B_160_HASH_SIZE), + BLAKE2B_ALG("blake2b-256", BLAKE2B_256_HASH_SIZE), + BLAKE2B_ALG("blake2b-384", BLAKE2B_384_HASH_SIZE), + BLAKE2B_ALG("blake2b-512", BLAKE2B_512_HASH_SIZE), +}; + +static int __init crypto_blake2b_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} +module_init(crypto_blake2b_mod_init); + +static void __exit crypto_blake2b_mod_exit(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} +module_exit(crypto_blake2b_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Crypto API support for BLAKE2b"); + +MODULE_ALIAS_CRYPTO("blake2b-160"); +MODULE_ALIAS_CRYPTO("blake2b-160-lib"); +MODULE_ALIAS_CRYPTO("blake2b-256"); +MODULE_ALIAS_CRYPTO("blake2b-256-lib"); +MODULE_ALIAS_CRYPTO("blake2b-384"); +MODULE_ALIAS_CRYPTO("blake2b-384-lib"); +MODULE_ALIAS_CRYPTO("blake2b-512"); +MODULE_ALIAS_CRYPTO("blake2b-512-lib"); diff --git a/crypto/blake2b_generic.c b/crypto/blake2b_generic.c deleted file mode 100644 index 60f056217510..000000000000 --- a/crypto/blake2b_generic.c +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-License-Identifier: (GPL-2.0-only OR Apache-2.0) -/* - * Generic implementation of the BLAKE2b digest algorithm. Based on the BLAKE2b - * reference implementation, but it has been heavily modified for use in the - * kernel. The reference implementation was: - * - * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under - * the terms of the CC0, the OpenSSL Licence, or the Apache Public License - * 2.0, at your option. The terms of these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - OpenSSL license : https://www.openssl.org/source/license.html - * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 - * - * More information about BLAKE2 can be found at https://blake2.net. 
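[Annotation: the new crypto/blake2b.c above is a thin shash wrapper; all hashing goes through the BLAKE2b library interface. The two entry points it uses, collected in one sketch; blake2b_demo() is a hypothetical caller, and the calls mirror those in the glue code above:]

#include <crypto/blake2b.h>

static void blake2b_demo(const u8 *key, size_t keylen,
			 const u8 *msg, size_t len,
			 u8 out[BLAKE2B_256_HASH_SIZE])
{
	struct blake2b_ctx ctx;

	/* Incremental form, as used by the init/update/final path. */
	blake2b_init_key(&ctx, BLAKE2B_256_HASH_SIZE, key, keylen);
	blake2b_update(&ctx, msg, len);
	blake2b_final(&ctx, out);

	/* One-shot form, as used by the ->digest() path. */
	blake2b(key, keylen, msg, len, out, BLAKE2B_256_HASH_SIZE);
}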
- */ - -#include <crypto/internal/blake2b.h> -#include <crypto/internal/hash.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/unaligned.h> - -static const u8 blake2b_sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - -static void blake2b_increment_counter(struct blake2b_state *S, const u64 inc) -{ - S->t[0] += inc; - S->t[1] += (S->t[0] < inc); -} - -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = ror64(d ^ a, 32); \ - c = c + d; \ - b = ror64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = ror64(d ^ a, 16); \ - c = c + d; \ - b = ror64(b ^ c, 63); \ - } while (0) - -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while (0) - -static void blake2b_compress_one_generic(struct blake2b_state *S, - const u8 block[BLAKE2B_BLOCK_SIZE]) -{ - u64 m[16]; - u64 v[16]; - size_t i; - - for (i = 0; i < 16; ++i) - m[i] = get_unaligned_le64(block + i * sizeof(m[i])); - - for (i = 0; i < 8; ++i) - v[i] = S->h[i]; - - v[ 8] = BLAKE2B_IV0; - v[ 9] = BLAKE2B_IV1; - v[10] = BLAKE2B_IV2; - v[11] = BLAKE2B_IV3; - v[12] = BLAKE2B_IV4 ^ S->t[0]; - v[13] = BLAKE2B_IV5 ^ S->t[1]; - v[14] = BLAKE2B_IV6 ^ S->f[0]; - v[15] = BLAKE2B_IV7 ^ S->f[1]; - - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - ROUND(10); - ROUND(11); -#ifdef CONFIG_CC_IS_CLANG -#pragma nounroll /* https://llvm.org/pr45803 */ -#endif - for (i = 0; i < 8; ++i) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; -} - -#undef G -#undef ROUND - -static void blake2b_compress_generic(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc) -{ - do { - blake2b_increment_counter(state, inc); - blake2b_compress_one_generic(state, block); - block += BLAKE2B_BLOCK_SIZE; - } while (--nblocks); -} - -static int crypto_blake2b_update_generic(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2b_update_bo(desc, in, inlen, - blake2b_compress_generic); -} - -static int crypto_blake2b_finup_generic(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out) -{ - return crypto_blake2b_finup(desc, in, inlen, out, - blake2b_compress_generic); -} - -#define BLAKE2B_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 100, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY | \ - CRYPTO_AHASH_ALG_BLOCK_ONLY | \ - CRYPTO_AHASH_ALG_FINAL_NONZERO, \ - .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct 
blake2b_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2b_setkey, \ - .init = crypto_blake2b_init, \ - .update = crypto_blake2b_update_generic, \ - .finup = crypto_blake2b_finup_generic, \ - .descsize = BLAKE2B_DESC_SIZE, \ - .statesize = BLAKE2B_STATE_SIZE, \ - } - -static struct shash_alg blake2b_algs[] = { - BLAKE2B_ALG("blake2b-160", "blake2b-160-generic", - BLAKE2B_160_HASH_SIZE), - BLAKE2B_ALG("blake2b-256", "blake2b-256-generic", - BLAKE2B_256_HASH_SIZE), - BLAKE2B_ALG("blake2b-384", "blake2b-384-generic", - BLAKE2B_384_HASH_SIZE), - BLAKE2B_ALG("blake2b-512", "blake2b-512-generic", - BLAKE2B_512_HASH_SIZE), -}; - -static int __init blake2b_mod_init(void) -{ - return crypto_register_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); -} - -static void __exit blake2b_mod_fini(void) -{ - crypto_unregister_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); -} - -module_init(blake2b_mod_init); -module_exit(blake2b_mod_fini); - -MODULE_AUTHOR("David Sterba <kdave@kernel.org>"); -MODULE_DESCRIPTION("BLAKE2b generic implementation"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("blake2b-160"); -MODULE_ALIAS_CRYPTO("blake2b-160-generic"); -MODULE_ALIAS_CRYPTO("blake2b-256"); -MODULE_ALIAS_CRYPTO("blake2b-256-generic"); -MODULE_ALIAS_CRYPTO("blake2b-384"); -MODULE_ALIAS_CRYPTO("blake2b-384-generic"); -MODULE_ALIAS_CRYPTO("blake2b-512"); -MODULE_ALIAS_CRYPTO("blake2b-512-generic"); diff --git a/crypto/hctr2.c b/crypto/hctr2.c index c8932777bba8..f4cd6c29b4d3 100644 --- a/crypto/hctr2.c +++ b/crypto/hctr2.c @@ -17,7 +17,6 @@ */ #include <crypto/internal/cipher.h> -#include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/polyval.h> #include <crypto/scatterwalk.h> @@ -37,23 +36,14 @@ struct hctr2_instance_ctx { struct crypto_cipher_spawn blockcipher_spawn; struct crypto_skcipher_spawn xctr_spawn; - struct crypto_shash_spawn polyval_spawn; }; struct hctr2_tfm_ctx { struct crypto_cipher *blockcipher; struct crypto_skcipher *xctr; - struct crypto_shash *polyval; + struct polyval_key poly_key; + struct polyval_elem hashed_tweaklens[2]; u8 L[BLOCKCIPHER_BLOCK_SIZE]; - int hashed_tweak_offset; - /* - * This struct is allocated with extra space for two exported hash - * states. Since the hash state size is not known at compile-time, we - * can't add these to the struct directly. - * - * hashed_tweaklen_divisible; - * hashed_tweaklen_remainder; - */ }; struct hctr2_request_ctx { @@ -63,39 +53,17 @@ struct hctr2_request_ctx { struct scatterlist *bulk_part_src; struct scatterlist sg_src[2]; struct scatterlist sg_dst[2]; + struct polyval_elem hashed_tweak; /* - * Sub-request sizes are unknown at compile-time, so they need to go - * after the members with known sizes. + * skcipher sub-request size is unknown at compile-time, so it needs to + * go after the members with known sizes. */ union { - struct shash_desc hash_desc; + struct polyval_ctx poly_ctx; struct skcipher_request xctr_req; } u; - /* - * This struct is allocated with extra space for one exported hash - * state. Since the hash state size is not known at compile-time, we - * can't add it to the struct directly. 
- * - * hashed_tweak; - */ }; -static inline u8 *hctr2_hashed_tweaklen(const struct hctr2_tfm_ctx *tctx, - bool has_remainder) -{ - u8 *p = (u8 *)tctx + sizeof(*tctx); - - if (has_remainder) /* For messages not a multiple of block length */ - p += crypto_shash_statesize(tctx->polyval); - return p; -} - -static inline u8 *hctr2_hashed_tweak(const struct hctr2_tfm_ctx *tctx, - struct hctr2_request_ctx *rctx) -{ - return (u8 *)rctx + tctx->hashed_tweak_offset; -} - /* * The input data for each HCTR2 hash step begins with a 16-byte block that * contains the tweak length and a flag that indicates whether the input is evenly @@ -106,24 +74,23 @@ static inline u8 *hctr2_hashed_tweak(const struct hctr2_tfm_ctx *tctx, * * These precomputed hashes are stored in hctr2_tfm_ctx. */ -static int hctr2_hash_tweaklen(struct hctr2_tfm_ctx *tctx, bool has_remainder) +static void hctr2_hash_tweaklens(struct hctr2_tfm_ctx *tctx) { - SHASH_DESC_ON_STACK(shash, tfm->polyval); - __le64 tweak_length_block[2]; - int err; - - shash->tfm = tctx->polyval; - memset(tweak_length_block, 0, sizeof(tweak_length_block)); - - tweak_length_block[0] = cpu_to_le64(TWEAK_SIZE * 8 * 2 + 2 + has_remainder); - err = crypto_shash_init(shash); - if (err) - return err; - err = crypto_shash_update(shash, (u8 *)tweak_length_block, - POLYVAL_BLOCK_SIZE); - if (err) - return err; - return crypto_shash_export(shash, hctr2_hashed_tweaklen(tctx, has_remainder)); + struct polyval_ctx ctx; + + for (int has_remainder = 0; has_remainder < 2; has_remainder++) { + const __le64 tweak_length_block[2] = { + cpu_to_le64(TWEAK_SIZE * 8 * 2 + 2 + has_remainder), + }; + + polyval_init(&ctx, &tctx->poly_key); + polyval_update(&ctx, (const u8 *)&tweak_length_block, + sizeof(tweak_length_block)); + static_assert(sizeof(tweak_length_block) == POLYVAL_BLOCK_SIZE); + polyval_export_blkaligned( + &ctx, &tctx->hashed_tweaklens[has_remainder]); + } + memzero_explicit(&ctx, sizeof(ctx)); } static int hctr2_setkey(struct crypto_skcipher *tfm, const u8 *key, @@ -156,51 +123,42 @@ static int hctr2_setkey(struct crypto_skcipher *tfm, const u8 *key, tctx->L[0] = 0x01; crypto_cipher_encrypt_one(tctx->blockcipher, tctx->L, tctx->L); - crypto_shash_clear_flags(tctx->polyval, CRYPTO_TFM_REQ_MASK); - crypto_shash_set_flags(tctx->polyval, crypto_skcipher_get_flags(tfm) & - CRYPTO_TFM_REQ_MASK); - err = crypto_shash_setkey(tctx->polyval, hbar, BLOCKCIPHER_BLOCK_SIZE); - if (err) - return err; + static_assert(sizeof(hbar) == POLYVAL_BLOCK_SIZE); + polyval_preparekey(&tctx->poly_key, hbar); memzero_explicit(hbar, sizeof(hbar)); - return hctr2_hash_tweaklen(tctx, true) ?: hctr2_hash_tweaklen(tctx, false); + hctr2_hash_tweaklens(tctx); + return 0; } -static int hctr2_hash_tweak(struct skcipher_request *req) +static void hctr2_hash_tweak(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); - struct shash_desc *hash_desc = &rctx->u.hash_desc; - int err; + struct polyval_ctx *poly_ctx = &rctx->u.poly_ctx; bool has_remainder = req->cryptlen % POLYVAL_BLOCK_SIZE; - hash_desc->tfm = tctx->polyval; - err = crypto_shash_import(hash_desc, hctr2_hashed_tweaklen(tctx, has_remainder)); - if (err) - return err; - err = crypto_shash_update(hash_desc, req->iv, TWEAK_SIZE); - if (err) - return err; + polyval_import_blkaligned(poly_ctx, &tctx->poly_key, + &tctx->hashed_tweaklens[has_remainder]); + polyval_update(poly_ctx, req->iv, 
TWEAK_SIZE); // Store the hashed tweak, since we need it when computing both // H(T || N) and H(T || V). - return crypto_shash_export(hash_desc, hctr2_hashed_tweak(tctx, rctx)); + static_assert(TWEAK_SIZE % POLYVAL_BLOCK_SIZE == 0); + polyval_export_blkaligned(poly_ctx, &rctx->hashed_tweak); } -static int hctr2_hash_message(struct skcipher_request *req, - struct scatterlist *sgl, - u8 digest[POLYVAL_DIGEST_SIZE]) +static void hctr2_hash_message(struct skcipher_request *req, + struct scatterlist *sgl, + u8 digest[POLYVAL_DIGEST_SIZE]) { - static const u8 padding[BLOCKCIPHER_BLOCK_SIZE] = { 0x1 }; + static const u8 padding = 0x1; struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); - struct shash_desc *hash_desc = &rctx->u.hash_desc; + struct polyval_ctx *poly_ctx = &rctx->u.poly_ctx; const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; struct sg_mapping_iter miter; - unsigned int remainder = bulk_len % BLOCKCIPHER_BLOCK_SIZE; int i; - int err = 0; int n = 0; sg_miter_start(&miter, sgl, sg_nents(sgl), @@ -208,22 +166,13 @@ static int hctr2_hash_message(struct skcipher_request *req, for (i = 0; i < bulk_len; i += n) { sg_miter_next(&miter); n = min_t(unsigned int, miter.length, bulk_len - i); - err = crypto_shash_update(hash_desc, miter.addr, n); - if (err) - break; + polyval_update(poly_ctx, miter.addr, n); } sg_miter_stop(&miter); - if (err) - return err; - - if (remainder) { - err = crypto_shash_update(hash_desc, padding, - BLOCKCIPHER_BLOCK_SIZE - remainder); - if (err) - return err; - } - return crypto_shash_final(hash_desc, digest); + if (req->cryptlen % BLOCKCIPHER_BLOCK_SIZE) + polyval_update(poly_ctx, &padding, 1); + polyval_final(poly_ctx, digest); } static int hctr2_finish(struct skcipher_request *req) @@ -231,19 +180,14 @@ static int hctr2_finish(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); + struct polyval_ctx *poly_ctx = &rctx->u.poly_ctx; u8 digest[POLYVAL_DIGEST_SIZE]; - struct shash_desc *hash_desc = &rctx->u.hash_desc; - int err; // U = UU ^ H(T || V) // or M = MM ^ H(T || N) - hash_desc->tfm = tctx->polyval; - err = crypto_shash_import(hash_desc, hctr2_hashed_tweak(tctx, rctx)); - if (err) - return err; - err = hctr2_hash_message(req, rctx->bulk_part_dst, digest); - if (err) - return err; + polyval_import_blkaligned(poly_ctx, &tctx->poly_key, + &rctx->hashed_tweak); + hctr2_hash_message(req, rctx->bulk_part_dst, digest); crypto_xor(rctx->first_block, digest, BLOCKCIPHER_BLOCK_SIZE); // Copy U (or M) into dst scatterlist @@ -269,7 +213,6 @@ static int hctr2_crypt(struct skcipher_request *req, bool enc) struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); u8 digest[POLYVAL_DIGEST_SIZE]; int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; - int err; // Requests must be at least one block if (req->cryptlen < BLOCKCIPHER_BLOCK_SIZE) @@ -287,12 +230,8 @@ static int hctr2_crypt(struct skcipher_request *req, bool enc) // MM = M ^ H(T || N) // or UU = U ^ H(T || V) - err = hctr2_hash_tweak(req); - if (err) - return err; - err = hctr2_hash_message(req, rctx->bulk_part_src, digest); - if (err) - return err; + hctr2_hash_tweak(req); + hctr2_hash_message(req, rctx->bulk_part_src, digest); crypto_xor(digest, rctx->first_block, BLOCKCIPHER_BLOCK_SIZE); // UU = E(MM) @@ -338,8 +277,6 @@ static int hctr2_init_tfm(struct crypto_skcipher *tfm) struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); 
struct crypto_skcipher *xctr; struct crypto_cipher *blockcipher; - struct crypto_shash *polyval; - unsigned int subreq_size; int err; xctr = crypto_spawn_skcipher(&ictx->xctr_spawn); @@ -352,31 +289,17 @@ static int hctr2_init_tfm(struct crypto_skcipher *tfm) goto err_free_xctr; } - polyval = crypto_spawn_shash(&ictx->polyval_spawn); - if (IS_ERR(polyval)) { - err = PTR_ERR(polyval); - goto err_free_blockcipher; - } - tctx->xctr = xctr; tctx->blockcipher = blockcipher; - tctx->polyval = polyval; BUILD_BUG_ON(offsetofend(struct hctr2_request_ctx, u) != sizeof(struct hctr2_request_ctx)); - subreq_size = max(sizeof_field(struct hctr2_request_ctx, u.hash_desc) + - crypto_shash_descsize(polyval), - sizeof_field(struct hctr2_request_ctx, u.xctr_req) + - crypto_skcipher_reqsize(xctr)); - - tctx->hashed_tweak_offset = offsetof(struct hctr2_request_ctx, u) + - subreq_size; - crypto_skcipher_set_reqsize(tfm, tctx->hashed_tweak_offset + - crypto_shash_statesize(polyval)); + crypto_skcipher_set_reqsize( + tfm, max(sizeof(struct hctr2_request_ctx), + offsetofend(struct hctr2_request_ctx, u.xctr_req) + + crypto_skcipher_reqsize(xctr))); return 0; -err_free_blockcipher: - crypto_free_cipher(blockcipher); err_free_xctr: crypto_free_skcipher(xctr); return err; @@ -388,7 +311,6 @@ static void hctr2_exit_tfm(struct crypto_skcipher *tfm) crypto_free_cipher(tctx->blockcipher); crypto_free_skcipher(tctx->xctr); - crypto_free_shash(tctx->polyval); } static void hctr2_free_instance(struct skcipher_instance *inst) @@ -397,21 +319,17 @@ static void hctr2_free_instance(struct skcipher_instance *inst) crypto_drop_cipher(&ictx->blockcipher_spawn); crypto_drop_skcipher(&ictx->xctr_spawn); - crypto_drop_shash(&ictx->polyval_spawn); kfree(inst); } -static int hctr2_create_common(struct crypto_template *tmpl, - struct rtattr **tb, - const char *xctr_name, - const char *polyval_name) +static int hctr2_create_common(struct crypto_template *tmpl, struct rtattr **tb, + const char *xctr_name) { struct skcipher_alg_common *xctr_alg; u32 mask; struct skcipher_instance *inst; struct hctr2_instance_ctx *ictx; struct crypto_alg *blockcipher_alg; - struct shash_alg *polyval_alg; char blockcipher_name[CRYPTO_MAX_ALG_NAME]; int len; int err; @@ -457,19 +375,6 @@ static int hctr2_create_common(struct crypto_template *tmpl, if (blockcipher_alg->cra_blocksize != BLOCKCIPHER_BLOCK_SIZE) goto err_free_inst; - /* Polyval ε-∆U hash function */ - err = crypto_grab_shash(&ictx->polyval_spawn, - skcipher_crypto_instance(inst), - polyval_name, 0, mask); - if (err) - goto err_free_inst; - polyval_alg = crypto_spawn_shash_alg(&ictx->polyval_spawn); - - /* Ensure Polyval is being used */ - err = -EINVAL; - if (strcmp(polyval_alg->base.cra_name, "polyval") != 0) - goto err_free_inst; - /* Instance fields */ err = -ENAMETOOLONG; @@ -477,22 +382,16 @@ static int hctr2_create_common(struct crypto_template *tmpl, blockcipher_alg->cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, - "hctr2_base(%s,%s)", - xctr_alg->base.cra_driver_name, - polyval_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) + "hctr2_base(%s,polyval-lib)", + xctr_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE; - inst->alg.base.cra_ctxsize = sizeof(struct hctr2_tfm_ctx) + - polyval_alg->statesize * 2; + inst->alg.base.cra_ctxsize = sizeof(struct hctr2_tfm_ctx); inst->alg.base.cra_alignmask = xctr_alg->base.cra_alignmask; - /* - * The hash 
function is called twice, so it is weighted higher than the - * xctr and blockcipher. - */ inst->alg.base.cra_priority = (2 * xctr_alg->base.cra_priority + - 4 * polyval_alg->base.cra_priority + - blockcipher_alg->cra_priority) / 7; + blockcipher_alg->cra_priority) / + 3; inst->alg.setkey = hctr2_setkey; inst->alg.encrypt = hctr2_encrypt; @@ -525,8 +424,11 @@ static int hctr2_create_base(struct crypto_template *tmpl, struct rtattr **tb) polyval_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(polyval_name)) return PTR_ERR(polyval_name); + if (strcmp(polyval_name, "polyval") != 0 && + strcmp(polyval_name, "polyval-lib") != 0) + return -ENOENT; - return hctr2_create_common(tmpl, tb, xctr_name, polyval_name); + return hctr2_create_common(tmpl, tb, xctr_name); } static int hctr2_create(struct crypto_template *tmpl, struct rtattr **tb) @@ -542,7 +444,7 @@ static int hctr2_create(struct crypto_template *tmpl, struct rtattr **tb) blockcipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; - return hctr2_create_common(tmpl, tb, xctr_name, "polyval"); + return hctr2_create_common(tmpl, tb, xctr_name); } static struct crypto_template hctr2_tmpls[] = { diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index a53de7affe8d..7c880cf34c52 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -48,7 +48,7 @@ #include "jitterentropy.h" -#define JENT_CONDITIONING_HASH "sha3-256-generic" +#define JENT_CONDITIONING_HASH "sha3-256" /*************************************************************************** * Helper function @@ -230,15 +230,7 @@ static int jent_kcapi_init(struct crypto_tfm *tfm) spin_lock_init(&rng->jent_lock); - /* - * Use SHA3-256 as conditioner. We allocate only the generic - * implementation as we are not interested in high-performance. The - * execution time of the SHA3 operation is measured and adds to the - * Jitter RNG's unpredictable behavior. If we have a slower hash - * implementation, the execution timing variations are larger. When - * using a fast implementation, we would need to call it more often - * as its variations are lower. - */ + /* Use SHA3-256 as conditioner */ hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); if (IS_ERR(hash)) { pr_err("Cannot allocate conditioning digest\n"); diff --git a/crypto/polyval-generic.c b/crypto/polyval-generic.c deleted file mode 100644 index db8adb56e4ca..000000000000 --- a/crypto/polyval-generic.c +++ /dev/null @@ -1,205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * POLYVAL: hash function for HCTR2. - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * Copyright 2021 Google LLC - */ - -/* - * Code based on crypto/ghash-generic.c - * - * POLYVAL is a keyed hash function similar to GHASH. POLYVAL uses a different - * modulus for finite field multiplication which makes hardware accelerated - * implementations on little-endian machines faster. POLYVAL is used in the - * kernel to implement HCTR2, but was originally specified for AES-GCM-SIV - * (RFC 8452). - * - * For more information see: - * Length-preserving encryption with HCTR2: - * https://eprint.iacr.org/2021/1441.pdf - * AES-GCM-SIV: Nonce Misuse-Resistant Authenticated Encryption: - * https://datatracker.ietf.org/doc/html/rfc8452 - * - * Like GHASH, POLYVAL is not a cryptographic hash function and should - * not be used outside of crypto modes explicitly designed to use POLYVAL. 
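POLYVAL's only in-kernel user is the HCTR2 template reworked above. For orientation, a minimal sketch of a hypothetical caller allocating that mode by name (standard crypto API calls; the function and key handling here are illustrative only, not part of the patch):

	#include <crypto/skcipher.h>

	static int hctr2_example(const u8 *key, unsigned int keylen)
	{
		/* "hctr2(aes)" resolves to the template instantiated above;
		 * its polyval component is now always the library code. */
		struct crypto_skcipher *tfm = crypto_alloc_skcipher("hctr2(aes)", 0, 0);
		int err;

		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_skcipher_setkey(tfm, key, keylen);
		/* ... allocate a skcipher_request, set the tweak, encrypt ... */
		crypto_free_skcipher(tfm);
		return err;
	}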
- * - * This implementation uses a convenient trick involving the GHASH and POLYVAL - * fields. This trick allows multiplication in the POLYVAL field to be - * implemented by using multiplication in the GHASH field as a subroutine. An - * element of the POLYVAL field can be converted to an element of the GHASH - * field by computing x*REVERSE(a), where REVERSE reverses the byte-ordering of - * a. Similarly, an element of the GHASH field can be converted back to the - * POLYVAL field by computing REVERSE(x^{-1}*a). For more information, see: - * https://datatracker.ietf.org/doc/html/rfc8452#appendix-A - * - * By using this trick, we do not need to implement the POLYVAL field for the - * generic implementation. - * - * Warning: this generic implementation is not intended to be used in practice - * and is not constant time. For practical use, a hardware accelerated - * implementation of POLYVAL should be used instead. - * - */ - -#include <crypto/gf128mul.h> -#include <crypto/internal/hash.h> -#include <crypto/polyval.h> -#include <crypto/utils.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/unaligned.h> - -struct polyval_tfm_ctx { - struct gf128mul_4k *gf128; -}; - -struct polyval_desc_ctx { - union { - u8 buffer[POLYVAL_BLOCK_SIZE]; - be128 buffer128; - }; -}; - -static void copy_and_reverse(u8 dst[POLYVAL_BLOCK_SIZE], - const u8 src[POLYVAL_BLOCK_SIZE]) -{ - u64 a = get_unaligned((const u64 *)&src[0]); - u64 b = get_unaligned((const u64 *)&src[8]); - - put_unaligned(swab64(a), (u64 *)&dst[8]); - put_unaligned(swab64(b), (u64 *)&dst[0]); -} - -static int polyval_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *ctx = crypto_shash_ctx(tfm); - be128 k; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - gf128mul_free_4k(ctx->gf128); - - BUILD_BUG_ON(sizeof(k) != POLYVAL_BLOCK_SIZE); - copy_and_reverse((u8 *)&k, key); - gf128mul_x_lle(&k, &k); - - ctx->gf128 = gf128mul_init_4k_lle(&k); - memzero_explicit(&k, POLYVAL_BLOCK_SIZE); - - if (!ctx->gf128) - return -ENOMEM; - - return 0; -} - -static int polyval_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *ctx = crypto_shash_ctx(desc->tfm); - u8 tmp[POLYVAL_BLOCK_SIZE]; - - do { - copy_and_reverse(tmp, src); - crypto_xor(dctx->buffer, tmp, POLYVAL_BLOCK_SIZE); - gf128mul_4k_lle(&dctx->buffer128, ctx->gf128); - src += POLYVAL_BLOCK_SIZE; - srclen -= POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - if (len) { - u8 tmp[POLYVAL_BLOCK_SIZE] = {}; - - memcpy(tmp, src, len); - polyval_update(desc, tmp, POLYVAL_BLOCK_SIZE); - } - copy_and_reverse(dst, dctx->buffer); - return 0; -} - -static int polyval_export(struct shash_desc *desc, void *out) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - copy_and_reverse(out, dctx->buffer); - return 0; -} - -static int polyval_import(struct shash_desc *desc, const void *in) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - copy_and_reverse(dctx->buffer, in); - return 0; -} - -static void 
polyval_exit_tfm(struct crypto_shash *tfm) -{ - struct polyval_tfm_ctx *ctx = crypto_shash_ctx(tfm); - - gf128mul_free_4k(ctx->gf128); -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_init, - .update = polyval_update, - .finup = polyval_finup, - .setkey = polyval_setkey, - .export = polyval_export, - .import = polyval_import, - .exit_tfm = polyval_exit_tfm, - .statesize = sizeof(struct polyval_desc_ctx), - .descsize = sizeof(struct polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct polyval_tfm_ctx), - .cra_module = THIS_MODULE, - }, -}; - -static int __init polyval_mod_init(void) -{ - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_init(polyval_mod_init); -module_exit(polyval_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("POLYVAL hash function"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-generic"); diff --git a/crypto/sha3.c b/crypto/sha3.c new file mode 100644 index 000000000000..8f364979ec89 --- /dev/null +++ b/crypto/sha3.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Crypto API support for SHA-3 + * (https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf) + */ +#include <crypto/internal/hash.h> +#include <crypto/sha3.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#define SHA3_CTX(desc) ((struct sha3_ctx *)shash_desc_ctx(desc)) + +static int crypto_sha3_224_init(struct shash_desc *desc) +{ + sha3_224_init(SHA3_CTX(desc)); + return 0; +} + +static int crypto_sha3_256_init(struct shash_desc *desc) +{ + sha3_256_init(SHA3_CTX(desc)); + return 0; +} + +static int crypto_sha3_384_init(struct shash_desc *desc) +{ + sha3_384_init(SHA3_CTX(desc)); + return 0; +} + +static int crypto_sha3_512_init(struct shash_desc *desc) +{ + sha3_512_init(SHA3_CTX(desc)); + return 0; +} + +static int crypto_sha3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + sha3_update(SHA3_CTX(desc), data, len); + return 0; +} + +static int crypto_sha3_final(struct shash_desc *desc, u8 *out) +{ + sha3_final(SHA3_CTX(desc), out); + return 0; +} + +static int crypto_sha3_224_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha3_224(data, len, out); + return 0; +} + +static int crypto_sha3_256_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha3_256(data, len, out); + return 0; +} + +static int crypto_sha3_384_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha3_384(data, len, out); + return 0; +} + +static int crypto_sha3_512_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha3_512(data, len, out); + return 0; +} + +static int crypto_sha3_export_core(struct shash_desc *desc, void *out) +{ + memcpy(out, SHA3_CTX(desc), sizeof(struct sha3_ctx)); + return 0; +} + +static int crypto_sha3_import_core(struct shash_desc *desc, const void *in) +{ + memcpy(SHA3_CTX(desc), in, sizeof(struct sha3_ctx)); + return 0; +} + +static struct shash_alg algs[] = { { + .digestsize = SHA3_224_DIGEST_SIZE, + .init = crypto_sha3_224_init, + .update = crypto_sha3_update, + .final = crypto_sha3_final, + .digest = crypto_sha3_224_digest, + .export_core = crypto_sha3_export_core, + .import_core 
= crypto_sha3_import_core, + .descsize = sizeof(struct sha3_ctx), + .base.cra_name = "sha3-224", + .base.cra_driver_name = "sha3-224-lib", + .base.cra_blocksize = SHA3_224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_256_DIGEST_SIZE, + .init = crypto_sha3_256_init, + .update = crypto_sha3_update, + .final = crypto_sha3_final, + .digest = crypto_sha3_256_digest, + .export_core = crypto_sha3_export_core, + .import_core = crypto_sha3_import_core, + .descsize = sizeof(struct sha3_ctx), + .base.cra_name = "sha3-256", + .base.cra_driver_name = "sha3-256-lib", + .base.cra_blocksize = SHA3_256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_384_DIGEST_SIZE, + .init = crypto_sha3_384_init, + .update = crypto_sha3_update, + .final = crypto_sha3_final, + .digest = crypto_sha3_384_digest, + .export_core = crypto_sha3_export_core, + .import_core = crypto_sha3_import_core, + .descsize = sizeof(struct sha3_ctx), + .base.cra_name = "sha3-384", + .base.cra_driver_name = "sha3-384-lib", + .base.cra_blocksize = SHA3_384_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_512_DIGEST_SIZE, + .init = crypto_sha3_512_init, + .update = crypto_sha3_update, + .final = crypto_sha3_final, + .digest = crypto_sha3_512_digest, + .export_core = crypto_sha3_export_core, + .import_core = crypto_sha3_import_core, + .descsize = sizeof(struct sha3_ctx), + .base.cra_name = "sha3-512", + .base.cra_driver_name = "sha3-512-lib", + .base.cra_blocksize = SHA3_512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init crypto_sha3_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} +module_init(crypto_sha3_mod_init); + +static void __exit crypto_sha3_mod_exit(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} +module_exit(crypto_sha3_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Crypto API support for SHA-3"); + +MODULE_ALIAS_CRYPTO("sha3-224"); +MODULE_ALIAS_CRYPTO("sha3-224-lib"); +MODULE_ALIAS_CRYPTO("sha3-256"); +MODULE_ALIAS_CRYPTO("sha3-256-lib"); +MODULE_ALIAS_CRYPTO("sha3-384"); +MODULE_ALIAS_CRYPTO("sha3-384-lib"); +MODULE_ALIAS_CRYPTO("sha3-512"); +MODULE_ALIAS_CRYPTO("sha3-512-lib"); diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c deleted file mode 100644 index 41d1e506e6de..000000000000 --- a/crypto/sha3_generic.c +++ /dev/null @@ -1,290 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * SHA-3, as specified in - * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf - * - * SHA-3 code by Jeff Garzik <jeff@garzik.org> - * Ard Biesheuvel <ard.biesheuvel@linaro.org> - */ -#include <crypto/internal/hash.h> -#include <crypto/sha3.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/unaligned.h> - -/* - * On some 32-bit architectures (h8300), GCC ends up using - * over 1 KB of stack if we inline the round calculation into the loop - * in keccakf(). On the other hand, on 64-bit architectures with plenty - * of [64-bit wide] general purpose registers, not inlining it severely - * hurts performance. So let's use 64-bitness as a heuristic to decide - * whether to inline or not. 
- */ -#ifdef CONFIG_64BIT -#define SHA3_INLINE inline -#else -#define SHA3_INLINE noinline -#endif - -#define KECCAK_ROUNDS 24 - -static const u64 keccakf_rndc[24] = { - 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, - 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, - 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL, - 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL, - 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, - 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, - 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL, - 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL -}; - -/* update the state with given number of rounds */ - -static SHA3_INLINE void keccakf_round(u64 st[25]) -{ - u64 t[5], tt, bc[5]; - - /* Theta */ - bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; - bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; - bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; - bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; - bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; - - t[0] = bc[4] ^ rol64(bc[1], 1); - t[1] = bc[0] ^ rol64(bc[2], 1); - t[2] = bc[1] ^ rol64(bc[3], 1); - t[3] = bc[2] ^ rol64(bc[4], 1); - t[4] = bc[3] ^ rol64(bc[0], 1); - - st[0] ^= t[0]; - - /* Rho Pi */ - tt = st[1]; - st[ 1] = rol64(st[ 6] ^ t[1], 44); - st[ 6] = rol64(st[ 9] ^ t[4], 20); - st[ 9] = rol64(st[22] ^ t[2], 61); - st[22] = rol64(st[14] ^ t[4], 39); - st[14] = rol64(st[20] ^ t[0], 18); - st[20] = rol64(st[ 2] ^ t[2], 62); - st[ 2] = rol64(st[12] ^ t[2], 43); - st[12] = rol64(st[13] ^ t[3], 25); - st[13] = rol64(st[19] ^ t[4], 8); - st[19] = rol64(st[23] ^ t[3], 56); - st[23] = rol64(st[15] ^ t[0], 41); - st[15] = rol64(st[ 4] ^ t[4], 27); - st[ 4] = rol64(st[24] ^ t[4], 14); - st[24] = rol64(st[21] ^ t[1], 2); - st[21] = rol64(st[ 8] ^ t[3], 55); - st[ 8] = rol64(st[16] ^ t[1], 45); - st[16] = rol64(st[ 5] ^ t[0], 36); - st[ 5] = rol64(st[ 3] ^ t[3], 28); - st[ 3] = rol64(st[18] ^ t[3], 21); - st[18] = rol64(st[17] ^ t[2], 15); - st[17] = rol64(st[11] ^ t[1], 10); - st[11] = rol64(st[ 7] ^ t[2], 6); - st[ 7] = rol64(st[10] ^ t[0], 3); - st[10] = rol64( tt ^ t[1], 1); - - /* Chi */ - bc[ 0] = ~st[ 1] & st[ 2]; - bc[ 1] = ~st[ 2] & st[ 3]; - bc[ 2] = ~st[ 3] & st[ 4]; - bc[ 3] = ~st[ 4] & st[ 0]; - bc[ 4] = ~st[ 0] & st[ 1]; - st[ 0] ^= bc[ 0]; - st[ 1] ^= bc[ 1]; - st[ 2] ^= bc[ 2]; - st[ 3] ^= bc[ 3]; - st[ 4] ^= bc[ 4]; - - bc[ 0] = ~st[ 6] & st[ 7]; - bc[ 1] = ~st[ 7] & st[ 8]; - bc[ 2] = ~st[ 8] & st[ 9]; - bc[ 3] = ~st[ 9] & st[ 5]; - bc[ 4] = ~st[ 5] & st[ 6]; - st[ 5] ^= bc[ 0]; - st[ 6] ^= bc[ 1]; - st[ 7] ^= bc[ 2]; - st[ 8] ^= bc[ 3]; - st[ 9] ^= bc[ 4]; - - bc[ 0] = ~st[11] & st[12]; - bc[ 1] = ~st[12] & st[13]; - bc[ 2] = ~st[13] & st[14]; - bc[ 3] = ~st[14] & st[10]; - bc[ 4] = ~st[10] & st[11]; - st[10] ^= bc[ 0]; - st[11] ^= bc[ 1]; - st[12] ^= bc[ 2]; - st[13] ^= bc[ 3]; - st[14] ^= bc[ 4]; - - bc[ 0] = ~st[16] & st[17]; - bc[ 1] = ~st[17] & st[18]; - bc[ 2] = ~st[18] & st[19]; - bc[ 3] = ~st[19] & st[15]; - bc[ 4] = ~st[15] & st[16]; - st[15] ^= bc[ 0]; - st[16] ^= bc[ 1]; - st[17] ^= bc[ 2]; - st[18] ^= bc[ 3]; - st[19] ^= bc[ 4]; - - bc[ 0] = ~st[21] & st[22]; - bc[ 1] = ~st[22] & st[23]; - bc[ 2] = ~st[23] & st[24]; - bc[ 3] = ~st[24] & st[20]; - bc[ 4] = ~st[20] & st[21]; - st[20] ^= bc[ 0]; - st[21] ^= bc[ 1]; - st[22] ^= bc[ 2]; - st[23] ^= bc[ 3]; - st[24] ^= bc[ 4]; -} - -static void keccakf(u64 st[25]) -{ - int round; - - 
for (round = 0; round < KECCAK_ROUNDS; round++) { - keccakf_round(st); - /* Iota */ - st[0] ^= keccakf_rndc[round]; - } -} - -int crypto_sha3_init(struct shash_desc *desc) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - - memset(sctx->st, 0, sizeof(sctx->st)); - return 0; -} -EXPORT_SYMBOL(crypto_sha3_init); - -static int crypto_sha3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - unsigned int rsiz = crypto_shash_blocksize(desc->tfm); - struct sha3_state *sctx = shash_desc_ctx(desc); - unsigned int rsizw = rsiz / 8; - - do { - int i; - - for (i = 0; i < rsizw; i++) - sctx->st[i] ^= get_unaligned_le64(data + 8 * i); - keccakf(sctx->st); - - data += rsiz; - len -= rsiz; - } while (len >= rsiz); - return len; -} - -static int crypto_sha3_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *out) -{ - unsigned int digest_size = crypto_shash_digestsize(desc->tfm); - unsigned int rsiz = crypto_shash_blocksize(desc->tfm); - struct sha3_state *sctx = shash_desc_ctx(desc); - __le64 block[SHA3_224_BLOCK_SIZE / 8] = {}; - __le64 *digest = (__le64 *)out; - unsigned int rsizw = rsiz / 8; - u8 *p; - int i; - - p = memcpy(block, src, len); - p[len++] = 0x06; - p[rsiz - 1] |= 0x80; - - for (i = 0; i < rsizw; i++) - sctx->st[i] ^= le64_to_cpu(block[i]); - memzero_explicit(block, sizeof(block)); - - keccakf(sctx->st); - - for (i = 0; i < digest_size / 8; i++) - put_unaligned_le64(sctx->st[i], digest++); - - if (digest_size & 4) - put_unaligned_le32(sctx->st[i], (__le32 *)digest); - - return 0; -} - -static struct shash_alg algs[] = { { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = crypto_sha3_update, - .finup = crypto_sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-224", - .base.cra_driver_name = "sha3-224-generic", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA3_256_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = crypto_sha3_update, - .finup = crypto_sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-256", - .base.cra_driver_name = "sha3-256-generic", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = crypto_sha3_update, - .finup = crypto_sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-384", - .base.cra_driver_name = "sha3-384-generic", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_384_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = crypto_sha3_update, - .finup = crypto_sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-512", - .base.cra_driver_name = "sha3-512-generic", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int __init sha3_generic_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha3_generic_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_init(sha3_generic_mod_init); -module_exit(sha3_generic_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-3 Secure Hash Algorithm"); - -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_ALIAS_CRYPTO("sha3-224-generic"); 
-MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-256-generic"); -MODULE_ALIAS_CRYPTO("sha3-384"); -MODULE_ALIAS_CRYPTO("sha3-384-generic"); -MODULE_ALIAS_CRYPTO("sha3-512"); -MODULE_ALIAS_CRYPTO("sha3-512-generic"); diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index d1d88debbd71..32d9eaf2c8af 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1690,10 +1690,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret = min(ret, tcrypt_test("ccm(sm4)")); break; - case 57: - ret = min(ret, tcrypt_test("polyval")); - break; - case 58: ret = min(ret, tcrypt_test("gcm(aria)")); break; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 6a490aaa71b9..6fb53978df11 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4332,6 +4332,7 @@ static const struct alg_test_desc alg_test_descs[] = { .fips_allowed = 1, }, { .alg = "blake2b-160", + .generic_driver = "blake2b-160-lib", .test = alg_test_hash, .fips_allowed = 0, .suite = { @@ -4339,6 +4340,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "blake2b-256", + .generic_driver = "blake2b-256-lib", .test = alg_test_hash, .fips_allowed = 0, .suite = { @@ -4346,6 +4348,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "blake2b-384", + .generic_driver = "blake2b-384-lib", .test = alg_test_hash, .fips_allowed = 0, .suite = { @@ -4353,6 +4356,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "blake2b-512", + .generic_driver = "blake2b-512-lib", .test = alg_test_hash, .fips_allowed = 0, .suite = { @@ -5055,8 +5059,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hctr2(aes)", - .generic_driver = - "hctr2_base(xctr(aes-generic),polyval-generic)", + .generic_driver = "hctr2_base(xctr(aes-generic),polyval-lib)", .test = alg_test_skcipher, .suite = { .cipher = __VECS(aes_hctr2_tv_template) @@ -5100,6 +5103,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha3-224)", + .generic_driver = "hmac(sha3-224-lib)", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5107,6 +5111,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha3-256)", + .generic_driver = "hmac(sha3-256-lib)", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5114,6 +5119,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha3-384)", + .generic_driver = "hmac(sha3-384-lib)", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5121,6 +5127,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha3-512)", + .generic_driver = "hmac(sha3-512-lib)", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5364,12 +5371,6 @@ static const struct alg_test_desc alg_test_descs[] = { .test = alg_test_null, .fips_allowed = 1, }, { - .alg = "polyval", - .test = alg_test_hash, - .suite = { - .hash = __VECS(polyval_tv_template) - } - }, { .alg = "rfc3686(ctr(aes))", .test = alg_test_skcipher, .fips_allowed = 1, @@ -5474,6 +5475,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha3-224", + .generic_driver = "sha3-224-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5481,6 +5483,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha3-256", + .generic_driver = "sha3-256-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5488,6 +5491,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha3-384", + .generic_driver = "sha3-384-lib", .test = 
alg_test_hash, .fips_allowed = 1, .suite = { @@ -5495,6 +5499,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha3-512", + .generic_driver = "sha3-512-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 268231227282..a3e4695945ca 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -36237,177 +36237,6 @@ static const struct cipher_testvec aes_xctr_tv_template[] = { /* * Test vectors generated using https://github.com/google/hctr2 - * - * To ensure compatibility with RFC 8452, some tests were sourced from - * https://datatracker.ietf.org/doc/html/rfc8452 - */ -static const struct hash_testvec polyval_tv_template[] = { - { // From RFC 8452 - .key = "\x31\x07\x28\xd9\x91\x1f\x1f\x38" - "\x37\xb2\x43\x16\xc3\xfa\xb9\xa0", - .plaintext = "\x65\x78\x61\x6d\x70\x6c\x65\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x48\x65\x6c\x6c\x6f\x20\x77\x6f" - "\x72\x6c\x64\x00\x00\x00\x00\x00" - "\x38\x00\x00\x00\x00\x00\x00\x00" - "\x58\x00\x00\x00\x00\x00\x00\x00", - .digest = "\xad\x7f\xcf\x0b\x51\x69\x85\x16" - "\x62\x67\x2f\x3c\x5f\x95\x13\x8f", - .psize = 48, - .ksize = 16, - }, - { // From RFC 8452 - .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" - "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", - .plaintext = "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00", - .digest = "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00", - .psize = 16, - .ksize = 16, - }, - { // From RFC 8452 - .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" - "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", - .plaintext = "\x01\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x40\x00\x00\x00\x00\x00\x00\x00", - .digest = "\xeb\x93\xb7\x74\x09\x62\xc5\xe4" - "\x9d\x2a\x90\xa7\xdc\x5c\xec\x74", - .psize = 32, - .ksize = 16, - }, - { // From RFC 8452 - .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" - "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", - .plaintext = "\x01\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x02\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x03\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x80\x01\x00\x00\x00\x00\x00\x00", - .digest = "\x81\x38\x87\x46\xbc\x22\xd2\x6b" - "\x2a\xbc\x3d\xcb\x15\x75\x42\x22", - .psize = 64, - .ksize = 16, - }, - { // From RFC 8452 - .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" - "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", - .plaintext = "\x01\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x02\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x03\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x04\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x02\x00\x00\x00\x00\x00\x00", - .digest = "\x1e\x39\xb6\xd3\x34\x4d\x34\x8f" - "\x60\x44\xf8\x99\x35\xd1\xcf\x78", - .psize = 80, - .ksize = 16, - }, - { // From RFC 8452 - .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" - "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", - .plaintext = "\x01\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x02\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x03\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x04\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x05\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x08\x00\x00\x00\x00\x00\x00\x00" - 
"\x00\x02\x00\x00\x00\x00\x00\x00", - .digest = "\xff\xcd\x05\xd5\x77\x0f\x34\xad" - "\x92\x67\xf0\xa5\x99\x94\xb1\x5a", - .psize = 96, - .ksize = 16, - }, - { // Random ( 1) - .key = "\x90\xcc\xac\xee\xba\xd7\xd4\x68" - "\x98\xa6\x79\x70\xdf\x66\x15\x6c", - .plaintext = "", - .digest = "\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00", - .psize = 0, - .ksize = 16, - }, - { // Random ( 1) - .key = "\xc1\x45\x71\xf0\x30\x07\x94\xe7" - "\x3a\xdd\xe4\xc6\x19\x2d\x02\xa2", - .plaintext = "\xc1\x5d\x47\xc7\x4c\x7c\x5e\x07" - "\x85\x14\x8f\x79\xcc\x73\x83\xf7" - "\x35\xb8\xcb\x73\x61\xf0\x53\x31" - "\xbf\x84\xde\xb6\xde\xaf\xb0\xb8" - "\xb7\xd9\x11\x91\x89\xfd\x1e\x4c" - "\x84\x4a\x1f\x2a\x87\xa4\xaf\x62" - "\x8d\x7d\x58\xf6\x43\x35\xfc\x53" - "\x8f\x1a\xf6\x12\xe1\x13\x3f\x66" - "\x91\x4b\x13\xd6\x45\xfb\xb0\x7a" - "\xe0\x8b\x8e\x99\xf7\x86\x46\x37" - "\xd1\x22\x9e\x52\xf3\x3f\xd9\x75" - "\x2c\x2c\xc6\xbb\x0e\x08\x14\x29" - "\xe8\x50\x2f\xd8\xbe\xf4\xe9\x69" - "\x4a\xee\xf7\xae\x15\x65\x35\x1e", - .digest = "\x00\x4f\x5d\xe9\x3b\xc0\xd6\x50" - "\x3e\x38\x73\x86\xc6\xda\xca\x7f", - .psize = 112, - .ksize = 16, - }, - { // Random ( 1) - .key = "\x37\xbe\x68\x16\x50\xb9\x4e\xb0" - "\x47\xde\xe2\xbd\xde\xe4\x48\x09", - .plaintext = "\x87\xfc\x68\x9f\xff\xf2\x4a\x1e" - "\x82\x3b\x73\x8f\xc1\xb2\x1b\x7a" - "\x6c\x4f\x81\xbc\x88\x9b\x6c\xa3" - "\x9c\xc2\xa5\xbc\x14\x70\x4c\x9b" - "\x0c\x9f\x59\x92\x16\x4b\x91\x3d" - "\x18\x55\x22\x68\x12\x8c\x63\xb2" - "\x51\xcb\x85\x4b\xd2\xae\x0b\x1c" - "\x5d\x28\x9d\x1d\xb1\xc8\xf0\x77" - "\xe9\xb5\x07\x4e\x06\xc8\xee\xf8" - "\x1b\xed\x72\x2a\x55\x7d\x16\xc9" - "\xf2\x54\xe7\xe9\xe0\x44\x5b\x33" - "\xb1\x49\xee\xff\x43\xfb\x82\xcd" - "\x4a\x70\x78\x81\xa4\x34\x36\xe8" - "\x4c\x28\x54\xa6\x6c\xc3\x6b\x78" - "\xe7\xc0\x5d\xc6\x5d\x81\xab\x70" - "\x08\x86\xa1\xfd\xf4\x77\x55\xfd" - "\xa3\xe9\xe2\x1b\xdf\x99\xb7\x80" - "\xf9\x0a\x4f\x72\x4a\xd3\xaf\xbb" - "\xb3\x3b\xeb\x08\x58\x0f\x79\xce" - "\xa5\x99\x05\x12\x34\xd4\xf4\x86" - "\x37\x23\x1d\xc8\x49\xc0\x92\xae" - "\xa6\xac\x9b\x31\x55\xed\x15\xc6" - "\x05\x17\x37\x8d\x90\x42\xe4\x87" - "\x89\x62\x88\x69\x1c\x6a\xfd\xe3" - "\x00\x2b\x47\x1a\x73\xc1\x51\xc2" - "\xc0\x62\x74\x6a\x9e\xb2\xe5\x21" - "\xbe\x90\xb5\xb0\x50\xca\x88\x68" - "\xe1\x9d\x7a\xdf\x6c\xb7\xb9\x98" - "\xee\x28\x62\x61\x8b\xd1\x47\xf9" - "\x04\x7a\x0b\x5d\xcd\x2b\x65\xf5" - "\x12\xa3\xfe\x1a\xaa\x2c\x78\x42" - "\xb8\xbe\x7d\x74\xeb\x59\xba\xba", - .digest = "\xae\x11\xd4\x60\x2a\x5f\x9e\x42" - "\x89\x04\xc2\x34\x8d\x55\x94\x0a", - .psize = 256, - .ksize = 16, - }, - -}; - -/* - * Test vectors generated using https://github.com/google/hctr2 */ static const struct cipher_testvec aes_hctr2_tv_template[] = { { diff --git a/drivers/Kconfig b/drivers/Kconfig index 4915a63866b0..3054b50a2f4c 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -251,4 +251,6 @@ source "drivers/hte/Kconfig" source "drivers/cdx/Kconfig" +source "drivers/resctrl/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 8e1ffa4358d5..20eb17596b89 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -194,6 +194,7 @@ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ obj-$(CONFIG_DPLL) += dpll/ +obj-y += resctrl/ obj-$(CONFIG_DIBS) += dibs/ obj-$(CONFIG_S390) += s390/ diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c index 33418dd6768a..6d870d97ada6 100644 --- a/drivers/acpi/acpi_tad.c +++ b/drivers/acpi/acpi_tad.c @@ -90,19 +90,18 @@ static int acpi_tad_set_real_time(struct device *dev, 
struct acpi_tad_rt *rt) args[0].buffer.pointer = (u8 *)rt; args[0].buffer.length = sizeof(*rt); - pm_runtime_get_sync(dev); + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) + return -ENXIO; status = acpi_evaluate_integer(handle, "_SRT", &arg_list, &retval); - - pm_runtime_put_sync(dev); - if (ACPI_FAILURE(status) || retval) return -EIO; return 0; } -static int acpi_tad_get_real_time(struct device *dev, struct acpi_tad_rt *rt) +static int acpi_tad_evaluate_grt(struct device *dev, struct acpi_tad_rt *rt) { acpi_handle handle = ACPI_HANDLE(dev); struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER }; @@ -111,12 +110,7 @@ static int acpi_tad_get_real_time(struct device *dev, struct acpi_tad_rt *rt) acpi_status status; int ret = -EIO; - pm_runtime_get_sync(dev); - status = acpi_evaluate_object(handle, "_GRT", NULL, &output); - - pm_runtime_put_sync(dev); - if (ACPI_FAILURE(status)) goto out_free; @@ -139,6 +133,21 @@ out_free: return ret; } +static int acpi_tad_get_real_time(struct device *dev, struct acpi_tad_rt *rt) +{ + int ret; + + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) + return -ENXIO; + + ret = acpi_tad_evaluate_grt(dev, rt); + if (ret) + return ret; + + return 0; +} + static char *acpi_tad_rt_next_field(char *s, int *val) { char *p; @@ -266,12 +275,11 @@ static int acpi_tad_wake_set(struct device *dev, char *method, u32 timer_id, args[0].integer.value = timer_id; args[1].integer.value = value; - pm_runtime_get_sync(dev); + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) + return -ENXIO; status = acpi_evaluate_integer(handle, method, &arg_list, &retval); - - pm_runtime_put_sync(dev); - if (ACPI_FAILURE(status) || retval) return -EIO; @@ -314,12 +322,11 @@ static ssize_t acpi_tad_wake_read(struct device *dev, char *buf, char *method, args[0].integer.value = timer_id; - pm_runtime_get_sync(dev); + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) + return -ENXIO; status = acpi_evaluate_integer(handle, method, &arg_list, &retval); - - pm_runtime_put_sync(dev); - if (ACPI_FAILURE(status)) return -EIO; @@ -370,12 +377,11 @@ static int acpi_tad_clear_status(struct device *dev, u32 timer_id) args[0].integer.value = timer_id; - pm_runtime_get_sync(dev); + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) + return -ENXIO; status = acpi_evaluate_integer(handle, "_CWS", &arg_list, &retval); - - pm_runtime_put_sync(dev); - if (ACPI_FAILURE(status) || retval) return -EIO; @@ -411,12 +417,11 @@ static ssize_t acpi_tad_status_read(struct device *dev, char *buf, u32 timer_id) args[0].integer.value = timer_id; - pm_runtime_get_sync(dev); + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) + return -ENXIO; status = acpi_evaluate_integer(handle, "_GWS", &arg_list, &retval); - - pm_runtime_put_sync(dev); - if (ACPI_FAILURE(status)) return -EIO; @@ -563,8 +568,6 @@ static void acpi_tad_remove(struct platform_device *pdev) device_init_wakeup(dev, false); - pm_runtime_get_sync(dev); - if (dd->capabilities & ACPI_TAD_RT) sysfs_remove_group(&dev->kobj, &acpi_tad_time_attr_group); @@ -573,14 +576,16 @@ static void acpi_tad_remove(struct platform_device *pdev) sysfs_remove_group(&dev->kobj, &acpi_tad_attr_group); - acpi_tad_disable_timer(dev, ACPI_TAD_AC_TIMER); - acpi_tad_clear_status(dev, ACPI_TAD_AC_TIMER); - if (dd->capabilities & ACPI_TAD_DC_WAKE) { - acpi_tad_disable_timer(dev, ACPI_TAD_DC_TIMER); - acpi_tad_clear_status(dev, ACPI_TAD_DC_TIMER); + scoped_guard(pm_runtime_noresume, dev) { + 
acpi_tad_disable_timer(dev, ACPI_TAD_AC_TIMER); + acpi_tad_clear_status(dev, ACPI_TAD_AC_TIMER); + if (dd->capabilities & ACPI_TAD_DC_WAKE) { + acpi_tad_disable_timer(dev, ACPI_TAD_DC_TIMER); + acpi_tad_clear_status(dev, ACPI_TAD_DC_TIMER); + } } - pm_runtime_put_sync(dev); + pm_runtime_suspend(dev); pm_runtime_disable(dev); acpi_remove_cmos_rtc_space_handler(handle); } diff --git a/drivers/acpi/acpica/nswalk.c b/drivers/acpi/acpica/nswalk.c index a2ac06a26e92..5670ff5a43cd 100644 --- a/drivers/acpi/acpica/nswalk.c +++ b/drivers/acpi/acpica/nswalk.c @@ -169,9 +169,12 @@ acpi_ns_walk_namespace(acpi_object_type type, if (start_node == ACPI_ROOT_OBJECT) { start_node = acpi_gbl_root_node; - if (!start_node) { - return_ACPI_STATUS(AE_NO_NAMESPACE); - } + } + + /* Avoid walking the namespace if the StartNode is NULL */ + + if (!start_node) { + return_ACPI_STATUS(AE_NO_NAMESPACE); } /* Null child means "get first node" */ diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig index b3ed6212244c..f2fd79f22e7d 100644 --- a/drivers/acpi/arm64/Kconfig +++ b/drivers/acpi/arm64/Kconfig @@ -21,3 +21,6 @@ config ACPI_AGDI config ACPI_APMT bool + +config ACPI_MPAM + bool diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile index 05ecde9eaabe..9390b57cb564 100644 --- a/drivers/acpi/arm64/Makefile +++ b/drivers/acpi/arm64/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI_APMT) += apmt.o obj-$(CONFIG_ACPI_FFH) += ffh.o obj-$(CONFIG_ACPI_GTDT) += gtdt.o obj-$(CONFIG_ACPI_IORT) += iort.o +obj-$(CONFIG_ACPI_MPAM) += mpam.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ARM_AMBA) += amba.o obj-y += dma.o init.o diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 8cc8af8fd408..ffc867bac2d6 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -303,40 +303,6 @@ error: return -EINVAL; } -/** - * acpi_arch_timer_mem_init() - Get the info of all GT blocks in GTDT table. - * @timer_mem: The pointer to the array of struct arch_timer_mem for returning - * the result of parsing. The element number of this array should - * be platform_timer_count(the total number of platform timers). - * @timer_count: It points to a integer variable which is used for storing the - * number of GT blocks we have parsed. - * - * Return: 0 if success, -EINVAL/-ENODEV if error. - */ -int __init acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, - int *timer_count) -{ - int ret; - void *platform_timer; - - *timer_count = 0; - for_each_platform_timer(platform_timer) { - if (is_timer_block(platform_timer)) { - ret = gtdt_parse_timer_block(platform_timer, timer_mem); - if (ret) - return ret; - timer_mem++; - (*timer_count)++; - } - } - - if (*timer_count) - pr_info("found %d memory-mapped timer block(s).\n", - *timer_count); - - return 0; -} - /* * Initialize a SBSA generic Watchdog platform device info from GTDT */ diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c new file mode 100644 index 000000000000..84963a20c3e7 --- /dev/null +++ b/drivers/acpi/arm64/mpam.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */ + +#define pr_fmt(fmt) "ACPI MPAM: " fmt + +#include <linux/acpi.h> +#include <linux/arm_mpam.h> +#include <linux/bits.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/platform_device.h> + +#include <acpi/processor.h> + +/* + * Flags for acpi_table_mpam_msc.*_interrupt_flags. 
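+ * Bit 0 selects the trigger mode, bits 2:1 the interrupt type (only the
+ * wired type is handled below), bit 3 the affinity object type, and bit 4
+ * whether the affinity field is valid.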
+ * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet. + */ +#define ACPI_MPAM_MSC_IRQ_MODE BIT(0) +#define ACPI_MPAM_MSC_IRQ_TYPE_MASK GENMASK(2, 1) +#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED 0 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK BIT(3) +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR 0 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER 1 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID BIT(4) + +/* + * Encodings for the MSC node body interface type field. + * See 2.1 MPAM MSC node, Table 4 of DEN0065B_MPAM_ACPI_3.0-bet. + */ +#define ACPI_MPAM_MSC_IFACE_MMIO 0x00 +#define ACPI_MPAM_MSC_IFACE_PCC 0x0a + +static bool _is_ppi_partition(u32 flags) +{ + u32 aff_type, is_ppi; + bool ret; + + is_ppi = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_VALID, flags); + if (!is_ppi) + return false; + + aff_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK, flags); + ret = (aff_type == ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER); + if (ret) + pr_err_once("Partitioned interrupts not supported\n"); + + return ret; +} + +static int acpi_mpam_register_irq(struct platform_device *pdev, + u32 intid, u32 flags) +{ + int irq; + u32 int_type; + int trigger; + + if (!intid) + return -EINVAL; + + if (_is_ppi_partition(flags)) + return -EINVAL; + + trigger = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE, flags); + int_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags); + if (int_type != ACPI_MPAM_MSC_IRQ_TYPE_WIRED) + return -EINVAL; + + irq = acpi_register_gsi(&pdev->dev, intid, trigger, ACPI_ACTIVE_HIGH); + if (irq < 0) + pr_err_once("Failed to register interrupt 0x%x with ACPI\n", intid); + + return irq; +} + +static void acpi_mpam_parse_irqs(struct platform_device *pdev, + struct acpi_mpam_msc_node *tbl_msc, + struct resource *res, int *res_idx) +{ + u32 flags, intid; + int irq; + + intid = tbl_msc->overflow_interrupt; + flags = tbl_msc->overflow_interrupt_flags; + irq = acpi_mpam_register_irq(pdev, intid, flags); + if (irq > 0) + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "overflow"); + + intid = tbl_msc->error_interrupt; + flags = tbl_msc->error_interrupt_flags; + irq = acpi_mpam_register_irq(pdev, intid, flags); + if (irq > 0) + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); +} + +static int acpi_mpam_parse_resource(struct mpam_msc *msc, + struct acpi_mpam_resource_node *res) +{ + int level, nid; + u32 cache_id; + + switch (res->locator_type) { + case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: + cache_id = res->locator.cache_locator.cache_reference; + level = find_acpi_cache_level_from_id(cache_id); + if (level <= 0) { + pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id); + return -EINVAL; + } + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, + level, cache_id); + case ACPI_MPAM_LOCATION_TYPE_MEMORY: + nid = pxm_to_node(res->locator.memory_locator.proximity_domain); + if (nid == NUMA_NO_NODE) { + pr_debug("Bad proximity domain %lld, using node 0 instead\n", + res->locator.memory_locator.proximity_domain); + nid = 0; + } + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, + MPAM_CLASS_ID_DEFAULT, nid); + default: + /* These get discovered later and are treated as unknown */ + return 0; + } +} + +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + int i, err; + char *ptr, *table_end; + struct acpi_mpam_resource_node *resource; + + table_end = (char *)tbl_msc + tbl_msc->length; + ptr = (char *)(tbl_msc + 1); + for (i = 0; i < tbl_msc->num_resource_nodes; i++) { + u64 max_deps, remaining_table; 
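+
+		/*
+		 * Bound each node by the space left in the table: the header
+		 * must fit, and the advertised functional-dependency count
+		 * must fit in the remaining bytes, so that the pointer
+		 * advance at the end of the loop cannot run past table_end.
+		 */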
+ + if (ptr + sizeof(*resource) > table_end) + return -EINVAL; + + resource = (struct acpi_mpam_resource_node *)ptr; + + remaining_table = table_end - ptr; + max_deps = remaining_table / sizeof(struct acpi_mpam_func_deps); + if (resource->num_functional_deps > max_deps) { + pr_debug("MSC has impossible number of functional dependencies\n"); + return -EINVAL; + } + + err = acpi_mpam_parse_resource(msc, resource); + if (err) + return err; + + ptr += sizeof(*resource); + ptr += resource->num_functional_deps * sizeof(struct acpi_mpam_func_deps); + } + + return 0; +} + +/* + * Creates the device power management link and returns true if the + * acpi id is valid and usable for cpu affinity. This is the case + * when the linked device is a processor or a processor container. + */ +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, + struct platform_device *pdev, + u32 *acpi_id) +{ + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1] = { 0 }; + bool acpi_id_valid = false; + struct acpi_device *buddy; + char uid[11]; + int len; + + memcpy(hid, &tbl_msc->hardware_id_linked_device, + sizeof(tbl_msc->hardware_id_linked_device)); + + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { + *acpi_id = tbl_msc->instance_id_linked_device; + acpi_id_valid = true; + } + + len = snprintf(uid, sizeof(uid), "%u", + tbl_msc->instance_id_linked_device); + if (len >= sizeof(uid)) { + pr_debug("Failed to convert uid of device for power management."); + return acpi_id_valid; + } + + buddy = acpi_dev_get_first_match_dev(hid, uid, -1); + if (buddy) { + device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS); + acpi_dev_put(buddy); + } + + return acpi_id_valid; +} + +static int decode_interface_type(struct acpi_mpam_msc_node *tbl_msc, + enum mpam_msc_iface *iface) +{ + switch (tbl_msc->interface_type) { + case ACPI_MPAM_MSC_IFACE_MMIO: + *iface = MPAM_IFACE_MMIO; + return 0; + case ACPI_MPAM_MSC_IFACE_PCC: + *iface = MPAM_IFACE_PCC; + return 0; + default: + return -EINVAL; + } +} + +static struct platform_device * __init acpi_mpam_parse_msc(struct acpi_mpam_msc_node *tbl_msc) +{ + struct platform_device *pdev __free(platform_device_put) = + platform_device_alloc("mpam_msc", tbl_msc->identifier); + int next_res = 0, next_prop = 0, err; + /* pcc, nrdy, affinity and a sentinel */ + struct property_entry props[4] = { 0 }; + /* mmio, 2xirq, no sentinel. 
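+	 * Unlike the property array above, the resource array is passed to
+	 * platform_device_add_resources() with an explicit count, so it
+	 * needs no terminating entry.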
*/ + struct resource res[3] = { 0 }; + struct acpi_device *companion; + enum mpam_msc_iface iface; + char uid[16]; + u32 acpi_id; + + if (!pdev) + return ERR_PTR(-ENOMEM); + + /* Some power management is described in the namespace: */ + err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier); + if (err > 0 && err < sizeof(uid)) { + companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1); + if (companion) { + ACPI_COMPANION_SET(&pdev->dev, companion); + acpi_dev_put(companion); + } else { + pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier); + } + } + + if (decode_interface_type(tbl_msc, &iface)) { + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); + return ERR_PTR(-EINVAL); + } + + if (iface == MPAM_IFACE_MMIO) { + res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address, + tbl_msc->mmio_size, + "MPAM:MSC"); + } else if (iface == MPAM_IFACE_PCC) { + props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel", + tbl_msc->base_address); + } + + acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res); + + WARN_ON_ONCE(next_res > ARRAY_SIZE(res)); + err = platform_device_add_resources(pdev, res, next_res); + if (err) + return ERR_PTR(err); + + props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us", + tbl_msc->max_nrdy_usec); + + /* + * The MSC's CPU affinity is described via its linked power + * management device, but only if it points at a Processor or + * Processor Container. + */ + if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) + props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", acpi_id); + + WARN_ON_ONCE(next_prop > ARRAY_SIZE(props) - 1); + err = device_create_managed_software_node(&pdev->dev, props, NULL); + if (err) + return ERR_PTR(err); + + /* + * Stash the table entry for acpi_mpam_parse_resources() to discover + * what this MSC controls. + */ + err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length); + if (err) + return ERR_PTR(err); + + err = platform_device_add(pdev); + if (err) + return ERR_PTR(err); + + return_ptr(pdev); +} + +static int __init acpi_mpam_parse(void) +{ + char *table_end, *table_offset; + struct acpi_mpam_msc_node *tbl_msc; + struct platform_device *pdev; + + if (acpi_disabled || !system_supports_mpam()) + return 0; + + struct acpi_table_header *table __free(acpi_put_table) = + acpi_get_table_pointer(ACPI_SIG_MPAM, 0); + + if (IS_ERR(table)) + return 0; + + if (table->revision < 1) { + pr_debug("MPAM ACPI table revision %d not supported\n", table->revision); + return 0; + } + + table_offset = (char *)(table + 1); + table_end = (char *)table + table->length; + + while (table_offset < table_end) { + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; + if (table_offset + sizeof(*tbl_msc) > table_end || + table_offset + tbl_msc->length > table_end) { + pr_err("MSC entry overlaps end of ACPI table\n"); + return -EINVAL; + } + table_offset += tbl_msc->length; + + /* + * If any of the reserved fields are set, make no attempt to + * parse the MSC structure. This MSC will still be counted by + * acpi_mpam_count_msc(), meaning the MPAM driver can't probe + * against all MSC, and will never be enabled. There is no way + * to enable it safely, because we cannot determine safe + * system-wide partid and pmg ranges in this situation. 
+		 */
+		if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) {
+			pr_err_once("Unrecognised MSC, MPAM not usable\n");
+			pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier);
+			continue;
+		}
+
+		if (!tbl_msc->mmio_size) {
+			pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier);
+			continue;
+		}
+
+		pdev = acpi_mpam_parse_msc(tbl_msc);
+		if (IS_ERR(pdev))
+			return PTR_ERR(pdev);
+	}
+
+	return 0;
+}
+
+/**
+ * acpi_mpam_count_msc() - Count the number of MSCs described by firmware.
+ *
+ * Returns the number of enabled MSCs, zero if the table is absent or
+ * unusable, or a negative error code if the table is malformed.
+ *
+ * This can be called before or in parallel with acpi_mpam_parse().
+ */
+int acpi_mpam_count_msc(void)
+{
+	char *table_end, *table_offset;
+	struct acpi_mpam_msc_node *tbl_msc;
+	int count = 0;
+
+	if (acpi_disabled || !system_supports_mpam())
+		return 0;
+
+	struct acpi_table_header *table __free(acpi_put_table) =
+		acpi_get_table_pointer(ACPI_SIG_MPAM, 0);
+
+	if (IS_ERR(table))
+		return 0;
+
+	if (table->revision < 1)
+		return 0;
+
+	table_offset = (char *)(table + 1);
+	table_end = (char *)table + table->length;
+
+	while (table_offset < table_end) {
+		tbl_msc = (struct acpi_mpam_msc_node *)table_offset;
+
+		if (table_offset + sizeof(*tbl_msc) > table_end)
+			return -EINVAL;
+		if (tbl_msc->length < sizeof(*tbl_msc))
+			return -EINVAL;
+		if (tbl_msc->length > table_end - table_offset)
+			return -EINVAL;
+		table_offset += tbl_msc->length;
+
+		if (!tbl_msc->mmio_size)
+			continue;
+
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * Call after ACPI devices have been created, which happens behind acpi_scan_init()
+ * called from subsys_initcall(). PCC requires the mailbox driver, which is
+ * initialised from postcore_initcall().
+ */
+subsys_initcall_sync(acpi_mpam_parse);
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index 67b76492c839..34181fa52e93 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -91,7 +91,6 @@ enum {
 };
 
 struct acpi_battery {
-	struct mutex lock;
 	struct mutex update_lock;
 	struct power_supply *bat;
 	struct power_supply_desc bat_desc;
@@ -535,11 +534,9 @@ static int acpi_battery_get_info(struct acpi_battery *battery)
 	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
 	acpi_status status = AE_ERROR;
 
-	mutex_lock(&battery->lock);
 	status = acpi_evaluate_object(battery->device->handle, use_bix ?
"_BIX":"_BIF", NULL, &buffer); - mutex_unlock(&battery->lock); if (ACPI_FAILURE(status)) { acpi_handle_info(battery->device->handle, @@ -576,11 +573,8 @@ static int acpi_battery_get_state(struct acpi_battery *battery) msecs_to_jiffies(cache_time))) return 0; - mutex_lock(&battery->lock); status = acpi_evaluate_object(battery->device->handle, "_BST", NULL, &buffer); - mutex_unlock(&battery->lock); - if (ACPI_FAILURE(status)) { acpi_handle_info(battery->device->handle, "_BST evaluation failed: %s", @@ -628,11 +622,8 @@ static int acpi_battery_set_alarm(struct acpi_battery *battery) !test_bit(ACPI_BATTERY_ALARM_PRESENT, &battery->flags)) return -ENODEV; - mutex_lock(&battery->lock); status = acpi_execute_simple_method(battery->device->handle, "_BTP", battery->alarm); - mutex_unlock(&battery->lock); - if (ACPI_FAILURE(status)) return -ENODEV; @@ -1235,9 +1226,6 @@ static int acpi_battery_add(struct acpi_device *device) strscpy(acpi_device_name(device), ACPI_BATTERY_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_BATTERY_CLASS); device->driver_data = battery; - result = devm_mutex_init(&device->dev, &battery->lock); - if (result) - return result; result = devm_mutex_init(&device->dev, &battery->update_lock); if (result) diff --git a/drivers/acpi/dptf/Makefile b/drivers/acpi/dptf/Makefile index 297340682f66..e912a3be1d28 100644 --- a/drivers/acpi/dptf/Makefile +++ b/drivers/acpi/dptf/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_ACPI) += int340x_thermal.o obj-$(CONFIG_DPTF_POWER) += dptf_power.o obj-$(CONFIG_DPTF_PCH_FIVR) += dptf_pch_fivr.o diff --git a/drivers/acpi/dptf/dptf_pch_fivr.c b/drivers/acpi/dptf/dptf_pch_fivr.c index 952216c67d58..8d7e555929d3 100644 --- a/drivers/acpi/dptf/dptf_pch_fivr.c +++ b/drivers/acpi/dptf/dptf_pch_fivr.c @@ -41,7 +41,7 @@ static int pch_fivr_read(acpi_handle handle, char *method, struct pch_fivr_resp ret = 0; release_buffer: - kfree(buffer.pointer); + ACPI_FREE(buffer.pointer); return ret; } diff --git a/drivers/acpi/dptf/dptf_power.c b/drivers/acpi/dptf/dptf_power.c index 776914f31b9e..55ccbb8ddbe3 100644 --- a/drivers/acpi/dptf/dptf_power.c +++ b/drivers/acpi/dptf/dptf_power.c @@ -240,6 +240,8 @@ static const struct acpi_device_id int3407_device_ids[] = { {"INTC10D9", 0}, {"INTC1100", 0}, {"INTC1101", 0}, + {"INTC10F7", 0}, + {"INTC10F8", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3407_device_ids); diff --git a/drivers/acpi/dptf/int340x_thermal.c b/drivers/acpi/dptf/int340x_thermal.c deleted file mode 100644 index a222df059a16..000000000000 --- a/drivers/acpi/dptf/int340x_thermal.c +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ACPI support for int340x thermal drivers - * - * Copyright (C) 2014, Intel Corporation - * Authors: Zhang Rui <rui.zhang@intel.com> - */ - -#include <linux/acpi.h> -#include <linux/module.h> - -#include "../internal.h" - -#define INT3401_DEVICE 0X01 -static const struct acpi_device_id int340x_thermal_device_ids[] = { - {"INT3400"}, - {"INT3401", INT3401_DEVICE}, - {"INT3402"}, - {"INT3403"}, - {"INT3404"}, - {"INT3406"}, - {"INT3407"}, - {"INT3408"}, - {"INT3409"}, - {"INT340A"}, - {"INT340B"}, - {"INT3532"}, - {"INTC1040"}, - {"INTC1041"}, - {"INTC1042"}, - {"INTC1043"}, - {"INTC1044"}, - {"INTC1045"}, - {"INTC1046"}, - {"INTC1047"}, - {"INTC1048"}, - {"INTC1049"}, - {"INTC1050"}, - {"INTC1060"}, - {"INTC1061"}, - {"INTC1062"}, - {"INTC1063"}, - {"INTC1064"}, - {"INTC1065"}, - {"INTC1066"}, - {"INTC1068"}, - {"INTC1069"}, - {"INTC106A"}, - {"INTC106B"}, - 
{"INTC106C"},
-	{"INTC106D"},
-	{"INTC10A0"},
-	{"INTC10A1"},
-	{"INTC10A2"},
-	{"INTC10A3"},
-	{"INTC10A4"},
-	{"INTC10A5"},
-	{"INTC10D4"},
-	{"INTC10D5"},
-	{"INTC10D6"},
-	{"INTC10D7"},
-	{"INTC10D8"},
-	{"INTC10D9"},
-	{"INTC10FC"},
-	{"INTC10FD"},
-	{"INTC10FE"},
-	{"INTC10FF"},
-	{"INTC1100"},
-	{"INTC1101"},
-	{"INTC1102"},
-	{""},
-};
-
-static int int340x_thermal_handler_attach(struct acpi_device *adev,
-					const struct acpi_device_id *id)
-{
-	if (IS_ENABLED(CONFIG_INT340X_THERMAL))
-		acpi_create_platform_device(adev, NULL);
-	/* Intel SoC DTS thermal driver needs INT3401 to set IRQ descriptor */
-	else if (IS_ENABLED(CONFIG_INTEL_SOC_DTS_THERMAL) &&
-		 id->driver_data == INT3401_DEVICE)
-		acpi_create_platform_device(adev, NULL);
-	return 1;
-}
-
-static struct acpi_scan_handler int340x_thermal_handler = {
-	.ids = int340x_thermal_device_ids,
-	.attach = int340x_thermal_handler_attach,
-};
-
-void __init acpi_int340x_thermal_init(void)
-{
-	acpi_scan_add_handler(&int340x_thermal_handler);
-}
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 7855bbf752b1..59b3d50ff01e 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -2294,7 +2294,8 @@ static int acpi_ec_init_workqueues(void)
 		ec_wq = alloc_ordered_workqueue("kec", 0);
 
 	if (!ec_query_wq)
-		ec_query_wq = alloc_workqueue("kec_query", 0, ec_max_queries);
+		ec_query_wq = alloc_workqueue("kec_query", WQ_PERCPU,
+					      ec_max_queries);
 
 	if (!ec_wq || !ec_query_wq) {
 		acpi_ec_destroy_workqueues();
diff --git a/drivers/acpi/fan.h b/drivers/acpi/fan.h
index bedbab0e8e4e..97ce3212edf3 100644
--- a/drivers/acpi/fan.h
+++ b/drivers/acpi/fan.h
@@ -11,6 +11,7 @@
 #define _ACPI_FAN_H_
 
 #include <linux/kconfig.h>
+#include <linux/limits.h>
 
 #define ACPI_FAN_DEVICE_IDS	\
 	{"INT3404", }, /* Fan */ \
@@ -21,6 +22,7 @@
 	{"INTC10A2", }, /* Fan for Raptor Lake generation */ \
 	{"INTC10D6", }, /* Fan for Panther Lake generation */ \
 	{"INTC10FE", }, /* Fan for Wildcat Lake generation */ \
+	{"INTC10F5", }, /* Fan for Nova Lake generation */ \
 	{"PNP0C0B", } /* Generic ACPI fan */
 
 #define ACPI_FPS_NAME_LEN	20
@@ -55,19 +57,58 @@ struct acpi_fan {
 	struct acpi_fan_fif fif;
 	struct acpi_fan_fps *fps;
 	int fps_count;
+	/* A value of 0 means that trip-point-related functions are not supported */
+	u32 fan_trip_granularity;
+#if IS_REACHABLE(CONFIG_HWMON)
+	struct device *hdev;
+#endif
 	struct thermal_cooling_device *cdev;
 	struct device_attribute fst_speed;
 	struct device_attribute fine_grain_control;
};
 
+/**
+ * acpi_fan_speed_valid - Check if fan speed value is valid
+ * @speed: Speed value returned by the ACPI firmware
+ *
+ * Check if the fan speed value returned by the ACPI firmware is valid. This function is
+ * necessary as ACPI firmware implementations can return 0xFFFFFFFF to signal that the
+ * ACPI fan does not support speed reporting. Additionally, some buggy ACPI firmware
+ * implementations return a value larger than the 32-bit integer value defined by
+ * the ACPI specification when using placeholder values. Such invalid values are also
+ * detected by this function.
+ *
+ * Returns: True if the fan speed value is valid, false otherwise.
+ */
+static inline bool acpi_fan_speed_valid(u64 speed)
+{
+	return speed < U32_MAX;
+}
+
+/**
+ * acpi_fan_power_valid - Check if fan power value is valid
+ * @power: Power value returned by the ACPI firmware
+ *
+ * Check if the fan power value returned by the ACPI firmware is valid.
+ * See acpi_fan_speed_valid() for details.
+ *
+ * Returns: True if the fan power value is valid, false otherwise.
+ */ +static inline bool acpi_fan_power_valid(u64 power) +{ + return power < U32_MAX; +} + int acpi_fan_get_fst(acpi_handle handle, struct acpi_fan_fst *fst); int acpi_fan_create_attributes(struct acpi_device *device); void acpi_fan_delete_attributes(struct acpi_device *device); #if IS_REACHABLE(CONFIG_HWMON) int devm_acpi_fan_create_hwmon(struct device *dev); +void acpi_fan_notify_hwmon(struct device *dev); #else static inline int devm_acpi_fan_create_hwmon(struct device *dev) { return 0; }; +static inline void acpi_fan_notify_hwmon(struct device *dev) { }; #endif #endif diff --git a/drivers/acpi/fan_core.c b/drivers/acpi/fan_core.c index 46e7fe7a506d..fb08b8549ed7 100644 --- a/drivers/acpi/fan_core.c +++ b/drivers/acpi/fan_core.c @@ -7,11 +7,16 @@ * Copyright (C) 2022 Intel Corporation. All rights reserved. */ +#include <linux/bits.h> #include <linux/kernel.h> +#include <linux/limits.h> +#include <linux/math.h> +#include <linux/math64.h> #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/uaccess.h> +#include <linux/uuid.h> #include <linux/thermal.h> #include <linux/acpi.h> #include <linux/platform_device.h> @@ -19,6 +24,26 @@ #include "fan.h" +#define ACPI_FAN_NOTIFY_STATE_CHANGED 0x80 + +/* + * Defined inside the "Fan Noise Signal" section at + * https://learn.microsoft.com/en-us/windows-hardware/design/device-experiences/design-guide. + */ +static const guid_t acpi_fan_microsoft_guid = GUID_INIT(0xA7611840, 0x99FE, 0x41AE, 0xA4, 0x88, + 0x35, 0xC7, 0x59, 0x26, 0xC8, 0xEB); +#define ACPI_FAN_DSM_GET_TRIP_POINT_GRANULARITY 1 +#define ACPI_FAN_DSM_SET_TRIP_POINTS 2 +#define ACPI_FAN_DSM_GET_OPERATING_RANGES 3 + +/* + * Ensures that fans with a very low trip point granularity + * do not send too many notifications. 
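+ * For example, a firmware-reported granularity of 1 RPM would otherwise place + * the trip points immediately around the current speed, raising a notification + * on every minor speed fluctuation.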
+ */ +static uint min_trip_distance = 100; +module_param(min_trip_distance, uint, 0); +MODULE_PARM_DESC(min_trip_distance, "Minimum distance between fan speed trip points in RPM"); + static const struct acpi_device_id fan_device_ids[] = { ACPI_FAN_DEVICE_IDS, {"", 0}, @@ -308,6 +333,182 @@ err: return status; } +static int acpi_fan_dsm_init(struct device *dev) +{ + union acpi_object dummy = { + .package = { + .type = ACPI_TYPE_PACKAGE, + .count = 0, + .elements = NULL, + }, + }; + struct acpi_fan *fan = dev_get_drvdata(dev); + union acpi_object *obj; + int ret = 0; + + if (!acpi_check_dsm(fan->handle, &acpi_fan_microsoft_guid, 0, + BIT(ACPI_FAN_DSM_GET_TRIP_POINT_GRANULARITY) | + BIT(ACPI_FAN_DSM_SET_TRIP_POINTS))) + return 0; + + dev_info(dev, "Using Microsoft fan extensions\n"); + + obj = acpi_evaluate_dsm_typed(fan->handle, &acpi_fan_microsoft_guid, 0, + ACPI_FAN_DSM_GET_TRIP_POINT_GRANULARITY, &dummy, + ACPI_TYPE_INTEGER); + if (!obj) + return -EIO; + + if (obj->integer.value > U32_MAX) + ret = -EOVERFLOW; + else + fan->fan_trip_granularity = obj->integer.value; + + kfree(obj); + + return ret; +} + +static int acpi_fan_dsm_set_trip_points(struct device *dev, u64 upper, u64 lower) +{ + union acpi_object args[2] = { + { + .integer = { + .type = ACPI_TYPE_INTEGER, + .value = lower, + }, + }, + { + .integer = { + .type = ACPI_TYPE_INTEGER, + .value = upper, + }, + }, + }; + struct acpi_fan *fan = dev_get_drvdata(dev); + union acpi_object in = { + .package = { + .type = ACPI_TYPE_PACKAGE, + .count = ARRAY_SIZE(args), + .elements = args, + }, + }; + union acpi_object *obj; + + obj = acpi_evaluate_dsm(fan->handle, &acpi_fan_microsoft_guid, 0, + ACPI_FAN_DSM_SET_TRIP_POINTS, &in); + kfree(obj); + + return 0; +} + +static int acpi_fan_dsm_start(struct device *dev) +{ + struct acpi_fan *fan = dev_get_drvdata(dev); + int ret; + + if (!fan->fan_trip_granularity) + return 0; + + /* + * Some firmware implementations only update the values returned by the + * _FST control method when a notification is received. This usually + * works with Microsoft Windows as setting up trip points will keep + * triggering said notifications, but will cause issues when using _FST + * without the Microsoft-specific trip point extension. + * + * Because of this, an initial notification needs to be triggered to + * start the cycle of trip point updates. This is achieved by setting + * the trip points sequentially to two separate ranges. As per the + * Microsoft specification, the firmware should trigger a notification + * immediately if the fan speed is outside the trip point range. This + * _should_ result in at least one notification as both ranges do not + * overlap, meaning that the current fan speed needs to be outside at + * least one range. + */ + ret = acpi_fan_dsm_set_trip_points(dev, fan->fan_trip_granularity, 0); + if (ret < 0) + return ret; + + return acpi_fan_dsm_set_trip_points(dev, fan->fan_trip_granularity * 3, + fan->fan_trip_granularity * 2); +} + +static int acpi_fan_dsm_update_trips_points(struct device *dev, struct acpi_fan_fst *fst) +{ + struct acpi_fan *fan = dev_get_drvdata(dev); + u64 upper, lower; + + if (!fan->fan_trip_granularity) + return 0; + + if (!acpi_fan_speed_valid(fst->speed)) + return -EINVAL; + + upper = roundup_u64(fst->speed + min_trip_distance, fan->fan_trip_granularity); + if (fst->speed <= min_trip_distance) { + lower = 0; + } else { + /* + * Valid fan speed values cannot be larger than 32 bit, so + * we can safely assume that no overflow will happen here.
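+ * As a worked example (values illustrative): with the default + * min_trip_distance of 100 RPM, a granularity of 50 RPM and a _FST speed of + * 1234 RPM, upper = roundup_u64(1334, 50) = 1350 and + * lower = rounddown(1134, 50) = 1100.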
+ */ + lower = rounddown((u32)fst->speed - min_trip_distance, fan->fan_trip_granularity); + } + + return acpi_fan_dsm_set_trip_points(dev, upper, lower); +} + +static void acpi_fan_notify_handler(acpi_handle handle, u32 event, void *context) +{ + struct device *dev = context; + struct acpi_fan_fst fst; + int ret; + + switch (event) { + case ACPI_FAN_NOTIFY_STATE_CHANGED: + /* + * The ACPI specification says that we must evaluate _FST when we + * receive an ACPI event indicating that the fan state has changed. + */ + ret = acpi_fan_get_fst(handle, &fst); + if (ret < 0) { + dev_err(dev, "Error retrieving current fan status: %d\n", ret); + } else { + ret = acpi_fan_dsm_update_trips_points(dev, &fst); + if (ret < 0) + dev_err(dev, "Failed to update trip points: %d\n", ret); + } + + acpi_fan_notify_hwmon(dev); + acpi_bus_generate_netlink_event("fan", dev_name(dev), event, 0); + break; + default: + dev_dbg(dev, "Unsupported ACPI notification 0x%x\n", event); + break; + } +} + +static void acpi_fan_notify_remove(void *data) +{ + struct acpi_fan *fan = data; + + acpi_remove_notify_handler(fan->handle, ACPI_DEVICE_NOTIFY, acpi_fan_notify_handler); +} + +static int devm_acpi_fan_notify_init(struct device *dev) +{ + struct acpi_fan *fan = dev_get_drvdata(dev); + acpi_status status; + + status = acpi_install_notify_handler(fan->handle, ACPI_DEVICE_NOTIFY, + acpi_fan_notify_handler, dev); + if (ACPI_FAILURE(status)) + return -EIO; + + return devm_add_action_or_reset(dev, acpi_fan_notify_remove, fan); +} + static int acpi_fan_probe(struct platform_device *pdev) { int result = 0; @@ -347,10 +548,24 @@ static int acpi_fan_probe(struct platform_device *pdev) } if (fan->has_fst) { + result = acpi_fan_dsm_init(&pdev->dev); + if (result) + return result; + result = devm_acpi_fan_create_hwmon(&pdev->dev); if (result) return result; + result = devm_acpi_fan_notify_init(&pdev->dev); + if (result) + return result; + + result = acpi_fan_dsm_start(&pdev->dev); + if (result) { + dev_err(&pdev->dev, "Failed to start Microsoft fan extensions\n"); + return result; + } + result = acpi_fan_create_attributes(device); if (result) return result; @@ -436,8 +651,14 @@ static int acpi_fan_suspend(struct device *dev) static int acpi_fan_resume(struct device *dev) { - int result; struct acpi_fan *fan = dev_get_drvdata(dev); + int result; + + if (fan->has_fst) { + result = acpi_fan_dsm_start(dev); + if (result) + dev_err(dev, "Failed to start Microsoft fan extensions: %d\n", result); + } if (fan->acpi4) return 0; diff --git a/drivers/acpi/fan_hwmon.c b/drivers/acpi/fan_hwmon.c index 4b2c2007f2d7..d3374f8f524b 100644 --- a/drivers/acpi/fan_hwmon.c +++ b/drivers/acpi/fan_hwmon.c @@ -15,10 +15,6 @@ #include "fan.h" -/* Returned when the ACPI fan does not support speed reporting */ -#define FAN_SPEED_UNAVAILABLE U32_MAX -#define FAN_POWER_UNAVAILABLE U32_MAX - static struct acpi_fan_fps *acpi_fan_get_current_fps(struct acpi_fan *fan, u64 control) { unsigned int i; @@ -77,7 +73,7 @@ static umode_t acpi_fan_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_ * when the associated attribute should not be created. 
*/ for (i = 0; i < fan->fps_count; i++) { - if (fan->fps[i].power != FAN_POWER_UNAVAILABLE) + if (acpi_fan_power_valid(fan->fps[i].power)) return 0444; } @@ -106,7 +102,7 @@ static int acpi_fan_hwmon_read(struct device *dev, enum hwmon_sensor_types type, case hwmon_fan: switch (attr) { case hwmon_fan_input: - if (fst.speed == FAN_SPEED_UNAVAILABLE) + if (!acpi_fan_speed_valid(fst.speed)) return -ENODEV; if (fst.speed > LONG_MAX) @@ -134,7 +130,7 @@ static int acpi_fan_hwmon_read(struct device *dev, enum hwmon_sensor_types type, if (!fps) return -EIO; - if (fps->power == FAN_POWER_UNAVAILABLE) + if (!acpi_fan_power_valid(fps->power)) return -ENODEV; if (fps->power > LONG_MAX / MICROWATT_PER_MILLIWATT) @@ -166,12 +162,19 @@ static const struct hwmon_chip_info acpi_fan_hwmon_chip_info = { .info = acpi_fan_hwmon_info, }; +void acpi_fan_notify_hwmon(struct device *dev) +{ + struct acpi_fan *fan = dev_get_drvdata(dev); + + hwmon_notify_event(fan->hdev, hwmon_fan, hwmon_fan_input, 0); +} + int devm_acpi_fan_create_hwmon(struct device *dev) { struct acpi_fan *fan = dev_get_drvdata(dev); - struct device *hdev; - hdev = devm_hwmon_device_register_with_info(dev, "acpi_fan", fan, &acpi_fan_hwmon_chip_info, - NULL); - return PTR_ERR_OR_ZERO(hdev); + fan->hdev = devm_hwmon_device_register_with_info(dev, "acpi_fan", fan, + &acpi_fan_hwmon_chip_info, NULL); + + return PTR_ERR_OR_ZERO(fan->hdev); } diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 63354972ab0b..40f875b265a9 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -27,7 +27,6 @@ static inline void acpi_pci_link_init(void) {} void acpi_processor_init(void); void acpi_platform_init(void); void acpi_pnp_init(void); -void acpi_int340x_thermal_init(void); int acpi_sysfs_init(void); void acpi_gpe_apply_masked_gpes(void); void acpi_container_init(void); diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 5ff343096ece..05393a7315fe 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -398,7 +398,7 @@ static void acpi_os_drop_map_ref(struct acpi_ioremap *map) list_del_rcu(&map->list); INIT_RCU_WORK(&map->track.rwork, acpi_os_map_remove); - queue_rcu_work(system_wq, &map->track.rwork); + queue_rcu_work(system_percpu_wq, &map->track.rwork); } /** @@ -1694,8 +1694,8 @@ acpi_status __init acpi_os_initialize(void) acpi_status __init acpi_os_initialize1(void) { - kacpid_wq = alloc_workqueue("kacpid", 0, 1); - kacpi_notify_wq = alloc_workqueue("kacpi_notify", 0, 0); + kacpid_wq = alloc_workqueue("kacpid", WQ_PERCPU, 1); + kacpi_notify_wq = alloc_workqueue("kacpi_notify", WQ_PERCPU, 0); kacpi_hotplug_wq = alloc_ordered_workqueue("kacpi_hotplug", 0); BUG_ON(!kacpid_wq); BUG_ON(!kacpi_notify_wq); diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 54676e3d82dd..de5f8c018333 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -21,6 +21,25 @@ #include <linux/cacheinfo.h> #include <acpi/processor.h> +/* + * The acpi_pptt_cache_v1 in actbl2.h, which is imported from acpica, + * only contains the cache_id field rather than all the fields of the + * Cache Type Structure. Use this alternative structure until it is + * resolved in acpica. 
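+ * The layout below is the PPTT rev 3 Cache Type Structure spelled out in + * full: the members of struct acpi_pptt_cache followed by the trailing + * cache_id field.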
+ */ +struct acpi_pptt_cache_v1_full { + struct acpi_subtable_header header; + u16 reserved; + u32 flags; + u32 next_level_of_cache; + u32 size; + u32 number_of_sets; + u8 associativity; + u8 attributes; + u16 line_size; + u32 cache_id; +} __packed; + static struct acpi_subtable_header *fetch_pptt_subtable(struct acpi_table_header *table_hdr, u32 pptt_ref) { @@ -56,6 +75,18 @@ static struct acpi_pptt_cache *fetch_pptt_cache(struct acpi_table_header *table_ return (struct acpi_pptt_cache *)fetch_pptt_subtable(table_hdr, pptt_ref); } +static struct acpi_pptt_cache_v1_full *upgrade_pptt_cache(struct acpi_pptt_cache *cache) +{ + if (cache->header.length < sizeof(struct acpi_pptt_cache_v1_full)) + return NULL; + + /* No use for v1 if the only additional field is invalid */ + if (!(cache->flags & ACPI_PPTT_CACHE_ID_VALID)) + return NULL; + + return (struct acpi_pptt_cache_v1_full *)cache; +} + static struct acpi_subtable_header *acpi_get_pptt_resource(struct acpi_table_header *table_hdr, struct acpi_pptt_processor *node, int resource) @@ -177,14 +208,14 @@ acpi_find_cache_level(struct acpi_table_header *table_hdr, } /** - * acpi_count_levels() - Given a PPTT table, and a CPU node, count the cache - * levels and split cache levels (data/instruction). + * acpi_count_levels() - Given a PPTT table, and a CPU node, count the + * total number of levels and split cache levels (data/instruction). * @table_hdr: Pointer to the head of the PPTT table * @cpu_node: processor node we wish to count caches for - * @levels: Number of levels if success. * @split_levels: Number of split cache levels (data/instruction) if * success. Can be NULL. * + * Return: number of levels. * Given a processor node containing a processing unit, walk into it and count * how many levels exist solely for it, and then walk up each level until we hit * the root node (ignore the package level because it may be possible to have @@ -192,14 +223,18 @@ acpi_find_cache_level(struct acpi_table_header *table_hdr, * split cache levels (data/instruction) that exist at each level on the way * up. */ -static void acpi_count_levels(struct acpi_table_header *table_hdr, - struct acpi_pptt_processor *cpu_node, - unsigned int *levels, unsigned int *split_levels) +static int acpi_count_levels(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *cpu_node, + unsigned int *split_levels) { + int current_level = 0; + do { - acpi_find_cache_level(table_hdr, cpu_node, levels, split_levels, 0, 0); + acpi_find_cache_level(table_hdr, cpu_node, &current_level, split_levels, 0, 0); cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent); } while (cpu_node); + + return current_level; } /** @@ -351,7 +386,6 @@ static struct acpi_pptt_cache *acpi_find_cache_node(struct acpi_table_header *ta * @this_leaf: Kernel cache info structure being updated * @found_cache: The PPTT node describing this cache instance * @cpu_node: A unique reference to describe this cache instance - * @revision: The revision of the PPTT table * * The ACPI spec implies that the fields in the cache structures are used to * extend and correct the information probed from the hardware.
Let's only @@ -361,10 +395,9 @@ static struct acpi_pptt_cache *acpi_find_cache_node(struct acpi_table_header *ta */ static void update_cache_properties(struct cacheinfo *this_leaf, struct acpi_pptt_cache *found_cache, - struct acpi_pptt_processor *cpu_node, - u8 revision) + struct acpi_pptt_processor *cpu_node) { - struct acpi_pptt_cache_v1* found_cache_v1; + struct acpi_pptt_cache_v1_full *found_cache_v1; this_leaf->fw_token = cpu_node; if (found_cache->flags & ACPI_PPTT_SIZE_PROPERTY_VALID) @@ -414,9 +447,8 @@ static void update_cache_properties(struct cacheinfo *this_leaf, found_cache->flags & ACPI_PPTT_CACHE_TYPE_VALID) this_leaf->type = CACHE_TYPE_UNIFIED; - if (revision >= 3 && (found_cache->flags & ACPI_PPTT_CACHE_ID_VALID)) { - found_cache_v1 = ACPI_ADD_PTR(struct acpi_pptt_cache_v1, - found_cache, sizeof(struct acpi_pptt_cache)); + found_cache_v1 = upgrade_pptt_cache(found_cache); + if (found_cache_v1) { this_leaf->id = found_cache_v1->cache_id; this_leaf->attributes |= CACHE_ID; } @@ -441,8 +473,7 @@ static void cache_setup_acpi_cpu(struct acpi_table_header *table, pr_debug("found = %p %p\n", found_cache, cpu_node); if (found_cache) update_cache_properties(this_leaf, found_cache, - ACPI_TO_POINTER(ACPI_PTR_DIFF(cpu_node, table)), - table->revision); + ACPI_TO_POINTER(ACPI_PTR_DIFF(cpu_node, table))); index++; } @@ -645,7 +676,7 @@ int acpi_get_cache_info(unsigned int cpu, unsigned int *levels, if (!cpu_node) return -ENOENT; - acpi_count_levels(table, cpu_node, levels, split_levels); + *levels = acpi_count_levels(table, cpu_node, split_levels); pr_debug("Cache Setup: last_level=%d split_levels=%d\n", *levels, split_levels ? *split_levels : -1); @@ -817,3 +848,218 @@ int find_acpi_cpu_topology_hetero_id(unsigned int cpu) return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE, ACPI_PPTT_ACPI_IDENTICAL); } + +/** + * acpi_pptt_get_child_cpus() - Find all the CPUs below a PPTT + * processor hierarchy node + * + * @table_hdr: A reference to the PPTT table + * @parent_node: A pointer to the processor hierarchy node in the + * table_hdr + * @cpus: A cpumask to fill with the CPUs below @parent_node + * + * Walks up the PPTT from every possible CPU to find if the provided + * @parent_node is a parent of this CPU. + */ +static void acpi_pptt_get_child_cpus(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *parent_node, + cpumask_t *cpus) +{ + struct acpi_pptt_processor *cpu_node; + u32 acpi_id; + int cpu; + + cpumask_clear(cpus); + + for_each_possible_cpu(cpu) { + acpi_id = get_acpi_id_for_cpu(cpu); + cpu_node = acpi_find_processor_node(table_hdr, acpi_id); + + while (cpu_node) { + if (cpu_node == parent_node) { + cpumask_set_cpu(cpu, cpus); + break; + } + cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent); + } + } +} + +/** + * acpi_pptt_get_cpus_from_container() - Populate a cpumask with all CPUs in a + * processor container + * @acpi_cpu_id: The UID of the processor container + * @cpus: The resulting CPU mask + * + * Find the specified Processor Container, and fill @cpus with all the cpus + * below it. + * + * Not all 'Processor Hierarchy' entries in the PPTT are either a CPU + * or a Processor Container; they may exist purely to describe a + * Private resource. CPUs have to be leaves, so a Processor Container + * is a non-leaf that has the 'ACPI Processor ID valid' flag set.
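+ * For example, a processor container device (HID ACPI0010) whose _UID is 2 + * is matched here by acpi_cpu_id == 2, and every leaf CPU beneath it is set + * in @cpus.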
+ */ +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) +{ + struct acpi_table_header *table_hdr; + struct acpi_subtable_header *entry; + unsigned long table_end; + u32 proc_sz; + + cpumask_clear(cpus); + + table_hdr = acpi_get_pptt(); + if (!table_hdr) + return; + + table_end = (unsigned long)table_hdr + table_hdr->length; + entry = ACPI_ADD_PTR(struct acpi_subtable_header, table_hdr, + sizeof(struct acpi_table_pptt)); + proc_sz = sizeof(struct acpi_pptt_processor); + while ((unsigned long)entry + proc_sz <= table_end) { + if (entry->type == ACPI_PPTT_TYPE_PROCESSOR) { + struct acpi_pptt_processor *cpu_node; + + cpu_node = (struct acpi_pptt_processor *)entry; + if (cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID && + !acpi_pptt_leaf_node(table_hdr, cpu_node) && + cpu_node->acpi_processor_id == acpi_cpu_id) { + acpi_pptt_get_child_cpus(table_hdr, cpu_node, cpus); + break; + } + } + entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, + entry->length); + } +} + +/** + * find_acpi_cache_level_from_id() - Get the level of the specified cache + * @cache_id: The id field of the cache + * + * Determine the level relative to any CPU for the cache identified by + * cache_id. This allows the property to be found even if the CPUs are offline. + * + * The returned level can be used to group caches that are peers. + * + * The PPTT table must be rev 3 or later. + * + * If one CPU's L2 is shared with another CPU as L3, this function will return + * an unpredictable value. + * + * Return: -ENOENT if the PPTT doesn't exist, the revision isn't supported or + * the cache cannot be found. + * Otherwise returns a value which represents the level of the specified cache. + */ +int find_acpi_cache_level_from_id(u32 cache_id) +{ + int cpu; + struct acpi_table_header *table; + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; + + if (table->revision < 3) + return -ENOENT; + + for_each_possible_cpu(cpu) { + bool empty; + int level = 1; + u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu); + struct acpi_pptt_cache *cache; + struct acpi_pptt_processor *cpu_node; + + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (!cpu_node) + continue; + + do { + int cache_type[] = {CACHE_TYPE_INST, CACHE_TYPE_DATA, CACHE_TYPE_UNIFIED}; + + empty = true; + for (int i = 0; i < ARRAY_SIZE(cache_type); i++) { + struct acpi_pptt_cache_v1_full *cache_v1; + + cache = acpi_find_cache_node(table, acpi_cpu_id, cache_type[i], + level, &cpu_node); + if (!cache) + continue; + + empty = false; + + cache_v1 = upgrade_pptt_cache(cache); + if (cache_v1 && cache_v1->cache_id == cache_id) + return level; + } + level++; + } while (!empty); + } + + return -ENOENT; +} + +/** + * acpi_pptt_get_cpumask_from_cache_id() - Get the cpus associated with the + * specified cache + * @cache_id: The id field of the cache + * @cpus: Where to build the cpumask + * + * Determine which CPUs are below this cache in the PPTT. This allows the property + * to be found even if the CPUs are offline. + * + * The PPTT table must be rev 3 or later. + * + * Return: -ENOENT if the PPTT doesn't exist, or the cache cannot be found. + * Otherwise returns 0 and sets the cpus in the provided cpumask.
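+ * For example, if cache_id 42 identifies an L3 shared by CPUs 0-3, all four + * CPUs are set in @cpus even if some of them are currently offline.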
+ */ +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus) +{ + int cpu; + struct acpi_table_header *table; + + cpumask_clear(cpus); + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; + + if (table->revision < 3) + return -ENOENT; + + for_each_possible_cpu(cpu) { + bool empty; + int level = 1; + u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu); + struct acpi_pptt_cache *cache; + struct acpi_pptt_processor *cpu_node; + + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (!cpu_node) + continue; + + do { + int cache_type[] = {CACHE_TYPE_INST, CACHE_TYPE_DATA, CACHE_TYPE_UNIFIED}; + + empty = true; + for (int i = 0; i < ARRAY_SIZE(cache_type); i++) { + struct acpi_pptt_cache_v1_full *cache_v1; + + cache = acpi_find_cache_node(table, acpi_cpu_id, cache_type[i], + level, &cpu_node); + + if (!cache) + continue; + + empty = false; + + cache_v1 = upgrade_pptt_cache(cache); + if (cache_v1 && cache_v1->cache_id == cache_id) + cpumask_set_cpu(cpu, cpus); + } + level++; + } while (!empty); + } + + return 0; +} diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 9b6b71a2ffb5..a4498357bd16 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -54,7 +54,7 @@ static int map_x2apic_id(struct acpi_subtable_header *entry, if (!(apic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; - if (device_declaration && (apic->uid == acpi_id)) { + if (apic->uid == acpi_id && (device_declaration || acpi_id < 255)) { *apic_id = apic->local_apic_id; return 0; } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 4166090db642..89f2f08b2554 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -732,18 +732,16 @@ static int __cpuidle acpi_idle_enter_s2idle(struct cpuidle_device *dev, return 0; } -static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr, - struct cpuidle_device *dev) +static void acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr, + struct cpuidle_device *dev) { int i, count = ACPI_IDLE_STATE_START; struct acpi_processor_cx *cx; - struct cpuidle_state *state; if (max_cstate == 0) max_cstate = 1; for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) { - state = &acpi_idle_driver.states[count]; cx = &pr->power.states[i]; if (!cx->valid) @@ -751,27 +749,13 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr, per_cpu(acpi_cstate[count], dev->cpu) = cx; - if (lapic_timer_needs_broadcast(pr, cx)) - state->flags |= CPUIDLE_FLAG_TIMER_STOP; - - if (cx->type == ACPI_STATE_C3) { - state->flags |= CPUIDLE_FLAG_TLB_FLUSHED; - if (pr->flags.bm_check) - state->flags |= CPUIDLE_FLAG_RCU_IDLE; - } - count++; if (count == CPUIDLE_STATE_MAX) break; } - - if (!count) - return -EINVAL; - - return 0; } -static int acpi_processor_setup_cstates(struct acpi_processor *pr) +static void acpi_processor_setup_cstates(struct acpi_processor *pr) { int i, count; struct acpi_processor_cx *cx; @@ -818,17 +802,21 @@ static int acpi_processor_setup_cstates(struct acpi_processor *pr) if (cx->type != ACPI_STATE_C1 && !acpi_idle_fallback_to_c1(pr)) state->enter_s2idle = acpi_idle_enter_s2idle; + if (lapic_timer_needs_broadcast(pr, cx)) + state->flags |= CPUIDLE_FLAG_TIMER_STOP; + + if (cx->type == ACPI_STATE_C3) { + state->flags |= CPUIDLE_FLAG_TLB_FLUSHED; + if (pr->flags.bm_check) + state->flags |= CPUIDLE_FLAG_RCU_IDLE; + } + count++; if (count == CPUIDLE_STATE_MAX) break; } drv->state_count = count; - - if (!count) - return -EINVAL; - - return 0; 
} static inline void acpi_processor_cstate_first_run_checks(void) @@ -1243,7 +1231,8 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr) if (pr->flags.has_lpi) return acpi_processor_setup_lpi_states(pr); - return acpi_processor_setup_cstates(pr); + acpi_processor_setup_cstates(pr); + return 0; } /** @@ -1263,7 +1252,8 @@ static int acpi_processor_setup_cpuidle_dev(struct acpi_processor *pr, if (pr->flags.has_lpi) return acpi_processor_ffh_lpi_probe(pr->id); - return acpi_processor_setup_cpuidle_cx(pr, dev); + acpi_processor_setup_cpuidle_cx(pr, dev); + return 0; } static int acpi_processor_get_power_info(struct acpi_processor *pr) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 43d5e457814e..18e90067d567 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1280,7 +1280,7 @@ static int acpi_data_prop_read(const struct acpi_device_data *data, ret = acpi_copy_property_array_uint(items, (u64 *)val, nval); break; case DEV_PROP_STRING: - nval = min_t(u32, nval, obj->package.count); + nval = min(nval, obj->package.count); if (nval == 0) return -ENODATA; @@ -1329,13 +1329,14 @@ static int stop_on_next(struct acpi_device *adev, void *data) return 0; } -/** +/* * acpi_get_next_subnode - Return the next child node handle for a fwnode * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle. */ -struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child) +static struct fwnode_handle * +acpi_get_next_subnode(const struct fwnode_handle *fwnode, + struct fwnode_handle *child) { struct acpi_device *adev = to_acpi_device_node(fwnode); @@ -1472,7 +1473,7 @@ static struct fwnode_handle *acpi_graph_get_next_endpoint( if (!prev) { do { - port = fwnode_get_next_child_node(fwnode, port); + port = acpi_get_next_subnode(fwnode, port); /* * The names of the port nodes begin with "port@" * followed by the number of the port node and they also @@ -1490,14 +1491,17 @@ static struct fwnode_handle *acpi_graph_get_next_endpoint( if (!port) return NULL; - endpoint = fwnode_get_next_child_node(port, prev); - while (!endpoint) { - port = fwnode_get_next_child_node(fwnode, port); - if (!port) + do { + endpoint = acpi_get_next_subnode(port, prev); + if (endpoint) break; - if (is_acpi_graph_node(port, "port")) - endpoint = fwnode_get_next_child_node(port, NULL); - } + + prev = NULL; + + do { + port = acpi_get_next_subnode(fwnode, port); + } while (port && !is_acpi_graph_node(port, "port")); + } while (port); /* * The names of the endpoint nodes begin with "endpoint@" followed by @@ -1714,6 +1718,7 @@ static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, if (fwnode_property_read_u32(fwnode, "reg", &endpoint->id)) fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id); + fwnode_handle_put(port_fwnode); return 0; } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index ef16d58b2949..416d87f9bd10 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2397,7 +2397,7 @@ static bool acpi_scan_clear_dep_queue(struct acpi_device *adev) * initial enumeration of devices is complete, put it into the unbound * workqueue. 
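* (system_dfl_wq is the kernel's default unbound workqueue; it supersedes the * older system_unbound_wq name.)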
*/ - queue_work(system_unbound_wq, &cdw->work); + queue_work(system_dfl_wq, &cdw->work); return true; } @@ -2711,7 +2711,6 @@ void __init acpi_scan_init(void) acpi_watchdog_init(); acpi_pnp_init(); acpi_power_resources_init(); - acpi_int340x_thermal_init(); acpi_init_lpit(); acpi_scan_add_handler(&generic_device_handler); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index c8ee8e42b0f6..68943b98333d 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -642,7 +642,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) /* * Disable all GPE and clear their status bits before interrupts are * enabled. Some GPEs (like wakeup GPEs) have no handlers and this can - * prevent them from producing spurious interrups. + * prevent them from producing spurious interrupts. * * acpi_leave_sleep_state() will reenable specific GPEs later. * diff --git a/drivers/acpi/sleep.h b/drivers/acpi/sleep.h index d960a238be4e..9c3cb109c5d2 100644 --- a/drivers/acpi/sleep.h +++ b/drivers/acpi/sleep.h @@ -17,10 +17,7 @@ static inline acpi_status acpi_set_waking_vector(u32 wakeup_address) extern int acpi_s2idle_begin(void); extern int acpi_s2idle_prepare(void); -extern int acpi_s2idle_prepare_late(void); -extern void acpi_s2idle_check(void); extern bool acpi_s2idle_wake(void); -extern void acpi_s2idle_restore_early(void); extern void acpi_s2idle_restore(void); extern void acpi_s2idle_end(void); diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 57fc8bc56166..4286e4af1092 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -408,7 +408,7 @@ static const char table_sigs[][ACPI_NAMESEG_SIZE] __nonstring_array __initconst ACPI_SIG_PSDT, ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, ACPI_SIG_IORT, ACPI_SIG_NFIT, ACPI_SIG_HMAT, ACPI_SIG_PPTT, ACPI_SIG_NHLT, ACPI_SIG_AEST, ACPI_SIG_CEDT, ACPI_SIG_AGDI, - ACPI_SIG_NBFT, ACPI_SIG_SWFT}; + ACPI_SIG_NBFT, ACPI_SIG_SWFT, ACPI_SIG_MPAM}; #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 8537395b417b..a511f9ea0267 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -1060,7 +1060,8 @@ static int __init acpi_thermal_init(void) } acpi_thermal_pm_queue = alloc_workqueue("acpi_thermal_pm", - WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!acpi_thermal_pm_queue) return -ENODEV; diff --git a/drivers/acpi/x86/lpss.c b/drivers/acpi/x86/lpss.c index 6daa6372f980..1dcb80ab0d23 100644 --- a/drivers/acpi/x86/lpss.c +++ b/drivers/acpi/x86/lpss.c @@ -181,7 +181,7 @@ static void byt_i2c_setup(struct lpss_private_data *pdata) acpi_status status; u64 uid; - /* Expected to always be successfull, but better safe then sorry */ + /* Expected to always be successful, but better safe than sorry */ if (!acpi_dev_uid_to_integer(pdata->adev, &uid) && uid) { /* Detect I2C bus shared with PUNIT and ignore its d3 status */ status = acpi_evaluate_integer(handle, "_SEM", NULL, &shared_host); diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c index dd0b40b9bbe8..6d4d06236f61 100644 --- a/drivers/acpi/x86/s2idle.c +++ b/drivers/acpi/x86/s2idle.c @@ -299,34 +299,13 @@ free_acpi_buffer: ACPI_FREE(out_obj); } -/** - * acpi_get_lps0_constraint - Get the LPS0 constraint for a device. - * @adev: Device to get the constraint for.
- * - * The LPS0 constraint is the shallowest (minimum) power state in which the - * device can be so as to allow the platform as a whole to achieve additional - * energy conservation by utilizing a system-wide low-power state. - * - * Returns: - * - ACPI power state value of the constraint for @adev on success. - * - Otherwise, ACPI_STATE_UNKNOWN. - */ -int acpi_get_lps0_constraint(struct acpi_device *adev) -{ - struct lpi_constraints *entry; - - for_each_lpi_constraint(entry) { - if (adev->handle == entry->handle) - return entry->min_dstate; - } - - return ACPI_STATE_UNKNOWN; -} - static void lpi_check_constraints(void) { struct lpi_constraints *entry; + if (IS_ERR_OR_NULL(lpi_constraints_table)) + return; + for_each_lpi_constraint(entry) { struct acpi_device *adev = acpi_fetch_acpi_dev(entry->handle); @@ -508,11 +487,6 @@ static int lps0_device_attach(struct acpi_device *adev, lps0_device_handle = adev->handle; - if (acpi_s2idle_vendor_amd()) - lpi_device_get_constraints_amd(); - else - lpi_device_get_constraints(); - /* * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set in * the FADT and the default suspend mode was not set from the command @@ -539,7 +513,26 @@ static struct acpi_scan_handler lps0_handler = { .attach = lps0_device_attach, }; -int acpi_s2idle_prepare_late(void) +static int acpi_s2idle_begin_lps0(void) +{ + if (pm_debug_messages_on && !lpi_constraints_table) { + if (acpi_s2idle_vendor_amd()) + lpi_device_get_constraints_amd(); + else + lpi_device_get_constraints(); + + /* + * Try to retrieve the constraints only once because failures + * to do so usually are sticky. + */ + if (!lpi_constraints_table) + lpi_constraints_table = ERR_PTR(-ENODATA); + } + + return acpi_s2idle_begin(); +} + +static int acpi_s2idle_prepare_late_lps0(void) { struct acpi_s2idle_dev_ops *handler; @@ -585,7 +578,7 @@ int acpi_s2idle_prepare_late(void) return 0; } -void acpi_s2idle_check(void) +static void acpi_s2idle_check_lps0(void) { struct acpi_s2idle_dev_ops *handler; @@ -598,7 +591,7 @@ void acpi_s2idle_check(void) } } -void acpi_s2idle_restore_early(void) +static void acpi_s2idle_restore_early_lps0(void) { struct acpi_s2idle_dev_ops *handler; @@ -636,12 +629,12 @@ void acpi_s2idle_restore_early(void) } static const struct platform_s2idle_ops acpi_s2idle_ops_lps0 = { - .begin = acpi_s2idle_begin, + .begin = acpi_s2idle_begin_lps0, .prepare = acpi_s2idle_prepare, - .prepare_late = acpi_s2idle_prepare_late, - .check = acpi_s2idle_check, + .prepare_late = acpi_s2idle_prepare_late_lps0, + .check = acpi_s2idle_check_lps0, .wake = acpi_s2idle_wake, - .restore_early = acpi_s2idle_restore_early, + .restore_early = acpi_s2idle_restore_early_lps0, .restore = acpi_s2idle_restore, .end = acpi_s2idle_end, }; diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6d84a02cfa5d..fc43f2703ae0 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -226,7 +226,6 @@ static int memory_block_online(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; - struct memory_notify arg; struct zone *zone; int ret; @@ -246,19 +245,9 @@ static int memory_block_online(struct memory_block *mem) if (mem->altmap) nr_vmemmap_pages = mem->altmap->free; - arg.altmap_start_pfn = start_pfn; - arg.altmap_nr_pages = nr_vmemmap_pages; - arg.start_pfn = start_pfn + nr_vmemmap_pages; - arg.nr_pages = nr_pages - nr_vmemmap_pages; mem_hotplug_begin(); - ret = 
memory_notify(MEM_PREPARE_ONLINE, &arg); - ret = notifier_to_errno(ret); - if (ret) - goto out_notifier; - if (nr_vmemmap_pages) { - ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, - zone, mem->altmap->inaccessible); + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); if (ret) goto out; } @@ -280,11 +269,7 @@ static int memory_block_online(struct memory_block *mem) nr_vmemmap_pages); mem->zone = zone; - mem_hotplug_done(); - return ret; out: - memory_notify(MEM_FINISH_OFFLINE, &arg); -out_notifier: mem_hotplug_done(); return ret; } @@ -297,7 +282,6 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; - struct memory_notify arg; int ret; if (!mem->zone) @@ -329,11 +313,6 @@ static int memory_block_offline(struct memory_block *mem) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; - arg.altmap_start_pfn = start_pfn; - arg.altmap_nr_pages = nr_vmemmap_pages; - arg.start_pfn = start_pfn + nr_vmemmap_pages; - arg.nr_pages = nr_pages - nr_vmemmap_pages; - memory_notify(MEM_FINISH_OFFLINE, &arg); out: mem_hotplug_done(); return ret; diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 6502720bb564..af99bbcf281c 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -8,6 +8,13 @@ #include <linux/pm_runtime.h> #include <linux/export.h> +#define CALL_PM_OP(dev, op) \ +({ \ + struct device *_dev = (dev); \ + const struct dev_pm_ops *pm = _dev->driver ? _dev->driver->pm : NULL; \ + pm && pm->op ? pm->op(_dev) : 0; \ +}) + #ifdef CONFIG_PM /** * pm_generic_runtime_suspend - Generic runtime suspend callback for subsystems. @@ -19,12 +26,7 @@ */ int pm_generic_runtime_suspend(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int ret; - - ret = pm && pm->runtime_suspend ? pm->runtime_suspend(dev) : 0; - - return ret; + return CALL_PM_OP(dev, runtime_suspend); } EXPORT_SYMBOL_GPL(pm_generic_runtime_suspend); @@ -38,12 +40,7 @@ EXPORT_SYMBOL_GPL(pm_generic_runtime_suspend); */ int pm_generic_runtime_resume(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int ret; - - ret = pm && pm->runtime_resume ? pm->runtime_resume(dev) : 0; - - return ret; + return CALL_PM_OP(dev, runtime_resume); } EXPORT_SYMBOL_GPL(pm_generic_runtime_resume); #endif /* CONFIG_PM */ @@ -72,9 +69,7 @@ int pm_generic_prepare(struct device *dev) */ int pm_generic_suspend_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->suspend_noirq ? pm->suspend_noirq(dev) : 0; + return CALL_PM_OP(dev, suspend_noirq); } EXPORT_SYMBOL_GPL(pm_generic_suspend_noirq); @@ -84,9 +79,7 @@ EXPORT_SYMBOL_GPL(pm_generic_suspend_noirq); */ int pm_generic_suspend_late(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->suspend_late ? pm->suspend_late(dev) : 0; + return CALL_PM_OP(dev, suspend_late); } EXPORT_SYMBOL_GPL(pm_generic_suspend_late); @@ -96,9 +89,7 @@ EXPORT_SYMBOL_GPL(pm_generic_suspend_late); */ int pm_generic_suspend(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->suspend ? 
pm->suspend(dev) : 0; + return CALL_PM_OP(dev, suspend); } EXPORT_SYMBOL_GPL(pm_generic_suspend); @@ -108,9 +99,7 @@ EXPORT_SYMBOL_GPL(pm_generic_suspend); */ int pm_generic_freeze_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->freeze_noirq ? pm->freeze_noirq(dev) : 0; + return CALL_PM_OP(dev, freeze_noirq); } EXPORT_SYMBOL_GPL(pm_generic_freeze_noirq); @@ -120,9 +109,7 @@ EXPORT_SYMBOL_GPL(pm_generic_freeze_noirq); */ int pm_generic_freeze(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->freeze ? pm->freeze(dev) : 0; + return CALL_PM_OP(dev, freeze); } EXPORT_SYMBOL_GPL(pm_generic_freeze); @@ -132,9 +119,7 @@ EXPORT_SYMBOL_GPL(pm_generic_freeze); */ int pm_generic_poweroff_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->poweroff_noirq ? pm->poweroff_noirq(dev) : 0; + return CALL_PM_OP(dev, poweroff_noirq); } EXPORT_SYMBOL_GPL(pm_generic_poweroff_noirq); @@ -144,9 +129,7 @@ EXPORT_SYMBOL_GPL(pm_generic_poweroff_noirq); */ int pm_generic_poweroff_late(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->poweroff_late ? pm->poweroff_late(dev) : 0; + return CALL_PM_OP(dev, poweroff_late); } EXPORT_SYMBOL_GPL(pm_generic_poweroff_late); @@ -156,9 +139,7 @@ EXPORT_SYMBOL_GPL(pm_generic_poweroff_late); */ int pm_generic_poweroff(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->poweroff ? pm->poweroff(dev) : 0; + return CALL_PM_OP(dev, poweroff); } EXPORT_SYMBOL_GPL(pm_generic_poweroff); @@ -168,9 +149,7 @@ EXPORT_SYMBOL_GPL(pm_generic_poweroff); */ int pm_generic_thaw_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->thaw_noirq ? pm->thaw_noirq(dev) : 0; + return CALL_PM_OP(dev, thaw_noirq); } EXPORT_SYMBOL_GPL(pm_generic_thaw_noirq); @@ -180,9 +159,7 @@ EXPORT_SYMBOL_GPL(pm_generic_thaw_noirq); */ int pm_generic_thaw(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->thaw ? pm->thaw(dev) : 0; + return CALL_PM_OP(dev, thaw); } EXPORT_SYMBOL_GPL(pm_generic_thaw); @@ -192,9 +169,7 @@ EXPORT_SYMBOL_GPL(pm_generic_thaw); */ int pm_generic_resume_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->resume_noirq ? pm->resume_noirq(dev) : 0; + return CALL_PM_OP(dev, resume_noirq); } EXPORT_SYMBOL_GPL(pm_generic_resume_noirq); @@ -204,9 +179,7 @@ EXPORT_SYMBOL_GPL(pm_generic_resume_noirq); */ int pm_generic_resume_early(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->resume_early ? pm->resume_early(dev) : 0; + return CALL_PM_OP(dev, resume_early); } EXPORT_SYMBOL_GPL(pm_generic_resume_early); @@ -216,9 +189,7 @@ EXPORT_SYMBOL_GPL(pm_generic_resume_early); */ int pm_generic_resume(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->resume ? pm->resume(dev) : 0; + return CALL_PM_OP(dev, resume); } EXPORT_SYMBOL_GPL(pm_generic_resume); @@ -228,9 +199,7 @@ EXPORT_SYMBOL_GPL(pm_generic_resume); */ int pm_generic_restore_noirq(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->restore_noirq ? 
pm->restore_noirq(dev) : 0; + return CALL_PM_OP(dev, restore_noirq); } EXPORT_SYMBOL_GPL(pm_generic_restore_noirq); @@ -240,9 +209,7 @@ EXPORT_SYMBOL_GPL(pm_generic_restore_noirq); */ int pm_generic_restore_early(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->restore_early ? pm->restore_early(dev) : 0; + return CALL_PM_OP(dev, restore_early); } EXPORT_SYMBOL_GPL(pm_generic_restore_early); @@ -252,9 +219,7 @@ EXPORT_SYMBOL_GPL(pm_generic_restore_early); */ int pm_generic_restore(struct device *dev) { - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->restore ? pm->restore(dev) : 0; + return CALL_PM_OP(dev, restore); } EXPORT_SYMBOL_GPL(pm_generic_restore); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 1de1cd72b616..a0225a83f50c 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -34,6 +34,7 @@ #include <linux/cpufreq.h> #include <linux/devfreq.h> #include <linux/timer.h> +#include <linux/nmi.h> #include "../base.h" #include "power.h" @@ -95,6 +96,8 @@ static const char *pm_verb(int event) return "restore"; case PM_EVENT_RECOVER: return "recover"; + case PM_EVENT_POWEROFF: + return "poweroff"; default: return "(unknown PM event)"; } @@ -367,6 +370,7 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: @@ -401,6 +405,7 @@ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: @@ -435,6 +440,7 @@ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t stat case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: @@ -515,6 +521,11 @@ struct dpm_watchdog { #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd +static bool __read_mostly dpm_watchdog_all_cpu_backtrace; +module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644); +MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace, + "Backtrace all CPUs on DPM watchdog timeout"); + /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. @@ -530,8 +541,12 @@ static void dpm_watchdog_handler(struct timer_list *t) unsigned int time_left; if (wd->fatal) { + unsigned int this_cpu = smp_processor_id(); + dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); + if (dpm_watchdog_all_cpu_backtrace) + trigger_allbutcpu_cpu_backtrace(this_cpu); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 1b11a3cd4acc..62707738caa4 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -90,7 +90,7 @@ static void update_pm_runtime_accounting(struct device *dev) /* * Because ktime_get_mono_fast_ns() is not monotonic during * timekeeping updates, ensure that 'now' is after the last saved - * timesptamp. + * timestamp. 
*/ if (now < last) return; @@ -217,7 +217,7 @@ static int dev_memalloc_noio(struct device *dev, void *data) * resume/suspend callback of any one of its ancestors(or the * block device itself), the deadlock may be triggered inside the * memory allocation since it might not complete until the block - * device becomes active and the involed page I/O finishes. The + * device becomes active and the involved page I/O finishes. The * situation is pointed out first by Alan Stern. Network device * are involved in iSCSI kind of situation. * @@ -1210,7 +1210,7 @@ EXPORT_SYMBOL_GPL(__pm_runtime_resume); * * Otherwise, if its runtime PM status is %RPM_ACTIVE and (1) @ign_usage_count * is set, or (2) @dev is not ignoring children and its active child count is - * nonero, or (3) the runtime PM usage counter of @dev is not zero, increment + * nonzero, or (3) the runtime PM usage counter of @dev is not zero, increment * the usage counter of @dev and return 1. * * Otherwise, return 0 without changing the usage counter. @@ -1664,9 +1664,12 @@ EXPORT_SYMBOL_GPL(devm_pm_runtime_get_noresume); * pm_runtime_forbid - Block runtime PM of a device. * @dev: Device to handle. * - * Increase the device's usage count and clear its power.runtime_auto flag, - * so that it cannot be suspended at run time until pm_runtime_allow() is called - * for it. + * Resume @dev if already suspended and block runtime suspend of @dev in such + * a way that it can be unblocked via the /sys/devices/.../power/control + * interface, or otherwise by calling pm_runtime_allow(). + * + * Calling this function many times in a row has the same effect as calling it + * once. */ void pm_runtime_forbid(struct device *dev) { @@ -1687,7 +1690,13 @@ EXPORT_SYMBOL_GPL(pm_runtime_forbid); * pm_runtime_allow - Unblock runtime PM of a device. * @dev: Device to handle. * - * Decrease the device's usage count and set its power.runtime_auto flag. + * Unblock runtime suspend of @dev after it has been blocked by + * pm_runtime_forbid() (for instance, if it has been blocked via the + * /sys/devices/.../power/control interface), check if @dev can be + * suspended and suspend it in that case. + * + * Calling this function many times in a row has the same effect as calling it + * once. */ void pm_runtime_allow(struct device *dev) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index cd6e559648b2..d8da7195bb00 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -238,10 +238,8 @@ int show_trace_dev_match(char *buf, size_t size) unsigned int hash = hash_string(DEVSEED, dev_name(dev), DEVHASH); if (hash == value) { - int len = snprintf(buf, size, "%s\n", + int len = scnprintf(buf, size, "%s\n", dev_driver_string(dev)); - if (len > size) - len = size; buf += len; ret += len; size -= len; diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index d1283ff1080b..1e1a0e7eeac5 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -189,17 +189,16 @@ static void wakeup_source_remove(struct wakeup_source *ws) if (WARN_ON(!ws)) return; + /* + * After shutting down the timer, wakeup_source_activate() will warn if + * the given wakeup source is passed to it. 
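+ * timer_shutdown_sync() also clears timer.function, which is exactly the + * state that wakeup_source_not_usable() below checks for.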
+ */ + timer_shutdown_sync(&ws->timer); + raw_spin_lock_irqsave(&events_lock, flags); list_del_rcu(&ws->entry); raw_spin_unlock_irqrestore(&events_lock, flags); synchronize_srcu(&wakeup_srcu); - - timer_delete_sync(&ws->timer); - /* - * Clear timer.function to make wakeup_source_not_registered() treat - * this wakeup source as not registered. - */ - ws->timer.function = NULL; } /** @@ -506,14 +505,14 @@ int device_set_wakeup_enable(struct device *dev, bool enable) EXPORT_SYMBOL_GPL(device_set_wakeup_enable); /** - * wakeup_source_not_registered - validate the given wakeup source. + * wakeup_source_not_usable - validate the given wakeup source. * @ws: Wakeup source to be validated. */ -static bool wakeup_source_not_registered(struct wakeup_source *ws) +static bool wakeup_source_not_usable(struct wakeup_source *ws) { /* - * Use timer struct to check if the given source is initialized - * by wakeup_source_add. + * Use the timer struct to check if the given wakeup source has been + * initialized by wakeup_source_add() and it is not going away. */ return ws->timer.function != pm_wakeup_timer_fn; } @@ -558,8 +557,7 @@ static void wakeup_source_activate(struct wakeup_source *ws) { unsigned int cec; - if (WARN_ONCE(wakeup_source_not_registered(ws), - "unregistered wakeup source\n")) + if (WARN_ONCE(wakeup_source_not_usable(ws), "unusable wakeup source\n")) return; ws->active = true; diff --git a/drivers/char/hw_random/s390-trng.c b/drivers/char/hw_random/s390-trng.c index d27e32e9bfee..3024d5e9fd61 100644 --- a/drivers/char/hw_random/s390-trng.c +++ b/drivers/char/hw_random/s390-trng.c @@ -9,8 +9,7 @@ * Author(s): Harald Freudenberger <freude@de.ibm.com> */ -#define KMSG_COMPONENT "trng" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "trng: " fmt #include <linux/hw_random.h> #include <linux/kernel.h> diff --git a/drivers/char/random.c b/drivers/char/random.c index b8b24b6ed3fe..bab03c7c4194 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -259,8 +259,8 @@ static void crng_reseed(struct work_struct *work) u8 key[CHACHA_KEY_SIZE]; /* Immediately schedule the next reseeding, so that it fires sooner rather than later. */ - if (likely(system_unbound_wq)) - queue_delayed_work(system_unbound_wq, &next_reseed, crng_reseed_interval()); + if (likely(system_dfl_wq)) + queue_delayed_work(system_dfl_wq, &next_reseed, crng_reseed_interval()); extract_entropy(key, sizeof(key)); @@ -427,7 +427,7 @@ static void _get_random_bytes(void *buf, size_t len) /* * This returns random bytes in arbitrary quantities. The quality of the - * random bytes is good as /dev/urandom. In order to ensure that the + * random bytes is as good as /dev/urandom. In order to ensure that the * randomness provided by this function is okay, the function * wait_for_random_bytes() should be called and return 0 at least once * at any point prior. @@ -491,7 +491,7 @@ out_zero_chacha: /* * Batched entropy returns random integers. The quality of the random - * number is good as /dev/urandom. In order to ensure that the randomness + * number is as good as /dev/urandom. In order to ensure that the randomness * provided by this function is okay, the function wait_for_random_bytes() * should be called and return 0 at least once at any point prior. 
*/ @@ -636,7 +636,7 @@ enum { }; static struct { - struct blake2s_state hash; + struct blake2s_ctx hash; spinlock_t lock; unsigned int init_bits; } input_pool = { @@ -701,7 +701,7 @@ static void extract_entropy(void *buf, size_t len) /* next_key = HASHPRF(seed, RDSEED || 0) */ block.counter = 0; - blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed)); + blake2s(seed, sizeof(seed), (const u8 *)&block, sizeof(block), next_key, sizeof(next_key)); blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key)); spin_unlock_irqrestore(&input_pool.lock, flags); @@ -711,7 +711,7 @@ static void extract_entropy(void *buf, size_t len) i = min_t(size_t, len, BLAKE2S_HASH_SIZE); /* output = HASHPRF(seed, RDSEED || ++counter) */ ++block.counter; - blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed)); + blake2s(seed, sizeof(seed), (const u8 *)&block, sizeof(block), buf, i); len -= i; buf += i; } @@ -741,8 +741,8 @@ static void __cold _credit_init_bits(size_t bits) if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) { crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */ - if (static_key_initialized && system_unbound_wq) - queue_work(system_unbound_wq, &set_ready); + if (system_dfl_wq) + queue_work(system_dfl_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); #ifdef CONFIG_VDSO_GETRANDOM WRITE_ONCE(vdso_k_rng_data->is_ready, true); @@ -794,7 +794,7 @@ static void __cold _credit_init_bits(size_t bits) * * add_bootloader_randomness() is called by bootloader drivers, such as EFI * and device tree, and credits its input depending on whether or not the - * command line option 'random.trust_bootloader'. + * command line option 'random.trust_bootloader' is set. * * add_vmfork_randomness() adds a unique (but not necessarily secret) ID * representing the current instance of a VM to the pool, without crediting, @@ -915,9 +915,8 @@ void __init random_init(void) add_latent_entropy(); /* - * If we were initialized by the cpu or bootloader before jump labels - * or workqueues are initialized, then we should enable the static - * branch here, where it's guaranteed that these have been initialized. + * If we were initialized by the cpu or bootloader before workqueues + * are initialized, then we should enable the static branch here. */ if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY) crng_set_ready(NULL); @@ -1296,6 +1295,7 @@ static void __cold try_to_generate_entropy(void) struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES); unsigned int i, num_different = 0; unsigned long last = random_get_entropy(); + cpumask_var_t timer_cpus; int cpu = -1; for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) { @@ -1310,13 +1310,15 @@ static void __cold try_to_generate_entropy(void) atomic_set(&stack->samples, 0); timer_setup_on_stack(&stack->timer, entropy_timer, 0); + if (!alloc_cpumask_var(&timer_cpus, GFP_KERNEL)) + goto out; + while (!crng_ready() && !signal_pending(current)) { /* * Check !timer_pending() and then ensure that any previous callback has finished * executing by checking timer_delete_sync_try(), before queueing the next one. */ if (!timer_pending(&stack->timer) && timer_delete_sync_try(&stack->timer) >= 0) { - struct cpumask timer_cpus; unsigned int num_cpus; /* @@ -1326,19 +1328,19 @@ static void __cold try_to_generate_entropy(void) preempt_disable(); /* Only schedule callbacks on timer CPUs that are online. 
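* (housekeeping_cpumask(HK_TYPE_TIMER) already excludes CPUs isolated via * nohz_full, so isolated CPUs are never picked here.)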
*/ - cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask); - num_cpus = cpumask_weight(&timer_cpus); + cpumask_and(timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask); + num_cpus = cpumask_weight(timer_cpus); /* In very bizarre case of misconfiguration, fallback to all online. */ if (unlikely(num_cpus == 0)) { - timer_cpus = *cpu_online_mask; - num_cpus = cpumask_weight(&timer_cpus); + *timer_cpus = *cpu_online_mask; + num_cpus = cpumask_weight(timer_cpus); } /* Basic CPU round-robin, which avoids the current CPU. */ do { - cpu = cpumask_next(cpu, &timer_cpus); + cpu = cpumask_next(cpu, timer_cpus); if (cpu >= nr_cpu_ids) - cpu = cpumask_first(&timer_cpus); + cpu = cpumask_first(timer_cpus); } while (cpu == smp_processor_id() && num_cpus > 1); /* Expiring the timer at `jiffies` means it's the next tick. */ @@ -1354,6 +1356,8 @@ static void __cold try_to_generate_entropy(void) } mix_pool_bytes(&stack->entropy, sizeof(stack->entropy)); + free_cpumask_var(timer_cpus); +out: timer_delete_sync(&stack->timer); timer_destroy_on_stack(&stack->timer); } diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 083d8369a591..e73a66785d69 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -395,7 +395,7 @@ static unsigned int check_freqs(struct cpufreq_policy *policy, cur_freq = extract_freq(policy, get_cur_val(mask, data)); if (cur_freq == freq) return 1; - udelay(10); + usleep_range(10, 15); } return 0; } diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index b44f0f7a5ba1..c45bc98721d2 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -65,13 +65,13 @@ static const char * const amd_pstate_mode_string[] = { [AMD_PSTATE_PASSIVE] = "passive", [AMD_PSTATE_ACTIVE] = "active", [AMD_PSTATE_GUIDED] = "guided", - NULL, }; +static_assert(ARRAY_SIZE(amd_pstate_mode_string) == AMD_PSTATE_MAX); const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode) { - if (mode < 0 || mode >= AMD_PSTATE_MAX) - return NULL; + if (mode < AMD_PSTATE_UNDEFINED || mode >= AMD_PSTATE_MAX) + mode = AMD_PSTATE_UNDEFINED; return amd_pstate_mode_string[mode]; } EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string); @@ -110,6 +110,7 @@ enum energy_perf_value_index { EPP_INDEX_BALANCE_PERFORMANCE, EPP_INDEX_BALANCE_POWERSAVE, EPP_INDEX_POWERSAVE, + EPP_INDEX_MAX, }; static const char * const energy_perf_strings[] = { @@ -118,8 +119,8 @@ static const char * const energy_perf_strings[] = { [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", [EPP_INDEX_POWERSAVE] = "power", - NULL }; +static_assert(ARRAY_SIZE(energy_perf_strings) == EPP_INDEX_MAX); static unsigned int epp_values[] = { [EPP_INDEX_DEFAULT] = 0, @@ -127,7 +128,8 @@ static unsigned int epp_values[] = { [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, - }; +}; +static_assert(ARRAY_SIZE(epp_values) == EPP_INDEX_MAX); typedef int (*cppc_mode_transition_fn)(int); @@ -183,7 +185,7 @@ static inline int get_mode_idx_from_str(const char *str, size_t size) { int i; - for (i=0; i < AMD_PSTATE_MAX; i++) { + for (i = 0; i < AMD_PSTATE_MAX; i++) { if (!strncmp(str, amd_pstate_mode_string[i], size)) return i; } @@ -1137,16 +1139,15 @@ static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, static ssize_t 
show_energy_performance_available_preferences( struct cpufreq_policy *policy, char *buf) { - int i = 0; - int offset = 0; + int offset = 0, i; struct amd_cpudata *cpudata = policy->driver_data; if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) return sysfs_emit_at(buf, offset, "%s\n", energy_perf_strings[EPP_INDEX_PERFORMANCE]); - while (energy_perf_strings[i] != NULL) - offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); + for (i = 0; i < ARRAY_SIZE(energy_perf_strings); i++) + offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i]); offset += sysfs_emit_at(buf, offset, "\n"); @@ -1157,15 +1158,10 @@ static ssize_t store_energy_performance_preference( struct cpufreq_policy *policy, const char *buf, size_t count) { struct amd_cpudata *cpudata = policy->driver_data; - char str_preference[21]; ssize_t ret; u8 epp; - ret = sscanf(buf, "%20s", str_preference); - if (ret != 1) - return -EINVAL; - - ret = match_string(energy_perf_strings, -1, str_preference); + ret = sysfs_match_string(energy_perf_strings, buf); if (ret < 0) return -EINVAL; @@ -1282,7 +1278,7 @@ static int amd_pstate_change_mode_without_dvr_change(int mode) if (cpu_feature_enabled(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) return 0; - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { cppc_set_auto_sel(cpu, (cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1); } @@ -1353,9 +1349,8 @@ int amd_pstate_update_status(const char *buf, size_t size) return -EINVAL; mode_idx = get_mode_idx_from_str(buf, size); - - if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) - return -EINVAL; + if (mode_idx < 0) + return mode_idx; if (mode_state_machine[cppc_state][mode_idx]) { guard(mutex)(&amd_pstate_driver_lock); diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e23d9abea135..9eac77c4f294 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -142,16 +142,15 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) init_irq_work(&cppc_fi->irq_work, cppc_irq_work); ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs); - if (ret) { - pr_warn("%s: failed to read perf counters for cpu:%d: %d\n", - __func__, cpu, ret); - /* - * Don't abort if the CPU was offline while the driver - * was getting registered. - */ - if (cpu_online(cpu)) - return; + /* + * Don't abort as the CPU was offline while the driver was + * getting registered. 
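
[Editor's note] The sysfs_match_string() conversion above is what allows dropping the NULL sentinels from the string tables: the helper sizes the array with ARRAY_SIZE() and ignores the trailing newline that sysfs writes carry, returning the matched index or -EINVAL. A hedged sketch with hypothetical names:

#include <linux/build_bug.h>
#include <linux/string.h>

enum demo_mode { DEMO_OFF, DEMO_ON, DEMO_AUTO, DEMO_MODE_MAX };

static const char * const demo_mode_strings[] = {
        [DEMO_OFF]  = "off",
        [DEMO_ON]   = "on",
        [DEMO_AUTO] = "auto",
};
static_assert(ARRAY_SIZE(demo_mode_strings) == DEMO_MODE_MAX);

/* Returns DEMO_OFF/DEMO_ON/DEMO_AUTO, or -EINVAL for anything else. */
static int demo_parse_mode(const char *sysfs_buf)
{
        return sysfs_match_string(demo_mode_strings, sysfs_buf);
}
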
+ */ + if (ret && cpu_online(cpu)) { + pr_debug("%s: failed to read perf counters for cpu:%d: %d\n", + __func__, cpu, ret); + return; } } diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index cd1816a12bb9..dc11b62399ad 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -87,6 +87,7 @@ static const struct of_device_id allowlist[] __initconst = { { .compatible = "st-ericsson,u9540", }, { .compatible = "starfive,jh7110", }, + { .compatible = "starfive,jh7110s", }, { .compatible = "ti,omap2", }, { .compatible = "ti,omap4", }, diff --git a/drivers/cpufreq/cpufreq-nforce2.c b/drivers/cpufreq/cpufreq-nforce2.c index fedad1081973..fbbbe501cf2d 100644 --- a/drivers/cpufreq/cpufreq-nforce2.c +++ b/drivers/cpufreq/cpufreq-nforce2.c @@ -145,6 +145,8 @@ static unsigned int nforce2_fsb_read(int bootfsb) pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); fsb /= 1000000; + pci_dev_put(nforce2_sub5); + /* Check if PLL register is already set */ pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); @@ -426,6 +428,7 @@ static int __init nforce2_init(void) static void __exit nforce2_exit(void) { cpufreq_unregister_driver(&nforce2_driver); + pci_dev_put(nforce2_dev); } module_init(nforce2_init); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 852e024facc3..4472bb1ec83c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1421,9 +1421,12 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, * If there is a problem with its frequency table, take it * offline and drop it. */ - ret = cpufreq_table_validate_and_sort(policy); - if (ret) - goto out_offline_policy; + if (policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_ASCENDING && + policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_DESCENDING) { + ret = cpufreq_table_validate_and_sort(policy); + if (ret) + goto out_offline_policy; + } /* related_cpus should at least include policy->cpus. 
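
[Editor's note] The nforce2 fixes above follow the standard PCI refcount rule: every successful pci_get_*() lookup takes a reference that must be dropped with pci_dev_put() once the device pointer is no longer needed, including at module exit as in nforce2_exit(). An illustrative sketch, not the driver's exact code:

#include <linux/pci.h>

static u32 demo_read_reg(unsigned int vendor, unsigned int device, int off)
{
        struct pci_dev *pdev = pci_get_device(vendor, device, NULL);
        u32 val = 0;

        if (!pdev)
                return 0;

        pci_read_config_dword(pdev, off, &val);
        pci_dev_put(pdev);      /* balance the reference from pci_get_device() */

        return val;
}
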
*/ cpumask_copy(policy->related_cpus, policy->cpus); @@ -2550,7 +2553,7 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor) for_each_inactive_policy(policy) { if (!strcmp(policy->last_governor, governor->name)) { policy->governor = NULL; - strcpy(policy->last_governor, "\0"); + policy->last_governor[0] = '\0'; } } read_unlock_irqrestore(&cpufreq_driver_lock, flags); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 492a10f1bdbf..ec4abe374573 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -575,13 +575,18 @@ static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu) int scaling = cpu->pstate.scaling; int freq; - pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys); - pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo); - pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling); + pr_debug("CPU%d: PERF_CTL max_phys = %d\n", cpu->cpu, perf_ctl_max_phys); + pr_debug("CPU%d: PERF_CTL turbo = %d\n", cpu->cpu, perf_ctl_turbo); + pr_debug("CPU%d: PERF_CTL scaling = %d\n", cpu->cpu, perf_ctl_scaling); pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate); pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate); pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling); + if (scaling == perf_ctl_scaling) + return; + + hwp_is_hybrid = true; + cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_pstate * scaling, perf_ctl_scaling); cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling, @@ -909,6 +914,11 @@ static struct freq_attr *hwp_cpufreq_attrs[] = { [HWP_CPUFREQ_ATTR_COUNT] = NULL, }; +static u8 hybrid_get_cpu_type(unsigned int cpu) +{ + return cpu_data(cpu).topo.intel_type; +} + static bool no_cas __ro_after_init; static struct cpudata *hybrid_max_perf_cpu __read_mostly; @@ -925,11 +935,8 @@ static int hybrid_active_power(struct device *dev, unsigned long *power, unsigned long *freq) { /* - * Create "utilization bins" of 0-40%, 40%-60%, 60%-80%, and 80%-100% - * of the maximum capacity such that two CPUs of the same type will be - * regarded as equally attractive if the utilization of each of them - * falls into the same bin, which should prevent tasks from being - * migrated between them too often. + * Create four "states" corresponding to 40%, 60%, 80%, and 100% of the + * full capacity. * * For this purpose, return the "frequency" of 2 for the first * performance level and otherwise leave the value set by the caller. @@ -943,38 +950,40 @@ static int hybrid_active_power(struct device *dev, unsigned long *power, return 0; } +static bool hybrid_has_l3(unsigned int cpu) +{ + struct cpu_cacheinfo *cacheinfo = get_cpu_cacheinfo(cpu); + unsigned int i; + + if (!cacheinfo) + return false; + + for (i = 0; i < cacheinfo->num_leaves; i++) { + if (cacheinfo->info_list[i].level == 3) + return true; + } + + return false; +} + static int hybrid_get_cost(struct device *dev, unsigned long freq, unsigned long *cost) { - struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate; - struct cpu_cacheinfo *cacheinfo = get_cpu_cacheinfo(dev->id); - + /* Facilitate load balancing between CPUs of the same type. */ + *cost = freq; /* - * The smaller the perf-to-frequency scaling factor, the larger the IPC - * ratio between the given CPU and the least capable CPU in the system. 
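
[Editor's note] The replacement cost function that follows keeps the energy-model ordering intuition in one place: within a CPU type the cost tracks frequency, and a small constant bias orders the types. A worked illustration with hypothetical numbers (the real code derives the type from hybrid_get_cpu_type() and hybrid_has_l3()):

#include <linux/types.h>

static unsigned long demo_hybrid_cost(unsigned long freq, bool is_atom, bool has_l3)
{
        unsigned long cost = freq;      /* same-type CPUs balance by frequency */

        if (is_atom) {
                if (has_l3)
                        cost += 1;      /* E-core */
                /* Atom without L3 is an LPE-core: no bias, fills up first */
        } else {
                cost += 2;              /* P-core, populated last */
        }
        return cost;
}
/* At freq == 10: LPE-core = 10, E-core = 11, P-core = 12. */
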
- * Regard that IPC ratio as the primary cost component and assume that - * the scaling factors for different CPU types will differ by at least - * 5% and they will not be above INTEL_PSTATE_CORE_SCALING. + * Adjust the cost depending on CPU type. * - * Add the freq value to the cost, so that the cost of running on CPUs - * of the same type in different "utilization bins" is different. - */ - *cost = div_u64(100ULL * INTEL_PSTATE_CORE_SCALING, pstate->scaling) + freq; - /* - * Increase the cost slightly for CPUs able to access L3 to avoid - * touching it in case some other CPUs of the same type can do the work - * without it. + * The idea is to start loading up LPE-cores before E-cores and start + * to populate E-cores when LPE-cores are utilized above 60% of the + * capacity. Similarly, P-cores start to be populated when E-cores are + * utilized above 60% of the capacity. */ - if (cacheinfo) { - unsigned int i; - - /* Check if L3 cache is there. */ - for (i = 0; i < cacheinfo->num_leaves; i++) { - if (cacheinfo->info_list[i].level == 3) { - *cost += 2; - break; - } - } + if (hybrid_get_cpu_type(dev->id) == INTEL_CPU_TYPE_ATOM) { + if (hybrid_has_l3(dev->id)) /* E-core */ + *cost += 1; + } else { /* P-core */ + *cost += 2; } return 0; @@ -1037,9 +1046,9 @@ static void hybrid_set_cpu_capacity(struct cpudata *cpu) topology_set_cpu_scale(cpu->cpu, arch_scale_cpu_capacity(cpu->cpu)); - pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu, - cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf, - cpu->pstate.max_pstate_physical); + pr_debug("CPU%d: capacity perf = %u, base perf = %u, sys max perf = %u\n", + cpu->cpu, cpu->capacity_perf, cpu->pstate.max_pstate_physical, + hybrid_max_perf_cpu->capacity_perf); } static void hybrid_clear_cpu_capacity(unsigned int cpunum) @@ -1384,7 +1393,8 @@ static void set_power_ctl_ee_state(bool input) { u64 power_ctl; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); + rdmsrq(MSR_IA32_POWER_CTL, power_ctl); if (input) { power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE); @@ -1394,7 +1404,6 @@ static void set_power_ctl_ee_state(bool input) power_ctl_ee_state = POWER_CTL_EE_DISABLE; } wrmsrq(MSR_IA32_POWER_CTL, power_ctl); - mutex_unlock(&intel_pstate_driver_lock); } static void intel_pstate_hwp_enable(struct cpudata *cpudata); @@ -1516,13 +1525,9 @@ static int intel_pstate_update_status(const char *buf, size_t size); static ssize_t show_status(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - ssize_t ret; - - mutex_lock(&intel_pstate_driver_lock); - ret = intel_pstate_show_status(buf); - mutex_unlock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - return ret; + return intel_pstate_show_status(buf); } static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, @@ -1531,11 +1536,13 @@ static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, char *p = memchr(buf, '\n', count); int ret; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); + ret = intel_pstate_update_status(buf, p ? p - buf : count); - mutex_unlock(&intel_pstate_driver_lock); + if (ret < 0) + return ret; - return ret < 0 ? 
ret : count; + return count; } static ssize_t show_turbo_pct(struct kobject *kobj, @@ -1545,12 +1552,10 @@ static ssize_t show_turbo_pct(struct kobject *kobj, int total, no_turbo, turbo_pct; uint32_t turbo_fp; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } cpu = all_cpu_data[0]; @@ -1559,8 +1564,6 @@ static ssize_t show_turbo_pct(struct kobject *kobj, turbo_fp = div_fp(no_turbo, total); turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); - mutex_unlock(&intel_pstate_driver_lock); - return sprintf(buf, "%u\n", turbo_pct); } @@ -1570,38 +1573,26 @@ static ssize_t show_num_pstates(struct kobject *kobj, struct cpudata *cpu; int total; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } cpu = all_cpu_data[0]; total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; - mutex_unlock(&intel_pstate_driver_lock); - return sprintf(buf, "%u\n", total); } static ssize_t show_no_turbo(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - ssize_t ret; + guard(mutex)(&intel_pstate_driver_lock); - mutex_lock(&intel_pstate_driver_lock); - - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } - - ret = sprintf(buf, "%u\n", global.no_turbo); - - mutex_unlock(&intel_pstate_driver_lock); - return ret; + return sprintf(buf, "%u\n", global.no_turbo); } static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, @@ -1613,29 +1604,25 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, if (sscanf(buf, "%u", &input) != 1) return -EINVAL; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - count = -EAGAIN; - goto unlock_driver; - } + if (!intel_pstate_driver) + return -EAGAIN; no_turbo = !!clamp_t(int, input, 0, 1); WRITE_ONCE(global.turbo_disabled, turbo_is_disabled()); if (global.turbo_disabled && !no_turbo) { pr_notice("Turbo disabled by BIOS or unavailable on processor\n"); - count = -EPERM; if (global.no_turbo) - goto unlock_driver; - else - no_turbo = 1; - } + return -EPERM; - if (no_turbo == global.no_turbo) { - goto unlock_driver; + no_turbo = 1; } + if (no_turbo == global.no_turbo) + return count; + WRITE_ONCE(global.no_turbo, no_turbo); mutex_lock(&intel_pstate_limits_lock); @@ -1654,9 +1641,6 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, intel_pstate_update_limits_for_all(); arch_set_max_freq_ratio(no_turbo); -unlock_driver: - mutex_unlock(&intel_pstate_driver_lock); - return count; } @@ -1706,12 +1690,10 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b, if (ret != 1) return -EINVAL; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } mutex_lock(&intel_pstate_limits_lock); @@ -1724,8 +1706,6 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b, else update_qos_requests(FREQ_QOS_MAX); - mutex_unlock(&intel_pstate_driver_lock); - return count; } @@ -1739,12 +1719,10 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b, if (ret != 1) return 
-EINVAL; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); - if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); + if (!intel_pstate_driver) return -EAGAIN; - } mutex_lock(&intel_pstate_limits_lock); @@ -1758,8 +1736,6 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b, else update_qos_requests(FREQ_QOS_MIN); - mutex_unlock(&intel_pstate_driver_lock); - return count; } @@ -1780,10 +1756,10 @@ static ssize_t store_hwp_dynamic_boost(struct kobject *a, if (ret) return ret; - mutex_lock(&intel_pstate_driver_lock); + guard(mutex)(&intel_pstate_driver_lock); + hwp_boost = !!input; intel_pstate_update_policies(); - mutex_unlock(&intel_pstate_driver_lock); return count; } @@ -2072,6 +2048,18 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata) intel_pstate_update_epp_defaults(cpudata); } +static u64 get_perf_ctl_val(int pstate) +{ + u64 val; + + val = (u64)pstate << 8; + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && + cpu_feature_enabled(X86_FEATURE_IDA)) + val |= (u64)1 << 32; + + return val; +} + static int atom_get_min_pstate(int not_used) { u64 value; @@ -2098,15 +2086,10 @@ static int atom_get_turbo_pstate(int not_used) static u64 atom_get_val(struct cpudata *cpudata, int pstate) { - u64 val; + u64 val = get_perf_ctl_val(pstate); int32_t vid_fp; u32 vid; - val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && - cpu_feature_enabled(X86_FEATURE_IDA)) - val |= (u64)1 << 32; - vid_fp = cpudata->vid.min + mul_fp( int_tofp(pstate - cpudata->pstate.min_pstate), cpudata->vid.ratio); @@ -2266,14 +2249,7 @@ static int core_get_turbo_pstate(int cpu) static u64 core_get_val(struct cpudata *cpudata, int pstate) { - u64 val; - - val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && - cpu_feature_enabled(X86_FEATURE_IDA)) - val |= (u64)1 << 32; - - return val; + return get_perf_ctl_val(pstate); } static int knl_get_aperf_mperf_shift(void) @@ -2297,18 +2273,14 @@ static int knl_get_turbo_pstate(int cpu) static int hwp_get_cpu_scaling(int cpu) { if (hybrid_scaling_factor) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - u8 cpu_type = c->topo.intel_type; - /* * Return the hybrid scaling factor for P-cores and use the * default core scaling for E-cores. */ - if (cpu_type == INTEL_CPU_TYPE_CORE) + if (hybrid_get_cpu_type(cpu) == INTEL_CPU_TYPE_CORE) return hybrid_scaling_factor; - if (cpu_type == INTEL_CPU_TYPE_ATOM) - return core_get_scaling(); + return core_get_scaling(); } /* Use core scaling on non-hybrid systems. 
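
[Editor's note] The mutex handling rewrites in these sysfs handlers all use the same guard(mutex)() pattern from <linux/cleanup.h>: the lock is released automatically on every return path, so the goto-based unlock labels become plain returns. A minimal sketch (names hypothetical):

#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/sprintf.h>
#include <linux/types.h>

static DEFINE_MUTEX(demo_lock);
static int demo_value;

static ssize_t demo_show(char *buf)
{
        guard(mutex)(&demo_lock);       /* dropped at any return below */

        if (!demo_value)
                return -EAGAIN;

        return sprintf(buf, "%d\n", demo_value);
}

For block-scoped locking, scoped_guard(mutex, &demo_lock) { ... } bounds the critical section explicitly, as the driver registration hunk further down does.
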
*/ @@ -2343,11 +2315,10 @@ static void intel_pstate_set_min_pstate(struct cpudata *cpu) static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) { - int perf_ctl_max_phys = pstate_funcs.get_max_physical(cpu->cpu); int perf_ctl_scaling = pstate_funcs.get_scaling(); + cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(cpu->cpu); cpu->pstate.min_pstate = pstate_funcs.get_min(cpu->cpu); - cpu->pstate.max_pstate_physical = perf_ctl_max_phys; cpu->pstate.perf_ctl_scaling = perf_ctl_scaling; if (hwp_active && !hwp_mode_bdw) { @@ -2355,10 +2326,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) if (pstate_funcs.get_cpu_scaling) { cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu); - if (cpu->pstate.scaling != perf_ctl_scaling) { - intel_pstate_hybrid_hwp_adjust(cpu); - hwp_is_hybrid = true; - } + intel_pstate_hybrid_hwp_adjust(cpu); } else { cpu->pstate.scaling = perf_ctl_scaling; } @@ -2760,6 +2728,7 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { X86_MATCH(INTEL_ATOM_CRESTMONT, core_funcs), X86_MATCH(INTEL_ATOM_CRESTMONT_X, core_funcs), X86_MATCH(INTEL_ATOM_DARKMONT_X, core_funcs), + X86_MATCH(INTEL_DIAMONDRAPIDS_X, core_funcs), {} }; #endif @@ -3912,9 +3881,9 @@ hwp_cpu_matched: } - mutex_lock(&intel_pstate_driver_lock); - rc = intel_pstate_register_driver(default_driver); - mutex_unlock(&intel_pstate_driver_lock); + scoped_guard(mutex, &intel_pstate_driver_lock) { + rc = intel_pstate_register_driver(default_driver); + } if (rc) { intel_pstate_sysfs_remove(); return rc; diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c index 765a5bb81829..81e16b5a0245 100644 --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c @@ -256,13 +256,22 @@ len_error: return ret; } +static const struct of_device_id qcom_cpufreq_ipq806x_match_list[] __maybe_unused = { + { .compatible = "qcom,ipq8062", .data = (const void *)QCOM_ID_IPQ8062 }, + { .compatible = "qcom,ipq8064", .data = (const void *)QCOM_ID_IPQ8064 }, + { .compatible = "qcom,ipq8065", .data = (const void *)QCOM_ID_IPQ8065 }, + { .compatible = "qcom,ipq8066", .data = (const void *)QCOM_ID_IPQ8066 }, + { .compatible = "qcom,ipq8068", .data = (const void *)QCOM_ID_IPQ8068 }, + { .compatible = "qcom,ipq8069", .data = (const void *)QCOM_ID_IPQ8069 }, +}; + static int qcom_cpufreq_ipq8064_name_version(struct device *cpu_dev, struct nvmem_cell *speedbin_nvmem, char **pvs_name, struct qcom_cpufreq_drv *drv) { + int msm_id = -1, ret = 0; int speed = 0, pvs = 0; - int msm_id, ret = 0; u8 *speedbin; size_t len; @@ -279,8 +288,30 @@ static int qcom_cpufreq_ipq8064_name_version(struct device *cpu_dev, get_krait_bin_format_a(cpu_dev, &speed, &pvs, speedbin); ret = qcom_smem_get_soc_id(&msm_id); - if (ret) + if (ret == -ENODEV) { + const struct of_device_id *match; + struct device_node *root; + + root = of_find_node_by_path("/"); + if (!root) { + ret = -ENODEV; + goto exit; + } + + /* Fallback to compatible match with no SMEM initialized */ + match = of_match_node(qcom_cpufreq_ipq806x_match_list, root); + of_node_put(root); + if (!match) { + ret = -ENODEV; + goto exit; + } + + /* We found a matching device, get the msm_id from the data entry */ + msm_id = (int)(uintptr_t)match->data; + ret = 0; + } else if (ret) { goto exit; + } switch (msm_id) { case QCOM_ID_IPQ8062: diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c index 4215621deb3f..ba8a1c96427a 100644 --- a/drivers/cpufreq/s5pv210-cpufreq.c 
+++ b/drivers/cpufreq/s5pv210-cpufreq.c @@ -518,7 +518,7 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy) if (policy->cpu != 0) { ret = -EINVAL; - goto out_dmc1; + goto out; } /* @@ -530,7 +530,7 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy) if ((mem_type != LPDDR) && (mem_type != LPDDR2)) { pr_err("CPUFreq doesn't support this memory type\n"); ret = -EINVAL; - goto out_dmc1; + goto out; } /* Find current refresh counter and frequency each DMC */ @@ -544,6 +544,8 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy) cpufreq_generic_init(policy, s5pv210_freq_table, 40000); return 0; +out: + clk_put(dmc1_clk); out_dmc1: clk_put(dmc0_clk); out_dmc0: diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index 136ab102f636..34ed943c5f34 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> +#include <linux/units.h> #include <soc/tegra/bpmp.h> #include <soc/tegra/bpmp-abi.h> @@ -58,7 +59,7 @@ static const struct tegra186_cpufreq_cpu tegra186_cpus[] = { }; struct tegra186_cpufreq_cluster { - struct cpufreq_frequency_table *table; + struct cpufreq_frequency_table *bpmp_lut; u32 ref_clk_khz; u32 div; }; @@ -66,16 +67,119 @@ struct tegra186_cpufreq_cluster { struct tegra186_cpufreq_data { void __iomem *regs; const struct tegra186_cpufreq_cpu *cpus; + bool icc_dram_bw_scaling; struct tegra186_cpufreq_cluster clusters[]; }; +static int tegra_cpufreq_set_bw(struct cpufreq_policy *policy, unsigned long freq_khz) +{ + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); + struct device *dev; + int ret; + + dev = get_cpu_device(policy->cpu); + if (!dev) + return -ENODEV; + + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_exact(dev, freq_khz * HZ_PER_KHZ, true); + if (IS_ERR(opp)) + return PTR_ERR(opp); + + ret = dev_pm_opp_set_opp(dev, opp); + if (ret) + data->icc_dram_bw_scaling = false; + + return ret; +} + +static int tegra_cpufreq_init_cpufreq_table(struct cpufreq_policy *policy, + struct cpufreq_frequency_table *bpmp_lut, + struct cpufreq_frequency_table **opp_table) +{ + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); + struct cpufreq_frequency_table *freq_table = NULL; + struct cpufreq_frequency_table *pos; + struct device *cpu_dev; + unsigned long rate; + int ret, max_opps; + int j = 0; + + cpu_dev = get_cpu_device(policy->cpu); + if (!cpu_dev) { + pr_err("%s: failed to get cpu%d device\n", __func__, policy->cpu); + return -ENODEV; + } + + /* Initialize OPP table mentioned in operating-points-v2 property in DT */ + ret = dev_pm_opp_of_add_table_indexed(cpu_dev, 0); + if (ret) { + dev_err(cpu_dev, "Invalid or empty opp table in device tree\n"); + data->icc_dram_bw_scaling = false; + return ret; + } + + max_opps = dev_pm_opp_get_opp_count(cpu_dev); + if (max_opps <= 0) { + dev_err(cpu_dev, "Failed to add OPPs\n"); + return max_opps; + } + + /* Disable all opps and cross-validate against LUT later */ + for (rate = 0; ; rate++) { + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(cpu_dev, &rate); + if (IS_ERR(opp)) + break; + + dev_pm_opp_disable(cpu_dev, rate); + } + + freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL); + if (!freq_table) + return -ENOMEM; + + /* + * Cross check the frequencies from BPMP-FW LUT against the OPP's present in DT. + * Enable only those DT OPP's which are present in LUT also. 
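
[Editor's note] tegra_cpufreq_set_bw() above leans on the scope-based cleanup helpers: tagging the OPP pointer with __free(put_opp) drops the reference automatically when it goes out of scope, and the cleanup hook skips error pointers, which is why the declaration carries its initializer. The OPP core hunks near the end of this series fold assignments into declarations for the same reason. A minimal sketch under those assumptions:

#include <linux/cleanup.h>
#include <linux/err.h>
#include <linux/pm_opp.h>

static int demo_set_exact_opp(struct device *dev, unsigned long hz)
{
        struct dev_pm_opp *opp __free(put_opp) =
                dev_pm_opp_find_freq_exact(dev, hz, true);

        if (IS_ERR(opp))
                return PTR_ERR(opp);    /* cleanup hook ignores error pointers */

        return dev_pm_opp_set_opp(dev, opp);    /* reference dropped on return */
}
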
+ */ + cpufreq_for_each_valid_entry(pos, bpmp_lut) { + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_exact(cpu_dev, pos->frequency * HZ_PER_KHZ, false); + if (IS_ERR(opp)) + continue; + + ret = dev_pm_opp_enable(cpu_dev, pos->frequency * HZ_PER_KHZ); + if (ret < 0) + return ret; + + freq_table[j].driver_data = pos->driver_data; + freq_table[j].frequency = pos->frequency; + j++; + } + + freq_table[j].driver_data = pos->driver_data; + freq_table[j].frequency = CPUFREQ_TABLE_END; + + *opp_table = &freq_table[0]; + + dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus); + + /* Prime interconnect data */ + tegra_cpufreq_set_bw(policy, freq_table[j - 1].frequency); + + return ret; +} + static int tegra186_cpufreq_init(struct cpufreq_policy *policy) { struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id; + struct cpufreq_frequency_table *freq_table; + struct cpufreq_frequency_table *bpmp_lut; u32 cpu; + int ret; - policy->freq_table = data->clusters[cluster].table; policy->cpuinfo.transition_latency = 300 * 1000; policy->driver_data = NULL; @@ -85,6 +189,20 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy) cpumask_set_cpu(cpu, policy->cpus); } + bpmp_lut = data->clusters[cluster].bpmp_lut; + + if (data->icc_dram_bw_scaling) { + ret = tegra_cpufreq_init_cpufreq_table(policy, bpmp_lut, &freq_table); + if (!ret) { + policy->freq_table = freq_table; + return 0; + } + } + + data->icc_dram_bw_scaling = false; + policy->freq_table = bpmp_lut; + pr_info("OPP tables missing from DT, EMC frequency scaling disabled\n"); + return 0; } @@ -102,6 +220,10 @@ static int tegra186_cpufreq_set_target(struct cpufreq_policy *policy, writel(edvd_val, data->regs + edvd_offset); } + if (data->icc_dram_bw_scaling) + tegra_cpufreq_set_bw(policy, tbl->frequency); + + return 0; } @@ -134,7 +256,7 @@ static struct cpufreq_driver tegra186_cpufreq_driver = { .init = tegra186_cpufreq_init, }; -static struct cpufreq_frequency_table *init_vhint_table( +static struct cpufreq_frequency_table *tegra_cpufreq_bpmp_read_lut( struct platform_device *pdev, struct tegra_bpmp *bpmp, struct tegra186_cpufreq_cluster *cluster, unsigned int cluster_id, int *num_rates) @@ -229,6 +351,7 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) { struct tegra186_cpufreq_data *data; struct tegra_bpmp *bpmp; + struct device *cpu_dev; unsigned int i = 0, err, edvd_offset; int num_rates = 0; u32 edvd_val, cpu; @@ -254,9 +377,9 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) for (i = 0; i < TEGRA186_NUM_CLUSTERS; i++) { struct tegra186_cpufreq_cluster *cluster = &data->clusters[i]; - cluster->table = init_vhint_table(pdev, bpmp, cluster, i, &num_rates); - if (IS_ERR(cluster->table)) { - err = PTR_ERR(cluster->table); + cluster->bpmp_lut = tegra_cpufreq_bpmp_read_lut(pdev, bpmp, cluster, i, &num_rates); + if (IS_ERR(cluster->bpmp_lut)) { + err = PTR_ERR(cluster->bpmp_lut); goto put_bpmp; } else if (!num_rates) { err = -EINVAL; @@ -265,7 +388,7 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) { if (data->cpus[cpu].bpmp_cluster_id == i) { - edvd_val = cluster->table[num_rates - 1].driver_data; + edvd_val = cluster->bpmp_lut[num_rates - 1].driver_data; edvd_offset = data->cpus[cpu].edvd_offset; writel(edvd_val, data->regs + edvd_offset); } @@ -274,6 +397,19 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) 
tegra186_cpufreq_driver.driver_data = data; + /* Check for optional OPPv2 and interconnect paths on CPU0 to enable ICC scaling */ + cpu_dev = get_cpu_device(0); + if (!cpu_dev) { + err = -EPROBE_DEFER; + goto put_bpmp; + } + + if (dev_pm_opp_of_get_opp_desc_node(cpu_dev)) { + err = dev_pm_opp_of_find_icc_paths(cpu_dev, NULL); + if (!err) + data->icc_dram_bw_scaling = true; + } + err = cpufreq_register_driver(&tegra186_cpufreq_driver); put_bpmp: diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c index 9b4f516f313e..695599e1001f 100644 --- a/drivers/cpufreq/tegra194-cpufreq.c +++ b/drivers/cpufreq/tegra194-cpufreq.c @@ -750,7 +750,8 @@ static int tegra194_cpufreq_probe(struct platform_device *pdev) if (IS_ERR(bpmp)) return PTR_ERR(bpmp); - read_counters_wq = alloc_workqueue("read_counters_wq", __WQ_LEGACY, 1); + read_counters_wq = alloc_workqueue("read_counters_wq", + __WQ_LEGACY | WQ_PERCPU, 1); if (!read_counters_wq) { dev_err(&pdev->dev, "fail to create_workqueue\n"); err = -EINVAL; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 56132e843c99..c7876e9e024f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -184,20 +184,22 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv, * cpuidle_enter_s2idle - Enter an idle state suitable for suspend-to-idle. * @drv: cpuidle driver for the given CPU. * @dev: cpuidle device for the given CPU. + * @latency_limit_ns: Idle state exit latency limit * * If there are states with the ->enter_s2idle callback, find the deepest of * them and enter it with frozen tick. */ -int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) +int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev, + u64 latency_limit_ns) { int index; /* - * Find the deepest state with ->enter_s2idle present, which guarantees - * that interrupts won't be enabled when it exits and allows the tick to - * be frozen safely. + * Find the deepest state with ->enter_s2idle present that meets the + * specified latency limit, which guarantees that interrupts won't be + * enabled when it exits and allows the tick to be frozen safely. */ - index = find_deepest_state(drv, dev, U64_MAX, 0, true); + index = find_deepest_state(drv, dev, latency_limit_ns, 0, true); if (index > 0) { enter_s2idle_proper(drv, dev, index); local_irq_enable(); diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 9bbfa594c442..370664c47e65 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -8,6 +8,8 @@ * This code is licenced under the GPL. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mutex.h> #include <linux/module.h> #include <linux/sched.h> @@ -193,6 +195,14 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) s->exit_latency_ns = 0; else s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC); + + /* + * Warn if the exit latency of a CPU idle state exceeds its + * target residency which is assumed to never happen in cpuidle + * in multiple places. 
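
[Editor's note] The check that follows this comment encodes an invariant cpuidle relies on elsewhere: a state whose exit latency exceeds its target residency could never amortize its own entry/exit cost. An illustrative standalone form of the same test:

#include <linux/types.h>

static bool demo_idle_state_sane(u64 exit_latency_ns, u64 target_residency_ns)
{
        /* e.g. a 200us exit latency with a 100us target residency is bogus */
        return exit_latency_ns <= target_residency_ns;
}
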
+ */ + if (s->exit_latency_ns > s->target_residency_ns) + pr_warn("Idle state %d target residency too low\n", i); } } diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index 0d0f9751ff8f..5d0e7f78c6c5 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -111,6 +111,10 @@ s64 cpuidle_governor_latency_req(unsigned int cpu) struct device *device = get_cpu_device(cpu); int device_req = dev_pm_qos_raw_resume_latency(device); int global_req = cpu_latency_qos_limit(); + int global_wake_req = cpu_wakeup_latency_qos_limit(); + + if (global_req > global_wake_req) + global_req = global_wake_req; if (device_req > global_req) device_req = global_req; diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 23239b0c04f9..64d6f7a1c776 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -317,12 +317,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* - * Use a physical idle state, not busy polling, unless a timer - * is going to trigger soon enough or the exit latency of the - * idle state in question is greater than the predicted idle - * duration. + * Use a physical idle state instead of busy polling so long as + * its target residency is below the residency threshold, its + * exit latency is not greater than the predicted idle duration, + * and the next timer doesn't expire soon. */ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && + s->target_residency_ns < RESIDENCY_THRESHOLD_NS && s->target_residency_ns <= data->next_timer_ns && s->exit_latency_ns <= predicted_ns) { predicted_ns = s->target_residency_ns; diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bfa55c1eab5b..81ac5fd58a1c 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -76,7 +76,7 @@ * likely woken up by a non-timer wakeup source). * * 2. If the second sum computed in step 1 is greater than a half of the sum of - * both metrics for the candidate state bin and all subsequent bins(if any), + * both metrics for the candidate state bin and all subsequent bins (if any), * a shallower idle state is likely to be more suitable, so look for it. * * - Traverse the enabled idle states shallower than the candidate one in the @@ -133,21 +133,33 @@ struct teo_bin { * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. * @total: Grand total of the "intercepts" and "hits" metrics for all bins. + * @total_tick: Wakeups by the scheduler tick. * @tick_intercepts: "Intercepts" before TICK_NSEC. * @short_idles: Wakeups after short idle periods. - * @artificial_wakeup: Set if the wakeup has been triggered by a safety net. + * @tick_wakeup: Set if the last wakeup was by the scheduler tick. */ struct teo_cpu { s64 sleep_length_ns; struct teo_bin state_bins[CPUIDLE_STATE_MAX]; unsigned int total; + unsigned int total_tick; unsigned int tick_intercepts; unsigned int short_idles; - bool artificial_wakeup; + bool tick_wakeup; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); +static void teo_decay(unsigned int *metric) +{ + unsigned int delta = *metric >> DECAY_SHIFT; + + if (delta) + *metric -= delta; + else + *metric = 0; +} + /** * teo_update - Update CPU metrics after wakeup. * @drv: cpuidle driver containing state data. 
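
[Editor's note] teo_decay(), introduced above, applies an exponential decay of 1/2^DECAY_SHIFT per update and snaps residues too small to decay down to zero, so stale metrics cannot linger indefinitely. Worked numbers, assuming DECAY_SHIFT is 3 as in the current teo:

static void demo_decay(unsigned int *metric)
{
        unsigned int delta = *metric >> 3;      /* assumed DECAY_SHIFT == 3 */

        if (delta)
                *metric -= delta;       /* 64 -> 56 -> 49 -> 43 -> ... */
        else
                *metric = 0;            /* 7 >> 3 == 0, so 7 snaps straight to 0 */
}
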
@@ -155,21 +167,22 @@ static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); */ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; - s64 target_residency_ns; - u64 measured_ns; + s64 target_residency_ns, measured_ns; + unsigned int total = 0; - cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; + teo_decay(&cpu_data->short_idles); - if (cpu_data->artificial_wakeup) { + if (dev->poll_time_limit) { + dev->poll_time_limit = false; /* - * If one of the safety nets has triggered, assume that this + * Polling state timeout has triggered, so assume that this * might have been a long sleep. */ - measured_ns = U64_MAX; + measured_ns = S64_MAX; } else { - u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; + s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; measured_ns = dev->last_residency_ns; /* @@ -196,8 +209,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) for (i = 0; i < drv->state_count; i++) { struct teo_bin *bin = &cpu_data->state_bins[i]; - bin->hits -= bin->hits >> DECAY_SHIFT; - bin->intercepts -= bin->intercepts >> DECAY_SHIFT; + teo_decay(&bin->hits); + total += bin->hits; + teo_decay(&bin->intercepts); + total += bin->intercepts; target_residency_ns = drv->states[i].target_residency_ns; @@ -208,7 +223,24 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } - cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT; + cpu_data->total = total + PULSE; + + teo_decay(&cpu_data->tick_intercepts); + + teo_decay(&cpu_data->total_tick); + if (cpu_data->tick_wakeup) { + cpu_data->total_tick += PULSE; + /* + * If tick wakeups dominate the wakeup pattern, count this one + * as a hit on the deepest available idle state to increase the + * likelihood of stopping the tick. + */ + if (3 * cpu_data->total_tick > 2 * cpu_data->total) { + cpu_data->state_bins[drv->state_count-1].hits += PULSE; + return; + } + } + /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. @@ -219,18 +251,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->state_bins[idx_timer].hits += PULSE; } else { cpu_data->state_bins[idx_duration].intercepts += PULSE; - if (TICK_NSEC <= measured_ns) + if (measured_ns <= TICK_NSEC) cpu_data->tick_intercepts += PULSE; } - - cpu_data->total -= cpu_data->total >> DECAY_SHIFT; - cpu_data->total += PULSE; -} - -static bool teo_state_ok(int i, struct cpuidle_driver *drv) -{ - return !tick_nohz_tick_stopped() || - drv->states[i].target_residency_ns >= TICK_NSEC; } /** @@ -239,17 +262,15 @@ static bool teo_state_ok(int i, struct cpuidle_driver *drv) * @dev: Target CPU. * @state_idx: Index of the capping idle state. * @duration_ns: Idle duration value to match. - * @no_poll: Don't consider polling states. 
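
[Editor's note] The tick-dominance test in the updated teo_update() above uses integer arithmetic to ask whether tick wakeups exceed two thirds of all recent wakeups before crediting the deepest state. A worked check:

static bool demo_tick_dominates(unsigned int total_tick, unsigned int total)
{
        /* total == 30, total_tick == 21: 63 > 60, so 21/30 > 2/3 dominates */
        return 3 * total_tick > 2 * total;
}
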
*/ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - s64 duration_ns, bool no_poll) + s64 duration_ns) { int i; for (i = state_idx - 1; i >= 0; i--) { - if (dev->states_usage[i].disable || - (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) + if (dev->states_usage[i].disable) continue; state_idx = i; @@ -268,7 +289,7 @@ static int teo_find_shallower_state(struct cpuidle_driver *drv, static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ktime_t delta_tick = TICK_NSEC / 2; unsigned int idx_intercept_sum = 0; @@ -356,7 +377,18 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * better choice. */ if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { - int first_suitable_idx = idx; + int min_idx = idx0; + + if (tick_nohz_tick_stopped()) { + /* + * Look for the shallowest idle state below the current + * candidate one whose target residency is at least + * equal to the tick period length. + */ + while (min_idx < idx && + drv->states[min_idx].target_residency_ns < TICK_NSEC) + min_idx++; + } /* * Look for the deepest idle state whose target residency had @@ -366,49 +398,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * Take the possible duration limitation present if the tick * has been stopped already into account. */ - intercept_sum = 0; - - for (i = idx - 1; i >= 0; i--) { - struct teo_bin *bin = &cpu_data->state_bins[i]; - - intercept_sum += bin->intercepts; - - if (2 * intercept_sum > idx_intercept_sum) { - /* - * Use the current state unless it is too - * shallow or disabled, in which case take the - * first enabled state that is deep enough. - */ - if (teo_state_ok(i, drv) && - !dev->states_usage[i].disable) { - idx = i; - break; - } - idx = first_suitable_idx; - break; - } + for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) { + intercept_sum += cpu_data->state_bins[i].intercepts; if (dev->states_usage[i].disable) continue; - if (teo_state_ok(i, drv)) { - /* - * The current state is deep enough, but still - * there may be a better one. - */ - first_suitable_idx = i; - continue; - } - - /* - * The current state is too shallow, so if no suitable - * states other than the initial candidate have been - * found, give up (the remaining states to check are - * shallower still), but otherwise the first suitable - * state other than the initial candidate may turn out - * to be preferable. - */ - if (first_suitable_idx == idx) + idx = i; + if (2 * intercept_sum > idx_intercept_sum) break; } } @@ -458,11 +455,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * If the closest expected timer is before the target residency of the * candidate state, a shallower one needs to be found. 
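
[Editor's note] The rewritten intercept scan above raises its floor when the tick is already stopped: states shallower than the tick period are off the table, so min_idx is advanced before the downward search. An illustrative standalone form of that floor:

#include <linux/types.h>

static int demo_raise_floor(const u64 *target_residency_ns, int idx0, int idx,
                            u64 tick_ns, bool tick_stopped)
{
        int min_idx = idx0;

        if (tick_stopped)
                while (min_idx < idx && target_residency_ns[min_idx] < tick_ns)
                        min_idx++;

        return min_idx;
}
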
*/ - if (drv->states[idx].target_residency_ns > duration_ns) { - i = teo_find_shallower_state(drv, dev, idx, duration_ns, false); - if (teo_state_ok(i, drv)) - idx = i; - } + if (drv->states[idx].target_residency_ns > duration_ns) + idx = teo_find_shallower_state(drv, dev, idx, duration_ns); /* * If the selected state's target residency is below the tick length @@ -490,7 +484,7 @@ end: */ if (idx > idx0 && drv->states[idx].target_residency_ns > delta_tick) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); + idx = teo_find_shallower_state(drv, dev, idx, delta_tick); out_tick: *stop_tick = false; @@ -504,20 +498,11 @@ out_tick: */ static void teo_reflect(struct cpuidle_device *dev, int state) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); + + cpu_data->tick_wakeup = tick_nohz_idle_got_tick(); dev->last_state_idx = state; - if (dev->poll_time_limit || - (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { - /* - * The wakeup was not "genuine", but triggered by one of the - * safety nets. - */ - dev->poll_time_limit = false; - cpu_data->artificial_wakeup = true; - } else { - cpu_data->artificial_wakeup = false; - } } /** diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 9b6d90a72601..c7524e4c522a 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -4,9 +4,13 @@ */ #include <linux/cpuidle.h> +#include <linux/export.h> +#include <linux/irqflags.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/sched/idle.h> +#include <linux/sprintf.h> +#include <linux/types.h> #define POLL_IDLE_RELAX_COUNT 200 diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 2e8d01d47f69..00979f2e0e27 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -20,6 +20,7 @@ #include <linux/stat.h> #include <linux/pm_opp.h> #include <linux/devfreq.h> +#include <linux/devfreq-governor.h> #include <linux/workqueue.h> #include <linux/platform_device.h> #include <linux/list.h> @@ -28,7 +29,6 @@ #include <linux/of.h> #include <linux/pm_qos.h> #include <linux/units.h> -#include "governor.h" #define CREATE_TRACE_POINTS #include <trace/events/devfreq.h> diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c index 953cf9a1e9f7..8cd6f9a59f64 100644 --- a/drivers/devfreq/governor_passive.c +++ b/drivers/devfreq/governor_passive.c @@ -14,8 +14,33 @@ #include <linux/slab.h> #include <linux/device.h> #include <linux/devfreq.h> +#include <linux/devfreq-governor.h> #include <linux/units.h> -#include "governor.h" + +/** + * struct devfreq_cpu_data - Hold the per-cpu data + * @node: list node + * @dev: reference to cpu device. + * @first_cpu: the cpumask of the first cpu of a policy. + * @opp_table: reference to cpu opp table. + * @cur_freq: the current frequency of the cpu. + * @min_freq: the min frequency of the cpu. + * @max_freq: the max frequency of the cpu. + * + * This structure stores the required cpu_data of a cpu. + * This is auto-populated by the governor. 
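
[Editor's note] The devfreq hunks above and below replace the private drivers/devfreq/governor.h include with the public <linux/devfreq-governor.h>, so governor implementations no longer have to live beside the core. A minimal skeleton under that assumption, mirroring governor_powersave.c (names hypothetical):

#include <linux/devfreq.h>
#include <linux/devfreq-governor.h>
#include <linux/module.h>

static int demo_governor_func(struct devfreq *df, unsigned long *freq)
{
        *freq = 0;      /* 0 requests the lowest rate; devfreq clamps to OPP limits */
        return 0;
}

/*
 * A complete governor also fills in a struct devfreq_governor (.name,
 * .get_target_freq, .event_handler) and registers it with
 * devfreq_add_governor() from module init.
 */
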
+ */ +struct devfreq_cpu_data { + struct list_head node; + + struct device *dev; + unsigned int first_cpu; + + struct opp_table *opp_table; + unsigned int cur_freq; + unsigned int min_freq; + unsigned int max_freq; +}; static struct devfreq_cpu_data * get_parent_cpu_data(struct devfreq_passive_data *p_data, diff --git a/drivers/devfreq/governor_performance.c b/drivers/devfreq/governor_performance.c index 2e4e981446fa..fdb22bf512cf 100644 --- a/drivers/devfreq/governor_performance.c +++ b/drivers/devfreq/governor_performance.c @@ -7,8 +7,8 @@ */ #include <linux/devfreq.h> +#include <linux/devfreq-governor.h> #include <linux/module.h> -#include "governor.h" static int devfreq_performance_func(struct devfreq *df, unsigned long *freq) diff --git a/drivers/devfreq/governor_powersave.c b/drivers/devfreq/governor_powersave.c index f059e8814804..ee2d6ec8a512 100644 --- a/drivers/devfreq/governor_powersave.c +++ b/drivers/devfreq/governor_powersave.c @@ -7,8 +7,8 @@ */ #include <linux/devfreq.h> +#include <linux/devfreq-governor.h> #include <linux/module.h> -#include "governor.h" static int devfreq_powersave_func(struct devfreq *df, unsigned long *freq) diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c index c23435736367..ac9c5e9e51a4 100644 --- a/drivers/devfreq/governor_simpleondemand.c +++ b/drivers/devfreq/governor_simpleondemand.c @@ -9,12 +9,12 @@ #include <linux/errno.h> #include <linux/module.h> #include <linux/devfreq.h> +#include <linux/devfreq-governor.h> #include <linux/math64.h> -#include "governor.h" /* Default constants for DevFreq-Simple-Ondemand (DFSO) */ #define DFSO_UPTHRESHOLD (90) -#define DFSO_DOWNDIFFERENCTIAL (5) +#define DFSO_DOWNDIFFERENTIAL (5) static int devfreq_simple_ondemand_func(struct devfreq *df, unsigned long *freq) { @@ -22,7 +22,7 @@ static int devfreq_simple_ondemand_func(struct devfreq *df, struct devfreq_dev_status *stat; unsigned long long a, b; unsigned int dfso_upthreshold = DFSO_UPTHRESHOLD; - unsigned int dfso_downdifferential = DFSO_DOWNDIFFERENCTIAL; + unsigned int dfso_downdifferential = DFSO_DOWNDIFFERENTIAL; struct devfreq_simple_ondemand_data *data = df->data; err = devfreq_update_stats(df); diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c index 175de0c0b50e..395174f93960 100644 --- a/drivers/devfreq/governor_userspace.c +++ b/drivers/devfreq/governor_userspace.c @@ -9,11 +9,11 @@ #include <linux/slab.h> #include <linux/device.h> #include <linux/devfreq.h> +#include <linux/devfreq-governor.h> #include <linux/kstrtox.h> #include <linux/pm.h> #include <linux/mutex.h> #include <linux/module.h> -#include "governor.h" struct userspace_data { unsigned long user_frequency; diff --git a/drivers/devfreq/hisi_uncore_freq.c b/drivers/devfreq/hisi_uncore_freq.c index 96d1815059e3..4d00d813c8ac 100644 --- a/drivers/devfreq/hisi_uncore_freq.c +++ b/drivers/devfreq/hisi_uncore_freq.c @@ -9,6 +9,7 @@ #include <linux/bits.h> #include <linux/cleanup.h> #include <linux/devfreq.h> +#include <linux/devfreq-governor.h> #include <linux/device.h> #include <linux/dev_printk.h> #include <linux/errno.h> @@ -26,8 +27,6 @@ #include <linux/units.h> #include <acpi/pcc.h> -#include "governor.h" - struct hisi_uncore_pcc_data { u16 status; u16 resv; @@ -265,10 +264,11 @@ static int hisi_uncore_target(struct device *dev, unsigned long *freq, dev_err(dev, "Failed to get opp for freq %lu hz\n", *freq); return PTR_ERR(opp); } - dev_pm_opp_put(opp); data = (u32)(dev_pm_opp_get_freq(opp) / 
HZ_PER_MHZ); + dev_pm_opp_put(opp); + return hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_SET_FREQ, &data); } diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c index 4a4f0106ab9d..8b57194ac698 100644 --- a/drivers/devfreq/tegra30-devfreq.c +++ b/drivers/devfreq/tegra30-devfreq.c @@ -9,9 +9,11 @@ #include <linux/clk.h> #include <linux/cpufreq.h> #include <linux/devfreq.h> +#include <linux/devfreq-governor.h> #include <linux/interrupt.h> #include <linux/io.h> #include <linux/irq.h> +#include <linux/minmax.h> #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> @@ -21,8 +23,6 @@ #include <soc/tegra/fuse.h> -#include "governor.h" - #define ACTMON_GLB_STATUS 0x0 #define ACTMON_GLB_PERIOD_CTRL 0x4 @@ -326,14 +326,9 @@ static unsigned long actmon_cpu_to_emc_rate(struct tegra_devfreq *tegra, unsigned int i; const struct tegra_actmon_emc_ratio *ratio = actmon_emc_ratios; - for (i = 0; i < ARRAY_SIZE(actmon_emc_ratios); i++, ratio++) { - if (cpu_freq >= ratio->cpu_freq) { - if (ratio->emc_freq >= tegra->max_freq) - return tegra->max_freq; - else - return ratio->emc_freq; - } - } + for (i = 0; i < ARRAY_SIZE(actmon_emc_ratios); i++, ratio++) + if (cpu_freq >= ratio->cpu_freq) + return min(ratio->emc_freq, tegra->max_freq); return 0; } diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 1ce428e2ac8a..a9070d00b833 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -74,6 +74,9 @@ struct mm_struct efi_mm = { .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, +#ifdef CONFIG_SCHED_MM_CID + .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(efi_mm.mm_cid.lock), +#endif }; struct workqueue_struct *efi_rts_wq; diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 708b777857d3..da8d29621644 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -202,6 +202,8 @@ void efi_call_virt_check_flags(unsigned long flags, const void *caller) */ static DEFINE_SEMAPHORE(efi_runtime_lock, 1); +static struct task_struct *efi_runtime_lock_owner; + /* * Expose the EFI runtime lock to the UV platform */ @@ -219,6 +221,8 @@ static void __nocfi efi_call_rts(struct work_struct *work) efi_status_t status = EFI_NOT_FOUND; unsigned long flags; + efi_runtime_lock_owner = current; + arch_efi_call_virt_setup(); flags = efi_call_virt_save_flags(); @@ -310,6 +314,7 @@ static void __nocfi efi_call_rts(struct work_struct *work) efi_rts_work.status = status; complete(&efi_rts_work.efi_rts_comp); + efi_runtime_lock_owner = NULL; } static efi_status_t __efi_queue_work(enum efi_rts_ids id, @@ -444,8 +449,10 @@ virt_efi_set_variable_nb(efi_char16_t *name, efi_guid_t *vendor, u32 attr, if (down_trylock(&efi_runtime_lock)) return EFI_NOT_READY; + efi_runtime_lock_owner = current; status = efi_call_virt_pointer(efi.runtime, set_variable, name, vendor, attr, data_size, data); + efi_runtime_lock_owner = NULL; up(&efi_runtime_lock); return status; } @@ -481,9 +488,11 @@ virt_efi_query_variable_info_nb(u32 attr, u64 *storage_space, if (down_trylock(&efi_runtime_lock)) return EFI_NOT_READY; + efi_runtime_lock_owner = current; status = efi_call_virt_pointer(efi.runtime, query_variable_info, attr, storage_space, remaining_space, max_variable_size); + efi_runtime_lock_owner = NULL; up(&efi_runtime_lock); return status; } @@ -509,12 +518,13 @@ virt_efi_reset_system(int 
reset_type, efi_status_t status, return; } + efi_runtime_lock_owner = current; arch_efi_call_virt_setup(); efi_rts_work.efi_rts_id = EFI_RESET_SYSTEM; arch_efi_call_virt(efi.runtime, reset_system, reset_type, status, data_size, data); arch_efi_call_virt_teardown(); - + efi_runtime_lock_owner = NULL; up(&efi_runtime_lock); } @@ -587,3 +597,8 @@ efi_call_acpi_prm_handler(efi_status_t (__efiapi *handler_addr)(u64, void *), } #endif + +void efi_runtime_assert_lock_held(void) +{ + WARN_ON(efi_runtime_lock_owner != current); +} diff --git a/drivers/media/test-drivers/vivid/vivid-vid-cap.c b/drivers/media/test-drivers/vivid/vivid-vid-cap.c index 8b3162e82032..b95f06a9b5ae 100644 --- a/drivers/media/test-drivers/vivid/vivid-vid-cap.c +++ b/drivers/media/test-drivers/vivid/vivid-vid-cap.c @@ -302,8 +302,10 @@ void vivid_update_quality(struct vivid_dev *dev) */ freq_modulus = (dev->tv_freq - 676 /* (43.25-1) * 16 */) % (6 * 16); if (freq_modulus > 2 * 16) { + struct rnd_state prng; + prandom_seed_state(&prng, dev->tv_freq ^ 0x55); tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, - next_pseudo_random32(dev->tv_freq ^ 0x55) & 0x3f); + prandom_u32_state(&prng) & 0x3f); return; } if (freq_modulus < 12 /*0.75 * 16*/ || freq_modulus > 20 /*1.25 * 16*/) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c index 05e5fd777d4f..815a7c97d6b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c @@ -9,6 +9,7 @@ #if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && IS_ENABLED(CONFIG_ARM64) #include <asm/neon.h> +#include <asm/simd.h> #endif #define TEST_WC_NUM_WQES 255 @@ -264,15 +265,15 @@ static void mlx5_iowrite64_copy(struct mlx5_wc_sq *sq, __be32 mmio_wqe[16], { #if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && IS_ENABLED(CONFIG_ARM64) if (cpu_has_neon()) { - kernel_neon_begin(); - asm volatile - (".arch_extension simd\n\t" - "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0]\n\t" - "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1]" - : - : "r"(mmio_wqe), "r"(sq->bfreg.map + offset) - : "memory", "v0", "v1", "v2", "v3"); - kernel_neon_end(); + scoped_ksimd() { + asm volatile( + ".arch_extension simd\n\t" + "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0]\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1]" + : + : "r"(mmio_wqe), "r"(sq->bfreg.map + offset) + : "memory", "v0", "v1", "v2", "v3"); + } return; } #endif diff --git a/drivers/net/wireguard/cookie.c b/drivers/net/wireguard/cookie.c index 94d0a7206084..08731b3fa32b 100644 --- a/drivers/net/wireguard/cookie.c +++ b/drivers/net/wireguard/cookie.c @@ -33,7 +33,7 @@ static void precompute_key(u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 pubkey[NOISE_PUBLIC_KEY_LEN], const u8 label[COOKIE_KEY_LABEL_LEN]) { - struct blake2s_state blake; + struct blake2s_ctx blake; blake2s_init(&blake, NOISE_SYMMETRIC_KEY_LEN); blake2s_update(&blake, label, COOKIE_KEY_LABEL_LEN); @@ -77,7 +77,7 @@ static void compute_mac1(u8 mac1[COOKIE_LEN], const void *message, size_t len, { len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac1); - blake2s(mac1, message, key, COOKIE_LEN, len, NOISE_SYMMETRIC_KEY_LEN); + blake2s(key, NOISE_SYMMETRIC_KEY_LEN, message, len, mac1, COOKIE_LEN); } static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len, @@ -85,13 +85,13 @@ static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len, { len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac2); - blake2s(mac2, message, cookie, COOKIE_LEN, len, COOKIE_LEN); 
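
[Editor's note] The vivid change above swaps the removed next_pseudo_random32() helper for a locally seeded rnd_state, keeping the noise level deterministic for a given tuner frequency. A sketch of that pattern:

#include <linux/prandom.h>

static u32 demo_noise_for_freq(unsigned int tv_freq)
{
        struct rnd_state prng;

        prandom_seed_state(&prng, tv_freq ^ 0x55);      /* deterministic per freq */
        return prandom_u32_state(&prng) & 0x3f;         /* 6 bits of noise */
}
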
+ blake2s(cookie, COOKIE_LEN, message, len, mac2, COOKIE_LEN); } static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb, struct cookie_checker *checker) { - struct blake2s_state state; + struct blake2s_ctx blake; if (wg_birthdate_has_expired(checker->secret_birthdate, COOKIE_SECRET_MAX_AGE)) { @@ -103,15 +103,15 @@ static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb, down_read(&checker->secret_lock); - blake2s_init_key(&state, COOKIE_LEN, checker->secret, NOISE_HASH_LEN); + blake2s_init_key(&blake, COOKIE_LEN, checker->secret, NOISE_HASH_LEN); if (skb->protocol == htons(ETH_P_IP)) - blake2s_update(&state, (u8 *)&ip_hdr(skb)->saddr, + blake2s_update(&blake, (u8 *)&ip_hdr(skb)->saddr, sizeof(struct in_addr)); else if (skb->protocol == htons(ETH_P_IPV6)) - blake2s_update(&state, (u8 *)&ipv6_hdr(skb)->saddr, + blake2s_update(&blake, (u8 *)&ipv6_hdr(skb)->saddr, sizeof(struct in6_addr)); - blake2s_update(&state, (u8 *)&udp_hdr(skb)->source, sizeof(__be16)); - blake2s_final(&state, cookie); + blake2s_update(&blake, (u8 *)&udp_hdr(skb)->source, sizeof(__be16)); + blake2s_final(&blake, cookie); up_read(&checker->secret_lock); } diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c index 7eb9a23a3d4d..1fe8468f0bef 100644 --- a/drivers/net/wireguard/noise.c +++ b/drivers/net/wireguard/noise.c @@ -33,10 +33,10 @@ static atomic64_t keypair_counter = ATOMIC64_INIT(0); void __init wg_noise_init(void) { - struct blake2s_state blake; + struct blake2s_ctx blake; - blake2s(handshake_init_chaining_key, handshake_name, NULL, - NOISE_HASH_LEN, sizeof(handshake_name), 0); + blake2s(NULL, 0, handshake_name, sizeof(handshake_name), + handshake_init_chaining_key, NOISE_HASH_LEN); blake2s_init(&blake, NOISE_HASH_LEN); blake2s_update(&blake, handshake_init_chaining_key, NOISE_HASH_LEN); blake2s_update(&blake, identifier_name, sizeof(identifier_name)); @@ -304,33 +304,33 @@ void wg_noise_set_static_identity_private_key( static void hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, const size_t keylen) { - struct blake2s_state state; + struct blake2s_ctx blake; u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 }; u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32)); int i; if (keylen > BLAKE2S_BLOCK_SIZE) { - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, key, keylen); - blake2s_final(&state, x_key); + blake2s_init(&blake, BLAKE2S_HASH_SIZE); + blake2s_update(&blake, key, keylen); + blake2s_final(&blake, x_key); } else memcpy(x_key, key, keylen); for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) x_key[i] ^= 0x36; - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); - blake2s_update(&state, in, inlen); - blake2s_final(&state, i_hash); + blake2s_init(&blake, BLAKE2S_HASH_SIZE); + blake2s_update(&blake, x_key, BLAKE2S_BLOCK_SIZE); + blake2s_update(&blake, in, inlen); + blake2s_final(&blake, i_hash); for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) x_key[i] ^= 0x5c ^ 0x36; - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); - blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE); - blake2s_final(&state, i_hash); + blake2s_init(&blake, BLAKE2S_HASH_SIZE); + blake2s_update(&blake, x_key, BLAKE2S_BLOCK_SIZE); + blake2s_update(&blake, i_hash, BLAKE2S_HASH_SIZE); + blake2s_final(&blake, i_hash); memcpy(out, i_hash, BLAKE2S_HASH_SIZE); memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE); @@ -431,7 +431,7 @@ static bool __must_check mix_precomputed_dh(u8 
chaining_key[NOISE_HASH_LEN], static void mix_hash(u8 hash[NOISE_HASH_LEN], const u8 *src, size_t src_len) { - struct blake2s_state blake; + struct blake2s_ctx blake; blake2s_init(&blake, NOISE_HASH_LEN); blake2s_update(&blake, hash, NOISE_HASH_LEN); diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c index 5232f66c2d52..cc8a84018f70 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c @@ -129,7 +129,7 @@ static enum iwl_reset_mode iwl_trans_determine_restart_mode(struct iwl_trans *trans) { struct iwl_trans_dev_restart_data *data; - enum iwl_reset_mode at_least = 0; + enum iwl_reset_mode min_mode = 0; unsigned int index; static const enum iwl_reset_mode escalation_list_old[] = { IWL_RESET_MODE_SW_RESET, @@ -173,11 +173,11 @@ iwl_trans_determine_restart_mode(struct iwl_trans *trans) } if (trans->restart.during_reset) - at_least = IWL_RESET_MODE_REPROBE; + min_mode = IWL_RESET_MODE_REPROBE; data = iwl_trans_get_restart_data(trans->dev); if (!data) - return at_least; + return min_mode; if (!data->backoff && ktime_get_boottime_seconds() - data->last_error >= @@ -194,7 +194,7 @@ iwl_trans_determine_restart_mode(struct iwl_trans *trans) data->backoff = false; } - return max(at_least, escalation_list[index]); + return max(min_mode, escalation_list[index]); } #define IWL_TRANS_TOP_FOLLOWER_WAIT 180 /* ms */ diff --git a/drivers/opp/core.c b/drivers/opp/core.c index bba4f7daff8c..dbebb8c829bc 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -309,9 +309,9 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo); */ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return 0; @@ -327,7 +327,6 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency); */ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); struct dev_pm_opp *opp; struct regulator *reg; unsigned long latency_ns = 0; @@ -337,7 +336,9 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) unsigned long max; } *uV; - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) return 0; @@ -409,10 +410,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency); */ unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); unsigned long freq = 0; - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) return 0; @@ -447,9 +449,9 @@ int _get_opp_count(struct opp_table *opp_table) */ int dev_pm_opp_get_opp_count(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) { dev_dbg(dev, "%s: OPP table not found (%ld)\n", __func__, PTR_ERR(opp_table)); @@ -605,9 +607,9 @@ _find_key(struct device *dev, unsigned long *key, int index, bool available, unsigned long opp_key, unsigned long key), bool (*assert)(struct opp_table *opp_table, unsigned int index)) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + 
_find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) { dev_err(dev, "%s: OPP table not found (%ld)\n", __func__, PTR_ERR(opp_table)); @@ -1410,12 +1412,13 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, */ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) { - struct opp_table *opp_table __free(put_opp_table); struct dev_pm_opp *opp __free(put_opp) = NULL; unsigned long freq = 0, temp_freq; bool forced = false; - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) { dev_err(dev, "%s: device's opp table doesn't exist\n", __func__); return PTR_ERR(opp_table); @@ -1477,9 +1480,9 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate); */ int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) { dev_err(dev, "%s: device opp doesn't exist\n", __func__); return PTR_ERR(opp_table); @@ -1794,10 +1797,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put); */ void dev_pm_opp_remove(struct device *dev, unsigned long freq) { - struct opp_table *opp_table __free(put_opp_table); struct dev_pm_opp *opp = NULL, *iter; - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) return; @@ -1885,9 +1889,9 @@ bool _opp_remove_all_static(struct opp_table *opp_table) */ void dev_pm_opp_remove_all_dynamic(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return; @@ -2871,10 +2875,11 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, bool availability_req) { struct dev_pm_opp *opp __free(put_opp) = ERR_PTR(-ENODEV), *tmp_opp; - struct opp_table *opp_table __free(put_opp_table); /* Find the opp_table */ - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) { dev_warn(dev, "%s: Device OPP not found (%ld)\n", __func__, PTR_ERR(opp_table)); @@ -2932,11 +2937,12 @@ int dev_pm_opp_adjust_voltage(struct device *dev, unsigned long freq, { struct dev_pm_opp *opp __free(put_opp) = ERR_PTR(-ENODEV), *tmp_opp; - struct opp_table *opp_table __free(put_opp_table); int r; /* Find the opp_table */ - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) { r = PTR_ERR(opp_table); dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r); @@ -2986,12 +2992,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_adjust_voltage); */ int dev_pm_opp_sync_regulators(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); struct regulator *reg; int ret, i; /* Device may not have OPP table */ - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) return 0; @@ -3062,9 +3069,9 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_disable); */ int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return 
PTR_ERR(opp_table); @@ -3082,9 +3089,9 @@ EXPORT_SYMBOL(dev_pm_opp_register_notifier); int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb) { - struct opp_table *opp_table __free(put_opp_table); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); - opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return PTR_ERR(opp_table); @@ -3101,10 +3108,10 @@ EXPORT_SYMBOL(dev_pm_opp_unregister_notifier); */ void dev_pm_opp_remove_table(struct device *dev) { - struct opp_table *opp_table __free(put_opp_table); - /* Check for existing table for 'dev' */ - opp_table = _find_opp_table(dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(dev); + if (IS_ERR(opp_table)) { int error = PTR_ERR(opp_table); diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c index 97989d4fe336..a6da7ee3ec76 100644 --- a/drivers/opp/cpu.c +++ b/drivers/opp/cpu.c @@ -56,10 +56,10 @@ int dev_pm_opp_init_cpufreq_table(struct device *dev, return -ENOMEM; for (i = 0, rate = 0; i < max_opps; i++, rate++) { - struct dev_pm_opp *opp __free(put_opp); - /* find next rate */ - opp = dev_pm_opp_find_freq_ceil(dev, &rate); + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(dev, &rate); + if (IS_ERR(opp)) { ret = PTR_ERR(opp); goto out; @@ -154,12 +154,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_cpumask_remove_table); int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask) { - struct opp_table *opp_table __free(put_opp_table); struct opp_device *opp_dev; struct device *dev; int cpu; - opp_table = _find_opp_table(cpu_dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(cpu_dev); + if (IS_ERR(opp_table)) return PTR_ERR(opp_table); @@ -201,10 +202,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_sharing_cpus); */ int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) { - struct opp_table *opp_table __free(put_opp_table); struct opp_device *opp_dev; - opp_table = _find_opp_table(cpu_dev); + struct opp_table *opp_table __free(put_opp_table) = + _find_opp_table(cpu_dev); + if (IS_ERR(opp_table)) return PTR_ERR(opp_table); diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 505d79821584..1e0d0adb18e1 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -45,9 +45,10 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node); struct opp_table *_managed_opp(struct device *dev, int index) { struct opp_table *opp_table, *managed_table = NULL; - struct device_node *np __free(device_node); - np = _opp_of_get_opp_desc_node(dev->of_node, index); + struct device_node *np __free(device_node) = + _opp_of_get_opp_desc_node(dev->of_node, index); + if (!np) return NULL; @@ -95,10 +96,11 @@ static struct device_node *of_parse_required_opp(struct device_node *np, /* The caller must call dev_pm_opp_put_opp_table() after the table is used */ static struct opp_table *_find_table_of_opp_np(struct device_node *opp_np) { - struct device_node *opp_table_np __free(device_node); struct opp_table *opp_table; - opp_table_np = of_get_parent(opp_np); + struct device_node *opp_table_np __free(device_node) = + of_get_parent(opp_np); + if (!opp_table_np) return ERR_PTR(-ENODEV); @@ -146,12 +148,13 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, struct device_node *opp_np) { struct opp_table **required_opp_tables; - struct device_node *np __free(device_node); bool lazy = false; int count, i, size; /* Traversing the first OPP node is all we need */ - np = 
of_get_next_available_child(opp_np, NULL); + struct device_node *np __free(device_node) = + of_get_next_available_child(opp_np, NULL); + if (!np) { dev_warn(dev, "Empty OPP table\n"); return; @@ -171,9 +174,9 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, opp_table->required_opp_count = count; for (i = 0; i < count; i++) { - struct device_node *required_np __free(device_node); + struct device_node *required_np __free(device_node) = + of_parse_required_opp(np, i); - required_np = of_parse_required_opp(np, i); if (!required_np) { _opp_table_free_required_tables(opp_table); return; @@ -199,14 +202,15 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index) { - struct device_node *np __free(device_node), *opp_np; + struct device_node *opp_np; u32 val; /* * Only required for backward compatibility with v1 bindings, but isn't * harmful for other cases. And so we do it unconditionally. */ - np = of_node_get(dev->of_node); + struct device_node *np __free(device_node) = of_node_get(dev->of_node); + if (!np) return; @@ -273,9 +277,9 @@ void _of_clear_opp(struct opp_table *opp_table, struct dev_pm_opp *opp) static int _link_required_opps(struct dev_pm_opp *opp, struct opp_table *required_table, int index) { - struct device_node *np __free(device_node); + struct device_node *np __free(device_node) = + of_parse_required_opp(opp->np, index); - np = of_parse_required_opp(opp->np, index); if (unlikely(!np)) return -ENODEV; @@ -349,16 +353,13 @@ static void lazy_link_required_opp_table(struct opp_table *new_table) guard(mutex)(&opp_table_lock); list_for_each_entry_safe(opp_table, temp, &lazy_opp_tables, lazy) { - struct device_node *opp_np __free(device_node); bool lazy = false; /* opp_np can't be invalid here */ - opp_np = of_get_next_available_child(opp_table->np, NULL); + struct device_node *opp_np __free(device_node) = + of_get_next_available_child(opp_table->np, NULL); for (i = 0; i < opp_table->required_opp_count; i++) { - struct device_node *required_np __free(device_node) = NULL; - struct device_node *required_table_np __free(device_node) = NULL; - required_opp_tables = opp_table->required_opp_tables; /* Required opp-table is already parsed */ @@ -366,8 +367,10 @@ static void lazy_link_required_opp_table(struct opp_table *new_table) continue; /* required_np can't be invalid here */ - required_np = of_parse_required_opp(opp_np, i); - required_table_np = of_get_parent(required_np); + struct device_node *required_np __free(device_node) = + of_parse_required_opp(opp_np, i); + struct device_node *required_table_np __free(device_node) = + of_get_parent(required_np); /* * Newly added table isn't the required opp-table for @@ -402,13 +405,12 @@ static void lazy_link_required_opp_table(struct opp_table *new_table) static int _bandwidth_supported(struct device *dev, struct opp_table *opp_table) { struct device_node *opp_np __free(device_node) = NULL; - struct device_node *np __free(device_node) = NULL; struct property *prop; if (!opp_table) { - struct device_node *np __free(device_node); + struct device_node *np __free(device_node) = + of_node_get(dev->of_node); - np = of_node_get(dev->of_node); if (!np) return -ENODEV; @@ -422,7 +424,9 @@ static int _bandwidth_supported(struct device *dev, struct opp_table *opp_table) return 0; /* Checking only first OPP is sufficient */ - np = of_get_next_available_child(opp_np, NULL); + struct device_node *np __free(device_node) = + 
of_get_next_available_child(opp_np, NULL); + if (!np) { dev_err(dev, "OPP table empty\n"); return -EINVAL; @@ -1269,11 +1273,12 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table); int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) { - struct device_node *np __free(device_node); int cpu; /* Get OPP descriptor node */ - np = dev_pm_opp_of_get_opp_desc_node(cpu_dev); + struct device_node *np __free(device_node) = + dev_pm_opp_of_get_opp_desc_node(cpu_dev); + if (!np) { dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__); return -ENOENT; @@ -1286,13 +1291,12 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, return 0; for_each_possible_cpu(cpu) { - struct device_node *cpu_np __free(device_node) = NULL; - struct device_node *tmp_np __free(device_node) = NULL; - if (cpu == cpu_dev->id) continue; - cpu_np = of_cpu_device_node_get(cpu); + struct device_node *cpu_np __free(device_node) = + of_cpu_device_node_get(cpu); + if (!cpu_np) { dev_err(cpu_dev, "%s: failed to get cpu%d node\n", __func__, cpu); @@ -1300,7 +1304,9 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, } /* Get OPP descriptor node */ - tmp_np = _opp_of_get_opp_desc_node(cpu_np, 0); + struct device_node *tmp_np __free(device_node) = + _opp_of_get_opp_desc_node(cpu_np, 0); + if (!tmp_np) { pr_err("%pOF: Couldn't find opp node\n", cpu_np); return -ENOENT; @@ -1328,16 +1334,17 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_sharing_cpus); */ int of_get_required_opp_performance_state(struct device_node *np, int index) { - struct device_node *required_np __free(device_node); - struct opp_table *opp_table __free(put_opp_table) = NULL; - struct dev_pm_opp *opp __free(put_opp) = NULL; int pstate = -EINVAL; - required_np = of_parse_required_opp(np, index); + struct device_node *required_np __free(device_node) = + of_parse_required_opp(np, index); + if (!required_np) return -ENODEV; - opp_table = _find_table_of_opp_np(required_np); + struct opp_table *opp_table __free(put_opp_table) = + _find_table_of_opp_np(required_np); + if (IS_ERR(opp_table)) { pr_err("%s: Failed to find required OPP table %pOF: %ld\n", __func__, np, PTR_ERR(opp_table)); @@ -1350,7 +1357,9 @@ int of_get_required_opp_performance_state(struct device_node *np, int index) return -EINVAL; } - opp = _find_opp_of_np(opp_table, required_np); + struct dev_pm_opp *opp __free(put_opp) = + _find_opp_of_np(opp_table, required_np); + if (opp) { if (opp->level == OPP_LEVEL_UNSET) { pr_err("%s: OPP levels aren't available for %pOF\n", @@ -1376,14 +1385,17 @@ EXPORT_SYMBOL_GPL(of_get_required_opp_performance_state); */ bool dev_pm_opp_of_has_required_opp(struct device *dev) { - struct device_node *np __free(device_node) = NULL, *opp_np __free(device_node); int count; - opp_np = _opp_of_get_opp_desc_node(dev->of_node, 0); + struct device_node *opp_np __free(device_node) = + _opp_of_get_opp_desc_node(dev->of_node, 0); + if (!opp_np) return false; - np = of_get_next_available_child(opp_np, NULL); + struct device_node *np __free(device_node) = + of_get_next_available_child(opp_np, NULL); + if (!np) { dev_warn(dev, "Empty OPP table\n"); return false; @@ -1425,12 +1437,14 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); static int __maybe_unused _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz) { - struct dev_pm_opp *opp __free(put_opp); unsigned long opp_freq, opp_power; /* Find the right frequency and related OPP */ opp_freq = *kHz * 1000; - opp = dev_pm_opp_find_freq_ceil(dev, &opp_freq); + + struct dev_pm_opp *opp 
__free(put_opp) = + dev_pm_opp_find_freq_ceil(dev, &opp_freq); + if (IS_ERR(opp)) return -EINVAL; @@ -1465,14 +1479,13 @@ _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz) int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, unsigned long *kHz) { - struct dev_pm_opp *opp __free(put_opp) = NULL; - struct device_node *np __free(device_node); unsigned long mV, Hz; u32 cap; u64 tmp; int ret; - np = of_node_get(dev->of_node); + struct device_node *np __free(device_node) = of_node_get(dev->of_node); + if (!np) return -EINVAL; @@ -1481,7 +1494,10 @@ int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, return -EINVAL; Hz = *kHz * 1000; - opp = dev_pm_opp_find_freq_ceil(dev, &Hz); + + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(dev, &Hz); + if (IS_ERR(opp)) return -EINVAL; @@ -1502,11 +1518,12 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_calc_power); static bool _of_has_opp_microwatt_property(struct device *dev) { - struct dev_pm_opp *opp __free(put_opp); unsigned long freq = 0; /* Check if at least one OPP has needed property */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); + struct dev_pm_opp *opp __free(put_opp) = + dev_pm_opp_find_freq_ceil(dev, &freq); + if (IS_ERR(opp)) return false; @@ -1526,12 +1543,16 @@ static bool _of_has_opp_microwatt_property(struct device *dev) */ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) { - struct device_node *np __free(device_node) = NULL; struct em_data_callback em_cb; int ret, nr_opp; u32 cap; - if (IS_ERR_OR_NULL(dev)) { + if (IS_ERR_OR_NULL(dev)) + return -EINVAL; + + struct device_node *np __free(device_node) = of_node_get(dev->of_node); + + if (!np) { ret = -EINVAL; goto failed; } @@ -1548,12 +1569,6 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) goto register_em; } - np = of_node_get(dev->of_node); - if (!np) { - ret = -EINVAL; - goto failed; - } - /* * Register an EM only if the 'dynamic-power-coefficient' property is * set in devicetree. 
It is assumed the voltage values are known if that diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index d9996516f49e..a55967082ef6 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -8,8 +8,7 @@ * Jan Glauber <jang@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zpci" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zpci: " fmt #include <linux/kernel.h> #include <linux/slab.h> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 9d6f74bd95f8..3881359440b1 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1517,8 +1517,8 @@ static ssize_t reset_method_store(struct device *dev, return count; } - ACQUIRE(pm_runtime_active_try, pm)(dev); - if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) + PM_RUNTIME_ACQUIRE(dev, pm); + if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; if (sysfs_streq(buf, "default")) { diff --git a/drivers/perf/arm-ni.c b/drivers/perf/arm-ni.c index 1615a0564031..66858c65215d 100644 --- a/drivers/perf/arm-ni.c +++ b/drivers/perf/arm-ni.c @@ -21,6 +21,11 @@ #define NI_CHILD_NODE_INFO 0x004 #define NI_CHILD_PTR(n) (0x008 + (n) * 4) +#define NI_NUM_SUB_FEATURES 0x100 +#define NI_SUB_FEATURE_TYPE(n) (0x108 + (n) * 8) +#define NI_SUB_FEATURE_PTR(n) (0x10c + (n) * 8) + +#define NI_SUB_FEATURE_TYPE_FCU 0x2 #define NI700_PMUSELA 0x00c @@ -33,9 +38,10 @@ #define NI_PIDR2_VERSION GENMASK(7, 4) /* PMU node */ -#define NI_PMEVCNTR(n) (0x008 + (n) * 8) -#define NI_PMCCNTR_L 0x0f8 -#define NI_PMCCNTR_U 0x0fc +#define NI700_PMEVCNTR(n) (0x008 + (n) * 8) +#define NI700_PMCCNTR_L 0x0f8 +#define NI_PMEVCNTR(n) (0x200 + (n) * 8) +#define NI_PMCCNTR_L 0x2f8 #define NI_PMEVTYPER(n) (0x400 + (n) * 4) #define NI_PMEVTYPER_NODE_TYPE GENMASK(12, 9) #define NI_PMEVTYPER_NODE_ID GENMASK(8, 0) @@ -66,6 +72,8 @@ enum ni_part { PART_NI_700 = 0x43b, PART_NI_710AE = 0x43d, + PART_NOC_S3 = 0x43f, + PART_SI_L1 = 0x455, }; enum ni_node_type { @@ -79,6 +87,10 @@ enum ni_node_type { NI_HSNI, NI_HMNI, NI_PMNI, + NI_TSNI, + NI_TMNI, + NI_CMNI = 0x0e, + NI_MCN = 0x63, }; struct arm_ni_node { @@ -179,6 +191,9 @@ static struct attribute *arm_ni_event_attrs[] = { NI_EVENT_ATTR(hsni, NI_HSNI), NI_EVENT_ATTR(hmni, NI_HMNI), NI_EVENT_ATTR(pmni, NI_PMNI), + NI_EVENT_ATTR(tsni, NI_TSNI), + NI_EVENT_ATTR(tmni, NI_TMNI), + NI_EVENT_ATTR(cmni, NI_CMNI), NULL }; @@ -308,9 +323,15 @@ static int arm_ni_validate_group(struct perf_event *event) return 0; } +static bool arm_ni_is_7xx(const struct arm_ni *ni) +{ + return ni->part == PART_NI_700 || ni->part == PART_NI_710AE; +} + static int arm_ni_event_init(struct perf_event *event) { struct arm_ni_cd *cd = pmu_to_cd(event->pmu); + struct arm_ni *ni; if (event->attr.type != event->pmu->type) return -ENOENT; @@ -318,7 +339,10 @@ static int arm_ni_event_init(struct perf_event *event) if (is_sampling_event(event)) return -EINVAL; - event->cpu = cd_to_ni(cd)->cpu; + ni = cd_to_ni(cd); + event->cpu = ni->cpu; + event->hw.flags = arm_ni_is_7xx(ni); + if (NI_EVENT_TYPE(event) == NI_PMU) return arm_ni_validate_group(event); @@ -332,16 +356,16 @@ static int arm_ni_event_init(struct perf_event *event) return -EINVAL; } -static u64 arm_ni_read_ccnt(struct arm_ni_cd *cd) +static u64 arm_ni_read_ccnt(void __iomem *pmccntr) { u64 l, u_old, u_new; int retries = 3; /* 1st time unlucky, 2nd improbable, 3rd just broken */ - u_new = readl_relaxed(cd->pmu_base + NI_PMCCNTR_U); + u_new = readl_relaxed(pmccntr + 4); do { u_old = u_new; - l = readl_relaxed(cd->pmu_base + 
NI_PMCCNTR_L); - u_new = readl_relaxed(cd->pmu_base + NI_PMCCNTR_U); + l = readl_relaxed(pmccntr); + u_new = readl_relaxed(pmccntr + 4); } while (u_new != u_old && --retries); WARN_ON(!retries); @@ -350,7 +374,6 @@ static u64 arm_ni_read_ccnt(struct arm_ni_cd *cd) static void arm_ni_event_read(struct perf_event *event) { - struct arm_ni_cd *cd = pmu_to_cd(event->pmu); struct hw_perf_event *hw = &event->hw; u64 count, prev; bool ccnt = hw->idx == NI_CCNT_IDX; @@ -358,9 +381,9 @@ static void arm_ni_event_read(struct perf_event *event) do { prev = local64_read(&hw->prev_count); if (ccnt) - count = arm_ni_read_ccnt(cd); + count = arm_ni_read_ccnt((void __iomem *)event->hw.event_base); else - count = readl_relaxed(cd->pmu_base + NI_PMEVCNTR(hw->idx)); + count = readl_relaxed((void __iomem *)event->hw.event_base); } while (local64_cmpxchg(&hw->prev_count, prev, count) != prev); count -= prev; @@ -385,16 +408,16 @@ static void arm_ni_event_stop(struct perf_event *event, int flags) arm_ni_event_read(event); } -static void arm_ni_init_ccnt(struct arm_ni_cd *cd) +static void arm_ni_init_ccnt(struct hw_perf_event *hw) { - local64_set(&cd->ccnt->hw.prev_count, S64_MIN); - lo_hi_writeq_relaxed(S64_MIN, cd->pmu_base + NI_PMCCNTR_L); + local64_set(&hw->prev_count, S64_MIN); + lo_hi_writeq_relaxed(S64_MIN, (void __iomem *)hw->event_base); } -static void arm_ni_init_evcnt(struct arm_ni_cd *cd, int idx) +static void arm_ni_init_evcnt(struct hw_perf_event *hw) { - local64_set(&cd->evcnt[idx]->hw.prev_count, S32_MIN); - writel_relaxed(S32_MIN, cd->pmu_base + NI_PMEVCNTR(idx)); + local64_set(&hw->prev_count, S32_MIN); + writel_relaxed(S32_MIN, (void __iomem *)hw->event_base); } static int arm_ni_event_add(struct perf_event *event, int flags) @@ -409,8 +432,10 @@ static int arm_ni_event_add(struct perf_event *event, int flags) if (cd->ccnt) return -ENOSPC; hw->idx = NI_CCNT_IDX; + hw->event_base = (unsigned long)cd->pmu_base + + (hw->flags ? NI700_PMCCNTR_L : NI_PMCCNTR_L); cd->ccnt = event; - arm_ni_init_ccnt(cd); + arm_ni_init_ccnt(hw); } else { hw->idx = 0; while (cd->evcnt[hw->idx]) { @@ -420,7 +445,9 @@ static int arm_ni_event_add(struct perf_event *event, int flags) cd->evcnt[hw->idx] = event; unit = (void *)hw->config_base; unit->event[hw->idx] = NI_EVENT_EVENTID(event); - arm_ni_init_evcnt(cd, hw->idx); + hw->event_base = (unsigned long)cd->pmu_base + + (hw->flags ? 
NI700_PMEVCNTR(hw->idx) : NI_PMEVCNTR(hw->idx)); + arm_ni_init_evcnt(hw); lo_hi_writeq_relaxed(le64_to_cpu(unit->pmusel), unit->pmusela); reg = FIELD_PREP(NI_PMEVTYPER_NODE_TYPE, type) | @@ -457,7 +484,7 @@ static irqreturn_t arm_ni_handle_irq(int irq, void *dev_id) ret = IRQ_HANDLED; if (!(WARN_ON(!cd->ccnt))) { arm_ni_event_read(cd->ccnt); - arm_ni_init_ccnt(cd); + arm_ni_init_ccnt(&cd->ccnt->hw); } } for (int i = 0; i < NI_NUM_COUNTERS; i++) { @@ -466,7 +493,7 @@ static irqreturn_t arm_ni_handle_irq(int irq, void *dev_id) ret = IRQ_HANDLED; if (!(WARN_ON(!cd->evcnt[i]))) { arm_ni_event_read(cd->evcnt[i]); - arm_ni_init_evcnt(cd, i); + arm_ni_init_evcnt(&cd->evcnt[i]->hw); } } writel_relaxed(reg, cd->pmu_base + NI_PMOVSCLR); @@ -476,6 +503,25 @@ static irqreturn_t arm_ni_handle_irq(int irq, void *dev_id) } } +static void __iomem *arm_ni_get_pmusel(struct arm_ni *ni, void __iomem *unit_base) +{ + u32 type, ptr, num; + + if (arm_ni_is_7xx(ni)) + return unit_base + NI700_PMUSELA; + + num = readl_relaxed(unit_base + NI_NUM_SUB_FEATURES); + for (int i = 0; i < num; i++) { + type = readl_relaxed(unit_base + NI_SUB_FEATURE_TYPE(i)); + if (type != NI_SUB_FEATURE_TYPE_FCU) + continue; + ptr = readl_relaxed(unit_base + NI_SUB_FEATURE_PTR(i)); + return ni->base + ptr; + } + /* Should be impossible */ + return NULL; +} + static int arm_ni_init_cd(struct arm_ni *ni, struct arm_ni_node *node, u64 res_start) { struct arm_ni_cd *cd = ni->cds + node->id; @@ -512,13 +558,18 @@ static int arm_ni_init_cd(struct arm_ni *ni, struct arm_ni_node *node, u64 res_s case NI_HSNI: case NI_HMNI: case NI_PMNI: - unit->pmusela = unit_base + NI700_PMUSELA; + case NI_TSNI: + case NI_TMNI: + case NI_CMNI: + unit->pmusela = arm_ni_get_pmusel(ni, unit_base); writel_relaxed(1, unit->pmusela); if (readl_relaxed(unit->pmusela) != 1) dev_info(ni->dev, "No access to node 0x%04x%04x\n", unit->id, unit->type); else unit->ns = true; break; + case NI_MCN: + break; default: /* * e.g. FMU - thankfully bits 3:2 of FMU_ERR_FR0 are RES0 so @@ -649,6 +700,8 @@ static int arm_ni_probe(struct platform_device *pdev) switch (part) { case PART_NI_700: case PART_NI_710AE: + case PART_NOC_S3: + case PART_SI_L1: break; default: dev_WARN(&pdev->dev, "Unknown part number: 0x%03x, this may go badly\n", part); diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index efa9b229e701..34430b68f602 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -322,14 +322,14 @@ static struct arm_cspmu_impl_match impl_match[] = { { .module_name = "nvidia_cspmu", .pmiidr_val = ARM_CSPMU_IMPL_ID_NVIDIA, - .pmiidr_mask = ARM_CSPMU_PMIIDR_IMPLEMENTER, + .pmiidr_mask = PMIIDR_IMPLEMENTER, .module = NULL, .impl_init_ops = NULL, }, { .module_name = "ampere_cspmu", .pmiidr_val = ARM_CSPMU_IMPL_ID_AMPERE, - .pmiidr_mask = ARM_CSPMU_PMIIDR_IMPLEMENTER, + .pmiidr_mask = PMIIDR_IMPLEMENTER, .module = NULL, .impl_init_ops = NULL, }, @@ -351,6 +351,44 @@ static struct arm_cspmu_impl_match *arm_cspmu_impl_match_get(u32 pmiidr) return NULL; } +static u32 arm_cspmu_get_pmiidr(struct arm_cspmu *cspmu) +{ + u32 pmiidr, pmpidr; + + pmiidr = readl(cspmu->base0 + PMIIDR); + + if (pmiidr != 0) + return pmiidr; + + /* Construct PMIIDR value from PMPIDRs. 
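+ *
+ * This assumes the standard CoreSight peripheral ID layout: the JEP106
+ * designer code arrives split across PMPIDR1.DES_0, PMPIDR2.DES_1 and
+ * PMPIDR4.DES_2, the part number across PMPIDR0.PART_0 and
+ * PMPIDR1.PART_1, and revision data as PMPIDR2.REVISION (feeding
+ * PMIIDR.VARIANT) and PMPIDR3.REVAND (feeding PMIIDR.REVISION); the
+ * FIELD_GET()/FIELD_PREP() chain below repacks those pieces into the
+ * corresponding PMIIDR fields.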
*/ + + pmpidr = readl(cspmu->base0 + PMPIDR0); + pmiidr |= FIELD_PREP(PMIIDR_PRODUCTID_PART_0, + FIELD_GET(PMPIDR0_PART_0, pmpidr)); + + pmpidr = readl(cspmu->base0 + PMPIDR1); + pmiidr |= FIELD_PREP(PMIIDR_PRODUCTID_PART_1, + FIELD_GET(PMPIDR1_PART_1, pmpidr)); + pmiidr |= FIELD_PREP(PMIIDR_IMPLEMENTER_DES_0, + FIELD_GET(PMPIDR1_DES_0, pmpidr)); + + pmpidr = readl(cspmu->base0 + PMPIDR2); + pmiidr |= FIELD_PREP(PMIIDR_VARIANT, + FIELD_GET(PMPIDR2_REVISION, pmpidr)); + pmiidr |= FIELD_PREP(PMIIDR_IMPLEMENTER_DES_1, + FIELD_GET(PMPIDR2_DES_1, pmpidr)); + + pmpidr = readl(cspmu->base0 + PMPIDR3); + pmiidr |= FIELD_PREP(PMIIDR_REVISION, + FIELD_GET(PMPIDR3_REVAND, pmpidr)); + + pmpidr = readl(cspmu->base0 + PMPIDR4); + pmiidr |= FIELD_PREP(PMIIDR_IMPLEMENTER_DES_2, + FIELD_GET(PMPIDR4_DES_2, pmpidr)); + + return pmiidr; +} + #define DEFAULT_IMPL_OP(name) .name = arm_cspmu_##name static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) @@ -361,7 +399,7 @@ static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) /* Start with a default PMU implementation */ cspmu->impl.module = THIS_MODULE; - cspmu->impl.pmiidr = readl(cspmu->base0 + PMIIDR); + cspmu->impl.pmiidr = arm_cspmu_get_pmiidr(cspmu); cspmu->impl.ops = (struct arm_cspmu_impl_ops) { DEFAULT_IMPL_OP(get_event_attrs), DEFAULT_IMPL_OP(get_format_attrs), @@ -815,6 +853,10 @@ static void arm_cspmu_stop(struct perf_event *event, int pmu_flags) return; arm_cspmu_disable_counter(cspmu, hwc->idx); + + if (cspmu->impl.ops.reset_ev_filter) + cspmu->impl.ops.reset_ev_filter(cspmu, event); + arm_cspmu_event_update(event); hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; @@ -1365,8 +1407,10 @@ void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match) /* Unbind the driver from all matching backend devices. 
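 */

Note that driver_find_device() returns each matching device with a
reference held, so the unbind loop below must pair every
device_release_driver() with a put_device(); without it each iteration
leaks a device reference.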
*/ while ((dev = driver_find_device(&arm_cspmu_driver.driver, NULL, - match, arm_cspmu_match_device))) + match, arm_cspmu_match_device))) { device_release_driver(dev); + put_device(dev); + } mutex_lock(&arm_cspmu_lock); diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index 19684b76bd96..cd65a58dbd88 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -86,6 +86,11 @@ #define PMCFGR 0xE00 #define PMCR 0xE04 #define PMIIDR 0xE08 +#define PMPIDR0 0xFE0 +#define PMPIDR1 0xFE4 +#define PMPIDR2 0xFE8 +#define PMPIDR3 0xFEC +#define PMPIDR4 0xFD0 /* PMCFGR register field */ #define PMCFGR_NCG GENMASK(31, 28) @@ -115,8 +120,34 @@ #define PMCR_E BIT(0) /* PMIIDR register field */ -#define ARM_CSPMU_PMIIDR_IMPLEMENTER GENMASK(11, 0) -#define ARM_CSPMU_PMIIDR_PRODUCTID GENMASK(31, 20) +#define PMIIDR_IMPLEMENTER GENMASK(11, 0) +#define PMIIDR_IMPLEMENTER_DES_0 GENMASK(3, 0) +#define PMIIDR_IMPLEMENTER_DES_1 GENMASK(6, 4) +#define PMIIDR_IMPLEMENTER_DES_2 GENMASK(11, 8) +#define PMIIDR_REVISION GENMASK(15, 12) +#define PMIIDR_VARIANT GENMASK(19, 16) +#define PMIIDR_PRODUCTID GENMASK(31, 20) +#define PMIIDR_PRODUCTID_PART_0 GENMASK(27, 20) +#define PMIIDR_PRODUCTID_PART_1 GENMASK(31, 28) + +/* PMPIDR0 register field */ +#define PMPIDR0_PART_0 GENMASK(7, 0) + +/* PMPIDR1 register field */ +#define PMPIDR1_DES_0 GENMASK(7, 4) +#define PMPIDR1_PART_1 GENMASK(3, 0) + +/* PMPIDR2 register field */ +#define PMPIDR2_REVISION GENMASK(7, 4) +#define PMPIDR2_DES_1 GENMASK(2, 0) + +/* PMPIDR3 register field */ +#define PMPIDR3_REVAND GENMASK(7, 4) +#define PMPIDR3_CMOD GENMASK(3, 0) + +/* PMPIDR4 register field */ +#define PMPIDR4_SIZE GENMASK(7, 4) +#define PMPIDR4_DES_2 GENMASK(3, 0) /* JEDEC-assigned JEP106 identification code */ #define ARM_CSPMU_IMPL_ID_NVIDIA 0x36B @@ -152,11 +183,13 @@ struct arm_cspmu_impl_ops { bool (*is_cycle_counter_event)(const struct perf_event *event); /* Decode event type/id from configs */ u32 (*event_type)(const struct perf_event *event); - /* Set event filters */ + /* Set/reset event filters */ void (*set_cc_filter)(struct arm_cspmu *cspmu, const struct perf_event *event); void (*set_ev_filter)(struct arm_cspmu *cspmu, const struct perf_event *event); + void (*reset_ev_filter)(struct arm_cspmu *cspmu, + const struct perf_event *event); /* Implementation specific event validation */ int (*validate_event)(struct arm_cspmu *cspmu, struct perf_event *event); diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index dc6d4e3e2a1b..e06a06d3407b 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -23,7 +23,7 @@ #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) -#define NV_PRODID_MASK GENMASK(31, 0) +#define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) #define NV_FORMAT_NAME_GENERIC 0 @@ -40,10 +40,21 @@ struct nv_cspmu_ctx { const char *name; - u32 filter_mask; - u32 filter_default_val; + struct attribute **event_attr; struct attribute **format_attr; + + u32 filter_mask; + u32 filter_default_val; + u32 filter2_mask; + u32 filter2_default_val; + + u32 (*get_filter)(const struct perf_event *event); + u32 (*get_filter2)(const struct perf_event *event); + + void *data; + + int (*init_data)(struct arm_cspmu *cspmu); }; static struct attribute *scf_pmu_event_attrs[] = { @@ -144,6 +155,7 @@ static struct attribute *cnvlink_pmu_format_attrs[] = { static struct attribute *generic_pmu_format_attrs[] = { 
ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, + ARM_CSPMU_FORMAT_FILTER2_ATTR, NULL, }; @@ -184,13 +196,36 @@ static u32 nv_cspmu_event_filter(const struct perf_event *event) return filter_val; } +static u32 nv_cspmu_event_filter2(const struct perf_event *event) +{ + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + + const u32 filter_val = event->attr.config2 & ctx->filter2_mask; + + if (filter_val == 0) + return ctx->filter2_default_val; + + return filter_val; +} + static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, const struct perf_event *event) { - u32 filter = nv_cspmu_event_filter(event); - u32 offset = PMEVFILTR + (4 * event->hw.idx); + u32 filter, offset; + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + offset = 4 * event->hw.idx; + + if (ctx->get_filter) { + filter = ctx->get_filter(event); + writel(filter, cspmu->base0 + PMEVFILTR + offset); + } - writel(filter, cspmu->base0 + offset); + if (ctx->get_filter2) { + filter = ctx->get_filter2(event); + writel(filter, cspmu->base0 + PMEVFILT2R + offset); + } } static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, @@ -210,74 +245,120 @@ enum nv_cspmu_name_fmt { struct nv_cspmu_match { u32 prodid; u32 prodid_mask; - u64 filter_mask; - u32 filter_default_val; const char *name_pattern; enum nv_cspmu_name_fmt name_fmt; - struct attribute **event_attr; - struct attribute **format_attr; + struct nv_cspmu_ctx template_ctx; + struct arm_cspmu_impl_ops ops; }; static const struct nv_cspmu_match nv_cspmu_match[] = { { - .prodid = 0x103, + .prodid = 0x10300000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = NV_PCIE_FILTER_ID_MASK, - .filter_default_val = NV_PCIE_FILTER_ID_MASK, .name_pattern = "nvidia_pcie_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = mcf_pmu_event_attrs, - .format_attr = pcie_pmu_format_attrs + .template_ctx = { + .event_attr = mcf_pmu_event_attrs, + .format_attr = pcie_pmu_format_attrs, + .filter_mask = NV_PCIE_FILTER_ID_MASK, + .filter_default_val = NV_PCIE_FILTER_ID_MASK, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { - .prodid = 0x104, + .prodid = 0x10400000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, - .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, .name_pattern = "nvidia_nvlink_c2c1_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = mcf_pmu_event_attrs, - .format_attr = nvlink_c2c_pmu_format_attrs + .template_ctx = { + .event_attr = mcf_pmu_event_attrs, + .format_attr = nvlink_c2c_pmu_format_attrs, + .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, + .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { - .prodid = 0x105, + .prodid = 0x10500000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, - .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, .name_pattern = "nvidia_nvlink_c2c0_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = mcf_pmu_event_attrs, - .format_attr = nvlink_c2c_pmu_format_attrs + .template_ctx = { + .event_attr = mcf_pmu_event_attrs, + .format_attr = nvlink_c2c_pmu_format_attrs, + .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, + .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + 
.data = NULL, + .init_data = NULL + }, }, { - .prodid = 0x106, + .prodid = 0x10600000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = NV_CNVL_FILTER_ID_MASK, - .filter_default_val = NV_CNVL_FILTER_ID_MASK, .name_pattern = "nvidia_cnvlink_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = mcf_pmu_event_attrs, - .format_attr = cnvlink_pmu_format_attrs + .template_ctx = { + .event_attr = mcf_pmu_event_attrs, + .format_attr = cnvlink_pmu_format_attrs, + .filter_mask = NV_CNVL_FILTER_ID_MASK, + .filter_default_val = NV_CNVL_FILTER_ID_MASK, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { - .prodid = 0x2CF, + .prodid = 0x2CF00000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = 0x0, - .filter_default_val = 0x0, .name_pattern = "nvidia_scf_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = scf_pmu_event_attrs, - .format_attr = scf_pmu_format_attrs + .template_ctx = { + .event_attr = scf_pmu_event_attrs, + .format_attr = scf_pmu_format_attrs, + .filter_mask = 0x0, + .filter_default_val = 0x0, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { .prodid = 0, .prodid_mask = 0, - .filter_mask = NV_GENERIC_FILTER_ID_MASK, - .filter_default_val = NV_GENERIC_FILTER_ID_MASK, .name_pattern = "nvidia_uncore_pmu_%u", .name_fmt = NAME_FMT_GENERIC, - .event_attr = generic_pmu_event_attrs, - .format_attr = generic_pmu_format_attrs + .template_ctx = { + .event_attr = generic_pmu_event_attrs, + .format_attr = generic_pmu_format_attrs, + .filter_mask = NV_GENERIC_FILTER_ID_MASK, + .filter_default_val = NV_GENERIC_FILTER_ID_MASK, + .filter2_mask = NV_GENERIC_FILTER_ID_MASK, + .filter2_default_val = NV_GENERIC_FILTER_ID_MASK, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = nv_cspmu_event_filter2, + .data = NULL, + .init_data = NULL + }, }, }; @@ -310,9 +391,16 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, return name; } +#define SET_OP(name, impl, match, default_op) \ + do { \ + if (match->ops.name) \ + impl->name = match->ops.name; \ + else if (default_op != NULL) \ + impl->name = default_op; \ + } while (false) + static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) { - u32 prodid; struct nv_cspmu_ctx *ctx; struct device *dev = cspmu->dev; struct arm_cspmu_impl_ops *impl_ops = &cspmu->impl.ops; @@ -322,30 +410,30 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) if (!ctx) return -ENOMEM; - prodid = FIELD_GET(ARM_CSPMU_PMIIDR_PRODUCTID, cspmu->impl.pmiidr); - /* Find matching PMU. */ for (; match->prodid; match++) { const u32 prodid_mask = match->prodid_mask; - if ((match->prodid & prodid_mask) == (prodid & prodid_mask)) + if ((match->prodid & prodid_mask) == + (cspmu->impl.pmiidr & prodid_mask)) break; } - ctx->name = nv_cspmu_format_name(cspmu, match); - ctx->filter_mask = match->filter_mask; - ctx->filter_default_val = match->filter_default_val; - ctx->event_attr = match->event_attr; - ctx->format_attr = match->format_attr; + /* Initialize the context with the matched template. */ + memcpy(ctx, &match->template_ctx, sizeof(struct nv_cspmu_ctx)); + ctx->name = nv_cspmu_format_name(cspmu, match); cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. 
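 *
 * SET_OP() installs the per-match override from ops when the match
 * entry provides one and otherwise falls back to the generic
 * nv_cspmu_* implementation, so future nv_cspmu_match entries only
 * need to populate the callbacks they actually customize.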
*/ - impl_ops->set_cc_filter = nv_cspmu_set_cc_filter; - impl_ops->set_ev_filter = nv_cspmu_set_ev_filter; - impl_ops->get_event_attrs = nv_cspmu_get_event_attrs; - impl_ops->get_format_attrs = nv_cspmu_get_format_attrs; - impl_ops->get_name = nv_cspmu_get_name; + SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); + SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); + SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs); + SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs); + SET_OP(get_name, impl_ops, match, nv_cspmu_get_name); + + if (ctx->init_data) + return ctx->init_data(cspmu); return 0; } diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index f7abd1333963..973a027d9063 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -928,6 +928,12 @@ int armpmu_register(struct arm_pmu *pmu) if (ret) return ret; + /* + * By this stage we know our supported CPUs on either DT/ACPI platforms, + * detect the SMT implementation. + */ + pmu->has_smt = topology_core_has_smt(cpumask_first(&pmu->supported_cpus)); + if (!pmu->set_event_filter) pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index ca8d706d4022..8014ff766cff 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -981,6 +981,7 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, struct perf_event *event) { + struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; @@ -1001,6 +1002,15 @@ static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, if (has_branch_stack(event)) return false; + /* + * The PMCCNTR_EL0 increments from the processor clock rather than + * the PE clock (ARM DDI0487 L.b D13.1.3) which means it'll continue + * counting on a WFI PE if one of its SMT sibling is not idle on a + * multi-threaded implementation. So don't use it on SMT cores. 
+ */
+ */ + if (cpu_pmu->has_smt) + return false; + return true; } @@ -1465,6 +1475,10 @@ static int name##_pmu_init(struct arm_pmu *cpu_pmu) \ PMUV3_INIT_SIMPLE(armv8_pmuv3) +PMUV3_INIT_SIMPLE(armv8_c1_nano) +PMUV3_INIT_SIMPLE(armv8_c1_premium) +PMUV3_INIT_SIMPLE(armv8_c1_pro) +PMUV3_INIT_SIMPLE(armv8_c1_ultra) PMUV3_INIT_SIMPLE(armv8_cortex_a34) PMUV3_INIT_SIMPLE(armv8_cortex_a55) PMUV3_INIT_SIMPLE(armv8_cortex_a65) @@ -1472,11 +1486,14 @@ PMUV3_INIT_SIMPLE(armv8_cortex_a75) PMUV3_INIT_SIMPLE(armv8_cortex_a76) PMUV3_INIT_SIMPLE(armv8_cortex_a77) PMUV3_INIT_SIMPLE(armv8_cortex_a78) +PMUV3_INIT_SIMPLE(armv9_cortex_a320) PMUV3_INIT_SIMPLE(armv9_cortex_a510) PMUV3_INIT_SIMPLE(armv9_cortex_a520) +PMUV3_INIT_SIMPLE(armv9_cortex_a520ae) PMUV3_INIT_SIMPLE(armv9_cortex_a710) PMUV3_INIT_SIMPLE(armv9_cortex_a715) PMUV3_INIT_SIMPLE(armv9_cortex_a720) +PMUV3_INIT_SIMPLE(armv9_cortex_a720ae) PMUV3_INIT_SIMPLE(armv9_cortex_a725) PMUV3_INIT_SIMPLE(armv8_cortex_x1) PMUV3_INIT_SIMPLE(armv9_cortex_x2) @@ -1508,6 +1525,10 @@ PMUV3_INIT_MAP_EVENT(armv8_brcm_vulcan, armv8_vulcan_map_event) static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_pmu_init}, + {.compatible = "arm,c1-nano-pmu", .data = armv8_c1_nano_pmu_init}, + {.compatible = "arm,c1-premium-pmu", .data = armv8_c1_premium_pmu_init}, + {.compatible = "arm,c1-pro-pmu", .data = armv8_c1_pro_pmu_init}, + {.compatible = "arm,c1-ultra-pmu", .data = armv8_c1_ultra_pmu_init}, {.compatible = "arm,cortex-a34-pmu", .data = armv8_cortex_a34_pmu_init}, {.compatible = "arm,cortex-a35-pmu", .data = armv8_cortex_a35_pmu_init}, {.compatible = "arm,cortex-a53-pmu", .data = armv8_cortex_a53_pmu_init}, @@ -1520,11 +1541,14 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,cortex-a76-pmu", .data = armv8_cortex_a76_pmu_init}, {.compatible = "arm,cortex-a77-pmu", .data = armv8_cortex_a77_pmu_init}, {.compatible = "arm,cortex-a78-pmu", .data = armv8_cortex_a78_pmu_init}, + {.compatible = "arm,cortex-a320-pmu", .data = armv9_cortex_a320_pmu_init}, {.compatible = "arm,cortex-a510-pmu", .data = armv9_cortex_a510_pmu_init}, {.compatible = "arm,cortex-a520-pmu", .data = armv9_cortex_a520_pmu_init}, + {.compatible = "arm,cortex-a520ae-pmu", .data = armv9_cortex_a520ae_pmu_init}, {.compatible = "arm,cortex-a710-pmu", .data = armv9_cortex_a710_pmu_init}, {.compatible = "arm,cortex-a715-pmu", .data = armv9_cortex_a715_pmu_init}, {.compatible = "arm,cortex-a720-pmu", .data = armv9_cortex_a720_pmu_init}, + {.compatible = "arm,cortex-a720ae-pmu", .data = armv9_cortex_a720ae_pmu_init}, {.compatible = "arm,cortex-a725-pmu", .data = armv9_cortex_a725_pmu_init}, {.compatible = "arm,cortex-x1-pmu", .data = armv8_cortex_x1_pmu_init}, {.compatible = "arm,cortex-x2-pmu", .data = armv9_cortex_x2_pmu_init}, diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 87908f0712c0..4801115f2b54 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -87,6 +87,7 @@ struct arm_spe_pmu { #define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) #define SPE_PMU_FEAT_DISCARD (1UL << 7) #define SPE_PMU_FEAT_EFT (1UL << 8) +#define SPE_PMU_FEAT_FDS (1UL << 9) #define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features; @@ -252,6 +253,10 @@ static const struct attribute_group arm_spe_pmu_cap_group = { #define ATTR_CFG_FLD_inv_event_filter_LO 0 #define ATTR_CFG_FLD_inv_event_filter_HI 63 +#define ATTR_CFG_FLD_inv_data_src_filter_CFG config4 /* inverse of PMSDSFR_EL1 */ +#define 
ATTR_CFG_FLD_inv_data_src_filter_LO 0 +#define ATTR_CFG_FLD_inv_data_src_filter_HI 63 + GEN_PMU_FORMAT_ATTR(ts_enable); GEN_PMU_FORMAT_ATTR(pa_enable); GEN_PMU_FORMAT_ATTR(pct_enable); @@ -268,6 +273,7 @@ GEN_PMU_FORMAT_ATTR(float_filter); GEN_PMU_FORMAT_ATTR(float_filter_mask); GEN_PMU_FORMAT_ATTR(event_filter); GEN_PMU_FORMAT_ATTR(inv_event_filter); +GEN_PMU_FORMAT_ATTR(inv_data_src_filter); GEN_PMU_FORMAT_ATTR(min_latency); GEN_PMU_FORMAT_ATTR(discard); @@ -288,6 +294,7 @@ static struct attribute *arm_spe_pmu_formats_attr[] = { &format_attr_float_filter_mask.attr, &format_attr_event_filter.attr, &format_attr_inv_event_filter.attr, + &format_attr_inv_data_src_filter.attr, &format_attr_min_latency.attr, &format_attr_discard.attr, NULL, @@ -306,6 +313,10 @@ static umode_t arm_spe_pmu_format_attr_is_visible(struct kobject *kobj, if (attr == &format_attr_inv_event_filter.attr && !(spe_pmu->features & SPE_PMU_FEAT_INV_FILT_EVT)) return 0; + if (attr == &format_attr_inv_data_src_filter.attr && + !(spe_pmu->features & SPE_PMU_FEAT_FDS)) + return 0; + if ((attr == &format_attr_branch_filter_mask.attr || attr == &format_attr_load_filter_mask.attr || attr == &format_attr_store_filter_mask.attr || @@ -430,6 +441,9 @@ static u64 arm_spe_event_to_pmsfcr(struct perf_event *event) if (ATTR_CFG_GET_FLD(attr, inv_event_filter)) reg |= PMSFCR_EL1_FnE; + if (ATTR_CFG_GET_FLD(attr, inv_data_src_filter)) + reg |= PMSFCR_EL1_FDS; + if (ATTR_CFG_GET_FLD(attr, min_latency)) reg |= PMSFCR_EL1_FL; @@ -454,6 +468,17 @@ static u64 arm_spe_event_to_pmslatfr(struct perf_event *event) return FIELD_PREP(PMSLATFR_EL1_MINLAT, ATTR_CFG_GET_FLD(attr, min_latency)); } +static u64 arm_spe_event_to_pmsdsfr(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + /* + * Data src filter is inverted so that the default value of 0 is + * equivalent to no filtering. 
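+ *
+ * Assuming PMSDSFR_EL1 carries one pass bit per data-source encoding,
+ * a user who sets inv_data_src_filter = BIT(n) programs the register
+ * to ~BIT(n) and filters out only source n, while leaving config4 at
+ * zero programs all ones and keeps every source.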
+ */ + return ~ATTR_CFG_GET_FLD(attr, inv_data_src_filter); +} + static void arm_spe_pmu_pad_buf(struct perf_output_handle *handle, int len) { struct arm_spe_pmu_buf *buf = perf_get_aux(handle); @@ -791,6 +816,10 @@ static int arm_spe_pmu_event_init(struct perf_event *event) if (arm_spe_event_to_pmsnevfr(event) & spe_pmu->pmsevfr_res0) return -EOPNOTSUPP; + if (arm_spe_event_to_pmsdsfr(event) != U64_MAX && + !(spe_pmu->features & SPE_PMU_FEAT_FDS)) + return -EOPNOTSUPP; + if (attr->exclude_idle) return -EOPNOTSUPP; @@ -866,6 +895,11 @@ static void arm_spe_pmu_start(struct perf_event *event, int flags) write_sysreg_s(reg, SYS_PMSNEVFR_EL1); } + if (spe_pmu->features & SPE_PMU_FEAT_FDS) { + reg = arm_spe_event_to_pmsdsfr(event); + write_sysreg_s(reg, SYS_PMSDSFR_EL1); + } + reg = arm_spe_event_to_pmslatfr(event); write_sysreg_s(reg, SYS_PMSLATFR_EL1); @@ -1125,6 +1159,9 @@ static void __arm_spe_pmu_dev_probe(void *info) if (FIELD_GET(PMSIDR_EL1_EFT, reg)) spe_pmu->features |= SPE_PMU_FEAT_EFT; + if (FIELD_GET(PMSIDR_EL1_FDS, reg)) + spe_pmu->features |= SPE_PMU_FEAT_FDS; + /* This field has a spaced out encoding, so just use a look-up */ fld = FIELD_GET(PMSIDR_EL1_INTERVAL, reg); switch (fld) { diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index b989ffa95d69..bcdf5575d71c 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -5,6 +5,7 @@ */ #include <linux/bitfield.h> +#include <linux/clk.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> @@ -52,18 +53,27 @@ #define to_ddr_pmu(p) container_of(p, struct ddr_pmu, pmu) #define DDR_PERF_DEV_NAME "imx8_ddr" +#define DB_PERF_DEV_NAME "imx8_db" #define DDR_CPUHP_CB_NAME DDR_PERF_DEV_NAME "_perf_pmu" static DEFINE_IDA(ddr_ida); +static DEFINE_IDA(db_ida); /* DDR Perf hardware feature */ #define DDR_CAP_AXI_ID_FILTER 0x1 /* support AXI ID filter */ #define DDR_CAP_AXI_ID_FILTER_ENHANCED 0x3 /* support enhanced AXI ID filter */ #define DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER 0x4 /* support AXI ID PORT CHANNEL filter */ +/* Perf type */ +enum fsl_ddr_type { + DDR_PERF_TYPE = 0, /* ddr Perf (default) */ + DB_PERF_TYPE, /* db Perf */ +}; + struct fsl_ddr_devtype_data { unsigned int quirks; /* quirks needed for different DDR Perf core */ const char *identifier; /* system PMU identifier for userspace */ + enum fsl_ddr_type type; /* types of Perf, ddr or db */ }; static const struct fsl_ddr_devtype_data imx8_devtype_data; @@ -97,6 +107,12 @@ static const struct fsl_ddr_devtype_data imx8dxl_devtype_data = { .identifier = "i.MX8DXL", }; +static const struct fsl_ddr_devtype_data imx8dxl_db_devtype_data = { + .quirks = DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER, + .identifier = "i.MX8DXL", + .type = DB_PERF_TYPE, +}; + static const struct of_device_id imx_ddr_pmu_dt_ids[] = { { .compatible = "fsl,imx8-ddr-pmu", .data = &imx8_devtype_data}, { .compatible = "fsl,imx8m-ddr-pmu", .data = &imx8m_devtype_data}, @@ -105,6 +121,7 @@ static const struct of_device_id imx_ddr_pmu_dt_ids[] = { { .compatible = "fsl,imx8mn-ddr-pmu", .data = &imx8mn_devtype_data}, { .compatible = "fsl,imx8mp-ddr-pmu", .data = &imx8mp_devtype_data}, { .compatible = "fsl,imx8dxl-ddr-pmu", .data = &imx8dxl_devtype_data}, + { .compatible = "fsl,imx8dxl-db-pmu", .data = &imx8dxl_db_devtype_data}, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, imx_ddr_pmu_dt_ids); @@ -284,9 +301,37 @@ static struct attribute *ddr_perf_events_attrs[] = { NULL, }; +static const int ddr_perf_db_visible_event_list[] = { + EVENT_CYCLES_ID, 
+ 0x41, + 0x42, +}; + +static umode_t ddr_perf_events_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct ddr_pmu *pmu = dev_get_drvdata(dev); + struct perf_pmu_events_attr *pmu_attr; + unsigned int i; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); + + if (pmu->devtype_data->type == DDR_PERF_TYPE) + return attr->mode; + + /* DB Type */ + for (i = 0; i < ARRAY_SIZE(ddr_perf_db_visible_event_list); i++) + if (pmu_attr->id == ddr_perf_db_visible_event_list[i]) + return attr->mode; + + return 0; +} + static const struct attribute_group ddr_perf_events_attr_group = { .name = "events", .attrs = ddr_perf_events_attrs, + .is_visible = ddr_perf_events_attrs_is_visible, }; PMU_FORMAT_ATTR(event, "config:0-7"); @@ -645,8 +690,8 @@ static void ddr_perf_pmu_disable(struct pmu *pmu) { } -static int ddr_perf_init(struct ddr_pmu *pmu, void __iomem *base, - struct device *dev) +static void ddr_perf_init(struct ddr_pmu *pmu, void __iomem *base, + struct device *dev) { *pmu = (struct ddr_pmu) { .pmu = (struct pmu) { @@ -667,9 +712,6 @@ static int ddr_perf_init(struct ddr_pmu *pmu, void __iomem *base, .base = base, .dev = dev, }; - - pmu->id = ida_alloc(&ddr_ida, GFP_KERNEL); - return pmu->id; } static irqreturn_t ddr_perf_irq_handler(int irq, void *p) @@ -735,10 +777,13 @@ static int ddr_perf_offline_cpu(unsigned int cpu, struct hlist_node *node) static int ddr_perf_probe(struct platform_device *pdev) { + struct clk_bulk_data *clks; struct ddr_pmu *pmu; struct device_node *np; void __iomem *base; + struct ida *ida; char *name; + int nclks; int num; int ret; int irq; @@ -753,19 +798,33 @@ static int ddr_perf_probe(struct platform_device *pdev) if (!pmu) return -ENOMEM; - num = ddr_perf_init(pmu, base, &pdev->dev); + ddr_perf_init(pmu, base, &pdev->dev); platform_set_drvdata(pdev, pmu); - name = devm_kasprintf(&pdev->dev, GFP_KERNEL, DDR_PERF_DEV_NAME "%d", - num); + nclks = devm_clk_bulk_get_all_enabled(&pdev->dev, &clks); + if (nclks < 0) + return dev_err_probe(&pdev->dev, nclks, "Failure get clks\n"); + + pmu->devtype_data = of_device_get_match_data(&pdev->dev); + + ida = pmu->devtype_data->type == DDR_PERF_TYPE ? 
&ddr_ida : &db_ida; + num = ida_alloc(ida, GFP_KERNEL); + if (num < 0) + return num; + + pmu->id = num; + + if (pmu->devtype_data->type == DDR_PERF_TYPE) + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, DDR_PERF_DEV_NAME "%d", num); + else + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, DB_PERF_DEV_NAME "%d", num); + if (!name) { ret = -ENOMEM; - goto cpuhp_state_err; + goto idr_free; } - pmu->devtype_data = of_device_get_match_data(&pdev->dev); - pmu->cpu = raw_smp_processor_id(); ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DDR_CPUHP_CB_NAME, @@ -774,7 +833,7 @@ static int ddr_perf_probe(struct platform_device *pdev) if (ret < 0) { dev_err(&pdev->dev, "cpuhp_setup_state_multi failed\n"); - goto cpuhp_state_err; + goto idr_free; } pmu->cpuhp_state = ret; @@ -821,8 +880,8 @@ ddr_perf_err: cpuhp_state_remove_instance_nocalls(pmu->cpuhp_state, &pmu->node); cpuhp_instance_err: cpuhp_remove_multi_state(pmu->cpuhp_state); -cpuhp_state_err: - ida_free(&ddr_ida, pmu->id); +idr_free: + ida_free(ida, pmu->id); dev_warn(&pdev->dev, "i.MX8 DDR Perf PMU failed (%d), disabled\n", ret); return ret; } @@ -836,7 +895,11 @@ static void ddr_perf_remove(struct platform_device *pdev) perf_pmu_unregister(&pmu->pmu); - ida_free(&ddr_ida, pmu->id); + if (pmu->devtype_data->type == DDR_PERF_TYPE) + ida_free(&ddr_ida, pmu->id); + else + ida_free(&db_ida, pmu->id); + } static struct platform_driver imx_ddr_pmu_driver = { diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 61c2277c9ce3..4fd546ef0448 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -1425,8 +1425,14 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, return; } - /* Choose the deepest state when suspending */ - genpd->state_idx = genpd->state_count - 1; + if (genpd->gov && genpd->gov->system_power_down_ok) { + if (!genpd->gov->system_power_down_ok(&genpd->domain)) + return; + } else { + /* Default to the deepest state. */ + genpd->state_idx = genpd->state_count - 1; + } + if (_genpd_power_off(genpd, false)) { genpd->states[genpd->state_idx].rejected++; return; diff --git a/drivers/pmdomain/governor.c b/drivers/pmdomain/governor.c index 39359811a930..05e68680f34b 100644 --- a/drivers/pmdomain/governor.c +++ b/drivers/pmdomain/governor.c @@ -351,7 +351,7 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) ktime_t domain_wakeup, next_hrtimer; ktime_t now = ktime_get(); struct device *cpu_dev; - s64 cpu_constraint, global_constraint; + s64 cpu_constraint, global_constraint, wakeup_constraint; s64 idle_duration_ns; int cpu, i; @@ -362,7 +362,11 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) if (!(genpd->flags & GENPD_FLAG_CPU_DOMAIN)) return true; + wakeup_constraint = cpu_wakeup_latency_qos_limit(); global_constraint = cpu_latency_qos_limit(); + if (global_constraint > wakeup_constraint) + global_constraint = wakeup_constraint; + /* * Find the next wakeup for any of the online CPUs within the PM domain * and its subdomains. Note, we only need the genpd->cpus, as it already @@ -415,9 +419,36 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) return false; } +static bool cpu_system_power_down_ok(struct dev_pm_domain *pd) +{ + s64 constraint_ns = cpu_wakeup_latency_qos_limit() * NSEC_PER_USEC; + struct generic_pm_domain *genpd = pd_to_genpd(pd); + int state_idx = genpd->state_count - 1; + + if (!(genpd->flags & GENPD_FLAG_CPU_DOMAIN)) { + genpd->state_idx = state_idx; + return true; + } + + /* Find the deepest state for the latency constraint. 
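+ *
+ * genpd states are conventionally ordered from shallowest to deepest,
+ * so walking down from state_count - 1 picks the deepest state whose
+ * combined power_off + power_on latency still fits within the
+ * cpu_wakeup_latency QoS budget; if even the shallowest state is too
+ * slow, return false so the domain is left powered on.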
*/ + while (state_idx >= 0) { + s64 latency_ns = genpd->states[state_idx].power_off_latency_ns + + genpd->states[state_idx].power_on_latency_ns; + + if (latency_ns <= constraint_ns) { + genpd->state_idx = state_idx; + return true; + } + state_idx--; + } + + return false; +} + struct dev_power_governor pm_domain_cpu_gov = { .suspend_ok = default_suspend_ok, .power_down_ok = cpu_power_down_ok, + .system_power_down_ok = cpu_system_power_down_ok, }; #endif diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index 7de7aabb275e..05e9840bc3d4 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -150,6 +150,24 @@ static void pnp_device_shutdown(struct device *dev) drv->shutdown(pnp_dev); } +static int pnp_uevent(const struct device *dev, struct kobj_uevent_env *env) +{ + struct pnp_id *pos; + const struct pnp_dev *pnp_dev = to_pnp_dev(dev); + + if (!dev) + return -ENODEV; + + pos = pnp_dev->id; + while (pos) { + if (add_uevent_var(env, "MODALIAS=pnp:d%s", pos->id)) + return -ENOMEM; + pos = pos->next; + } + + return 0; +} + static int pnp_bus_match(struct device *dev, const struct device_driver *drv) { struct pnp_dev *pnp_dev = to_pnp_dev(dev); @@ -259,6 +277,7 @@ static const struct dev_pm_ops pnp_bus_dev_pm_ops = { const struct bus_type pnp_bus_type = { .name = "pnp", .match = pnp_bus_match, + .uevent = pnp_uevent, .probe = pnp_device_probe, .remove = pnp_device_remove, .shutdown = pnp_device_shutdown, diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index c7e7f9bf5313..b9d87e56cbbc 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -253,7 +253,8 @@ struct rapl_primitive_info { static void rapl_init_domains(struct rapl_package *rp); static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, - bool xlate, u64 *data); + bool xlate, u64 *data, + bool atomic); static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value); @@ -289,7 +290,7 @@ static int get_energy_counter(struct powercap_zone *power_zone, cpus_read_lock(); rd = power_zone_to_rapl_domain(power_zone); - if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { + if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now, false)) { *energy_raw = energy_now; cpus_read_unlock(); @@ -830,7 +831,8 @@ prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) * 63-------------------------- 31--------------------------- 0 */ static int rapl_read_data_raw(struct rapl_domain *rd, - enum rapl_primitives prim, bool xlate, u64 *data) + enum rapl_primitives prim, bool xlate, u64 *data, + bool atomic) { u64 value; enum rapl_primitives prim_fixed = prim_fixups(rd, prim); @@ -852,7 +854,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, ra.mask = rpi->mask; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, atomic)) { pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name); return -EIO; } @@ -904,7 +906,7 @@ static int rapl_read_pl_data(struct rapl_domain *rd, int pl, if (!is_pl_valid(rd, pl)) return -EINVAL; - return rapl_read_data_raw(rd, prim, xlate, data); + return rapl_read_data_raw(rd, prim, xlate, data, false); } static int rapl_write_pl_data(struct rapl_domain *rd, int pl, @@ -941,7 +943,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if 
(rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -969,7 +971,7 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -1156,7 +1158,7 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -1284,6 +1286,9 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), @@ -1325,7 +1330,7 @@ static void rapl_update_domain_data(struct rapl_package *rp) struct rapl_primitive_info *rpi = get_rpi(rp, prim); if (!rapl_read_data_raw(&rp->domains[dmn], prim, - rpi->unit, &val)) + rpi->unit, &val, false)) rp->domains[dmn].rdd.primitives[prim] = val; } } @@ -1425,7 +1430,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp) */ ra.mask = ENERGY_STATUS_MASK; - if (rp->priv->read_raw(get_rid(rp), &ra) || !ra.value) + if (rp->priv->read_raw(get_rid(rp), &ra, false) || !ra.value) return -ENODEV; return 0; @@ -1592,11 +1597,11 @@ static int get_pmu_cpu(struct rapl_package *rp) if (!rp->has_pmu) return nr_cpu_ids; - /* Only TPMI RAPL is supported for now */ - if (rp->priv->type != RAPL_IF_TPMI) + /* Only TPMI & MSR RAPL are supported for now */ + if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR) return nr_cpu_ids; - /* TPMI RAPL uses any CPU in the package for PMU */ + /* TPMI/MSR RAPL uses any CPU in the package for PMU */ for_each_online_cpu(cpu) if (topology_physical_package_id(cpu) == rp->id) return cpu; @@ -1609,11 +1614,11 @@ static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu) if (!rp->has_pmu) return false; - /* Only TPMI RAPL is supported for now */ - if (rp->priv->type != RAPL_IF_TPMI) + /* Only TPMI & MSR RAPL are supported for now */ + if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR) return false; - /* TPMI RAPL uses any CPU in the package for PMU */ + /* TPMI/MSR RAPL uses any CPU in the package for PMU */ return topology_physical_package_id(cpu) == rp->id; } @@ -1636,7 +1641,7 @@ static u64 event_read_counter(struct perf_event *event) if (event->hw.idx < 0) return 0; - ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val, true); /* Return 0 for failed read */ if (ret) diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 4ed06c71a3ac..0ce1096b6314 100644 --- 
a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -33,6 +33,8 @@ /* private data for RAPL MSR Interface */ static struct rapl_if_priv *rapl_msr_priv; +static bool rapl_msr_pmu __ro_after_init; + static struct rapl_if_priv rapl_msr_priv_intel = { .type = RAPL_IF_MSR, .reg_unit.msr = MSR_RAPL_POWER_UNIT, @@ -79,6 +81,8 @@ static int rapl_cpu_online(unsigned int cpu) rp = rapl_add_package_cpuslocked(cpu, rapl_msr_priv, true); if (IS_ERR(rp)) return PTR_ERR(rp); + if (rapl_msr_pmu) + rapl_package_add_pmu(rp); } cpumask_set_cpu(cpu, &rp->cpumask); return 0; @@ -95,19 +99,37 @@ static int rapl_cpu_down_prep(unsigned int cpu) cpumask_clear_cpu(cpu, &rp->cpumask); lead_cpu = cpumask_first(&rp->cpumask); - if (lead_cpu >= nr_cpu_ids) + if (lead_cpu >= nr_cpu_ids) { + if (rapl_msr_pmu) + rapl_package_remove_pmu(rp); rapl_remove_package_cpuslocked(rp); - else if (rp->lead_cpu == cpu) + } else if (rp->lead_cpu == cpu) { rp->lead_cpu = lead_cpu; + } + return 0; } -static int rapl_msr_read_raw(int cpu, struct reg_action *ra) +static int rapl_msr_read_raw(int cpu, struct reg_action *ra, bool atomic) { + /* + * When called from atomic-context (eg PMU event handler) + * perform MSR read directly using rdmsrq(). + */ + if (atomic) { + if (unlikely(smp_processor_id() != cpu)) + return -EIO; + + rdmsrq(ra->reg.msr, ra->value); + goto out; + } + if (rdmsrq_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu); return -EIO; } + +out: ra->value &= ra->mask; return 0; } @@ -151,6 +173,16 @@ static const struct x86_cpu_id pl4_support_ids[] = { X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL), X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL), + {} +}; + +/* List of MSR-based RAPL PMU support CPUs */ +static const struct x86_cpu_id pmu_support_ids[] = { + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), {} }; @@ -181,6 +213,11 @@ static int rapl_msr_probe(struct platform_device *pdev) pr_info("PL4 support detected.\n"); } + if (x86_match_cpu(pmu_support_ids)) { + rapl_msr_pmu = true; + pr_info("MSR-based RAPL PMU support enabled\n"); + } + rapl_msr_priv->control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); if (IS_ERR(rapl_msr_priv->control_type)) { pr_debug("failed to register powercap control_type.\n"); diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 82201bf4685d..0a0b85f4528b 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -60,7 +60,7 @@ static DEFINE_MUTEX(tpmi_rapl_lock); static struct powercap_control_type *tpmi_control_type; -static int tpmi_rapl_read_raw(int id, struct reg_action *ra) +static int tpmi_rapl_read_raw(int id, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) return -EINVAL; diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig new file mode 100644 index 000000000000..c808e0470394 --- /dev/null +++ b/drivers/resctrl/Kconfig @@ -0,0 +1,24 @@ +menuconfig ARM64_MPAM_DRIVER + bool "MPAM driver" + depends on ARM64 && ARM64_MPAM && EXPERT + help + Memory System Resource Partitioning and Monitoring (MPAM) driver for + System IP, e.g. caches and memory controllers. 
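+
+	  If unsure, say N.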
+ +if ARM64_MPAM_DRIVER + +config ARM64_MPAM_DRIVER_DEBUG + bool "Enable debug messages from the MPAM driver" + help + Say yes here to enable debug messages from the MPAM driver. + +config MPAM_KUNIT_TEST + bool "KUnit tests for MPAM driver " if !KUNIT_ALL_TESTS + depends on KUNIT=y + default KUNIT_ALL_TESTS + help + Enable this option to run tests in the MPAM driver. + + If unsure, say N. + +endif diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile new file mode 100644 index 000000000000..898199dcf80d --- /dev/null +++ b/drivers/resctrl/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o +mpam-y += mpam_devices.o + +ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c new file mode 100644 index 000000000000..0b5b158e1aaf --- /dev/null +++ b/drivers/resctrl/mpam_devices.c @@ -0,0 +1,2723 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include <linux/acpi.h> +#include <linux/atomic.h> +#include <linux/arm_mpam.h> +#include <linux/bitfield.h> +#include <linux/bitmap.h> +#include <linux/cacheinfo.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/device.h> +#include <linux/errno.h> +#include <linux/gfp.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> +#include <linux/platform_device.h> +#include <linux/printk.h> +#include <linux/srcu.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "mpam_internal.h" + +DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */ + +/* + * mpam_list_lock protects the SRCU lists when writing. Once the + * mpam_enabled key is enabled these lists are read-only, + * unless the error interrupt disables the driver. + */ +static DEFINE_MUTEX(mpam_list_lock); +static LIST_HEAD(mpam_all_msc); + +struct srcu_struct mpam_srcu; + +/* + * Number of MSCs that have been probed. Once all MSCs have been probed MPAM + * can be enabled. + */ +static atomic_t mpam_num_msc; + +static int mpam_cpuhp_state; +static DEFINE_MUTEX(mpam_cpuhp_state_lock); + +/* + * The smallest common values for any CPU or MSC in the system. + * Generating traffic outside this range will result in screaming interrupts. + */ +u16 mpam_partid_max; +u8 mpam_pmg_max; +static bool partid_max_init, partid_max_published; +static DEFINE_SPINLOCK(partid_max_lock); + +/* + * mpam is enabled once all devices have been probed from CPU online callbacks, + * scheduled via this work_struct. If access to an MSC depends on a CPU that + * was not brought online at boot, this can happen surprisingly late. + */ +static DECLARE_WORK(mpam_enable_work, &mpam_enable); + +/* + * All mpam error interrupts indicate a software bug. On receipt, disable the + * driver. + */ +static DECLARE_WORK(mpam_broken_work, &mpam_disable); + +/* When mpam is disabled, the printed reason to aid debugging */ +static char *mpam_disable_reason; + +/* + * An MSC is a physical container for controls and monitors, each identified by + * their RIS index. These share a base-address, interrupts and some MMIO + * registers. A vMSC is a virtual container for RIS in an MSC that control or + * monitor the same thing. Members of a vMSC are all RIS in the same MSC, but + * not all RIS in an MSC share a vMSC. 
+ * + * Components are a group of vMSC that control or monitor the same thing but + * are from different MSC, so have different base-address, interrupts etc. + * Classes are the set components of the same type. + * + * The features of a vMSC is the union of the RIS it contains. + * The features of a Class and Component are the common subset of the vMSC + * they contain. + * + * e.g. The system cache may have bandwidth controls on multiple interfaces, + * for regulating traffic from devices independently of traffic from CPUs. + * If these are two RIS in one MSC, they will be treated as controlling + * different things, and will not share a vMSC/component/class. + * + * e.g. The L2 may have one MSC and two RIS, one for cache-controls another + * for bandwidth. These two RIS are members of the same vMSC. + * + * e.g. The set of RIS that make up the L2 are grouped as a component. These + * are sometimes termed slices. They should be configured the same, as if there + * were only one. + * + * e.g. The SoC probably has more than one L2, each attached to a distinct set + * of CPUs. All the L2 components are grouped as a class. + * + * When creating an MSC, struct mpam_msc is added to the all mpam_all_msc list, + * then linked via struct mpam_ris to a vmsc, component and class. + * The same MSC may exist under different class->component->vmsc paths, but the + * RIS index will be unique. + */ +LIST_HEAD(mpam_classes); + +/* List of all objects that can be free()d after synchronise_srcu() */ +static LLIST_HEAD(mpam_garbage); + +static inline void init_garbage(struct mpam_garbage *garbage) +{ + init_llist_node(&garbage->llist); +} + +#define add_to_garbage(x) \ +do { \ + __typeof__(x) _x = (x); \ + _x->garbage.to_free = _x; \ + llist_add(&_x->garbage.llist, &mpam_garbage); \ +} while (0) + +static void mpam_free_garbage(void) +{ + struct mpam_garbage *iter, *tmp; + struct llist_node *to_free = llist_del_all(&mpam_garbage); + + if (!to_free) + return; + + synchronize_srcu(&mpam_srcu); + + llist_for_each_entry_safe(iter, tmp, to_free, llist) { + if (iter->pdev) + devm_kfree(&iter->pdev->dev, iter->to_free); + else + kfree(iter->to_free); + } +} + +/* + * Once mpam is enabled, new requestors cannot further reduce the available + * partid. Assert that the size is fixed, and new requestors will be turned + * away. 
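+ * e.g. after this point mpam_register_requestor() below fails with -EBUSY
+ * if a new requestor would need partid_max or pmg_max to shrink.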
+ */ +static void mpam_assert_partid_sizes_fixed(void) +{ + WARN_ON_ONCE(!partid_max_published); +} + +static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) +{ + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + return readl_relaxed(msc->mapped_hwpage + reg); +} + +static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg) +{ + lockdep_assert_held_once(&msc->part_sel_lock); + return __mpam_read_reg(msc, reg); +} + +#define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg) + +static void __mpam_write_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + WARN_ON_ONCE(reg + sizeof(u32) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + writel_relaxed(val, msc->mapped_hwpage + reg); +} + +static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + lockdep_assert_held_once(&msc->part_sel_lock); + __mpam_write_reg(msc, reg, val); +} + +#define mpam_write_partsel_reg(msc, reg, val) _mpam_write_partsel_reg(msc, MPAMCFG_##reg, val) + +static inline u32 _mpam_read_monsel_reg(struct mpam_msc *msc, u16 reg) +{ + mpam_mon_sel_lock_held(msc); + return __mpam_read_reg(msc, reg); +} + +#define mpam_read_monsel_reg(msc, reg) _mpam_read_monsel_reg(msc, MSMON_##reg) + +static inline void _mpam_write_monsel_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + mpam_mon_sel_lock_held(msc); + __mpam_write_reg(msc, reg, val); +} + +#define mpam_write_monsel_reg(msc, reg, val) _mpam_write_monsel_reg(msc, MSMON_##reg, val) + +static u64 mpam_msc_read_idr(struct mpam_msc *msc) +{ + u64 idr_high = 0, idr_low; + + lockdep_assert_held(&msc->part_sel_lock); + + idr_low = mpam_read_partsel_reg(msc, IDR); + if (FIELD_GET(MPAMF_IDR_EXT, idr_low)) + idr_high = mpam_read_partsel_reg(msc, IDR + 4); + + return (idr_high << 32) | idr_low; +} + +static void mpam_msc_clear_esr(struct mpam_msc *msc) +{ + u64 esr_low = __mpam_read_reg(msc, MPAMF_ESR); + + if (!esr_low) + return; + + /* + * Clearing the high/low bits of MPAMF_ESR can not be atomic. + * Clear the top half first, so that the pending error bits in the + * lower half prevent hardware from updating either half of the + * register. 
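+ * Hence the write order below: zero the (optional) high half first, then
+ * the low half.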
+ */ + if (msc->has_extd_esr) + __mpam_write_reg(msc, MPAMF_ESR + 4, 0); + __mpam_write_reg(msc, MPAMF_ESR, 0); +} + +static u64 mpam_msc_read_esr(struct mpam_msc *msc) +{ + u64 esr_high = 0, esr_low; + + esr_low = __mpam_read_reg(msc, MPAMF_ESR); + if (msc->has_extd_esr) + esr_high = __mpam_read_reg(msc, MPAMF_ESR + 4); + + return (esr_high << 32) | esr_low; +} + +static void __mpam_part_sel_raw(u32 partsel, struct mpam_msc *msc) +{ + lockdep_assert_held(&msc->part_sel_lock); + + mpam_write_partsel_reg(msc, PART_SEL, partsel); +} + +static void __mpam_part_sel(u8 ris_idx, u16 partid, struct mpam_msc *msc) +{ + u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) | + FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, partid); + + __mpam_part_sel_raw(partsel, msc); +} + +static void __mpam_intpart_sel(u8 ris_idx, u16 intpartid, struct mpam_msc *msc) +{ + u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) | + FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, intpartid) | + MPAMCFG_PART_SEL_INTERNAL; + + __mpam_part_sel_raw(partsel, msc); +} + +int mpam_register_requestor(u16 partid_max, u8 pmg_max) +{ + guard(spinlock)(&partid_max_lock); + if (!partid_max_init) { + mpam_partid_max = partid_max; + mpam_pmg_max = pmg_max; + partid_max_init = true; + } else if (!partid_max_published) { + mpam_partid_max = min(mpam_partid_max, partid_max); + mpam_pmg_max = min(mpam_pmg_max, pmg_max); + } else { + /* New requestors can't lower the values */ + if (partid_max < mpam_partid_max || pmg_max < mpam_pmg_max) + return -EBUSY; + } + + return 0; +} +EXPORT_SYMBOL(mpam_register_requestor); + +static struct mpam_class * +mpam_class_alloc(u8 level_idx, enum mpam_class_types type) +{ + struct mpam_class *class; + + lockdep_assert_held(&mpam_list_lock); + + class = kzalloc(sizeof(*class), GFP_KERNEL); + if (!class) + return ERR_PTR(-ENOMEM); + init_garbage(&class->garbage); + + INIT_LIST_HEAD_RCU(&class->components); + /* Affinity is updated when ris are added */ + class->level = level_idx; + class->type = type; + INIT_LIST_HEAD_RCU(&class->classes_list); + ida_init(&class->ida_csu_mon); + ida_init(&class->ida_mbwu_mon); + + list_add_rcu(&class->classes_list, &mpam_classes); + + return class; +} + +static void mpam_class_destroy(struct mpam_class *class) +{ + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&class->classes_list); + add_to_garbage(class); +} + +static struct mpam_class * +mpam_class_find(u8 level_idx, enum mpam_class_types type) +{ + struct mpam_class *class; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, &mpam_classes, classes_list) { + if (class->type == type && class->level == level_idx) + return class; + } + + return mpam_class_alloc(level_idx, type); +} + +static struct mpam_component * +mpam_component_alloc(struct mpam_class *class, int id) +{ + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + comp = kzalloc(sizeof(*comp), GFP_KERNEL); + if (!comp) + return ERR_PTR(-ENOMEM); + init_garbage(&comp->garbage); + + comp->comp_id = id; + INIT_LIST_HEAD_RCU(&comp->vmsc); + /* Affinity is updated when RIS are added */ + INIT_LIST_HEAD_RCU(&comp->class_list); + comp->class = class; + + list_add_rcu(&comp->class_list, &class->components); + + return comp; +} + +static void __destroy_component_cfg(struct mpam_component *comp); + +static void mpam_component_destroy(struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + lockdep_assert_held(&mpam_list_lock); + + __destroy_component_cfg(comp); + + list_del_rcu(&comp->class_list); + 
add_to_garbage(comp); + + if (list_empty(&class->components)) + mpam_class_destroy(class); +} + +static struct mpam_component * +mpam_component_find(struct mpam_class *class, int id) +{ + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(comp, &class->components, class_list) { + if (comp->comp_id == id) + return comp; + } + + return mpam_component_alloc(class, id); +} + +static struct mpam_vmsc * +mpam_vmsc_alloc(struct mpam_component *comp, struct mpam_msc *msc) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_held(&mpam_list_lock); + + vmsc = kzalloc(sizeof(*vmsc), GFP_KERNEL); + if (!vmsc) + return ERR_PTR(-ENOMEM); + init_garbage(&vmsc->garbage); + + INIT_LIST_HEAD_RCU(&vmsc->ris); + INIT_LIST_HEAD_RCU(&vmsc->comp_list); + vmsc->comp = comp; + vmsc->msc = msc; + + list_add_rcu(&vmsc->comp_list, &comp->vmsc); + + return vmsc; +} + +static void mpam_vmsc_destroy(struct mpam_vmsc *vmsc) +{ + struct mpam_component *comp = vmsc->comp; + + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&vmsc->comp_list); + add_to_garbage(vmsc); + + if (list_empty(&comp->vmsc)) + mpam_component_destroy(comp); +} + +static struct mpam_vmsc * +mpam_vmsc_find(struct mpam_component *comp, struct mpam_msc *msc) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + if (vmsc->msc->id == msc->id) + return vmsc; + } + + return mpam_vmsc_alloc(comp, msc); +} + +/* + * The cacheinfo structures are only populated when CPUs are online. + * This helper walks the acpi tables to include offline CPUs too. + */ +int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, + cpumask_t *affinity) +{ + return acpi_pptt_get_cpumask_from_cache_id(cache_id, affinity); +} + +/* + * cpumask_of_node() only knows about online CPUs. This can't tell us whether + * a class is represented on all possible CPUs. 
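+ * Walk every possible CPU instead, so that offline CPUs are accounted
+ * for too.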
+ */ +static void get_cpumask_from_node_id(u32 node_id, cpumask_t *affinity) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (node_id == cpu_to_node(cpu)) + cpumask_set_cpu(cpu, affinity); + } +} + +static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, + enum mpam_class_types type, + struct mpam_class *class, + struct mpam_component *comp) +{ + int err; + + switch (type) { + case MPAM_CLASS_CACHE: + err = mpam_get_cpumask_from_cache_id(comp->comp_id, class->level, + affinity); + if (err) { + dev_warn_once(&msc->pdev->dev, + "Failed to determine CPU affinity\n"); + return err; + } + + if (cpumask_empty(affinity)) + dev_warn_once(&msc->pdev->dev, "no CPUs associated with cache node\n"); + + break; + case MPAM_CLASS_MEMORY: + get_cpumask_from_node_id(comp->comp_id, affinity); + /* affinity may be empty for CPU-less memory nodes */ + break; + case MPAM_CLASS_UNKNOWN: + return 0; + } + + cpumask_and(affinity, affinity, &msc->accessibility); + + return 0; +} + +static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + int err; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + struct mpam_class *class; + struct mpam_component *comp; + struct platform_device *pdev = msc->pdev; + + lockdep_assert_held(&mpam_list_lock); + + if (ris_idx > MPAM_MSC_MAX_NUM_RIS) + return -EINVAL; + + if (test_and_set_bit(ris_idx, &msc->ris_idxs)) + return -EBUSY; + + ris = devm_kzalloc(&msc->pdev->dev, sizeof(*ris), GFP_KERNEL); + if (!ris) + return -ENOMEM; + init_garbage(&ris->garbage); + ris->garbage.pdev = pdev; + + class = mpam_class_find(class_id, type); + if (IS_ERR(class)) + return PTR_ERR(class); + + comp = mpam_component_find(class, component_id); + if (IS_ERR(comp)) { + if (list_empty(&class->components)) + mpam_class_destroy(class); + return PTR_ERR(comp); + } + + vmsc = mpam_vmsc_find(comp, msc); + if (IS_ERR(vmsc)) { + if (list_empty(&comp->vmsc)) + mpam_component_destroy(comp); + return PTR_ERR(vmsc); + } + + err = mpam_ris_get_affinity(msc, &ris->affinity, type, class, comp); + if (err) { + if (list_empty(&vmsc->ris)) + mpam_vmsc_destroy(vmsc); + return err; + } + + ris->ris_idx = ris_idx; + INIT_LIST_HEAD_RCU(&ris->msc_list); + INIT_LIST_HEAD_RCU(&ris->vmsc_list); + ris->vmsc = vmsc; + + cpumask_or(&comp->affinity, &comp->affinity, &ris->affinity); + cpumask_or(&class->affinity, &class->affinity, &ris->affinity); + list_add_rcu(&ris->vmsc_list, &vmsc->ris); + list_add_rcu(&ris->msc_list, &msc->ris); + + return 0; +} + +static void mpam_ris_destroy(struct mpam_msc_ris *ris) +{ + struct mpam_vmsc *vmsc = ris->vmsc; + struct mpam_msc *msc = vmsc->msc; + struct mpam_component *comp = vmsc->comp; + struct mpam_class *class = comp->class; + + lockdep_assert_held(&mpam_list_lock); + + /* + * It is assumed affinities don't overlap. If they do the class becomes + * unusable immediately. 
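+ * (The cpumask_andnot() calls below would also strip CPUs that another
+ * RIS still covers.)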
+ */ + cpumask_andnot(&class->affinity, &class->affinity, &ris->affinity); + cpumask_andnot(&comp->affinity, &comp->affinity, &ris->affinity); + clear_bit(ris->ris_idx, &msc->ris_idxs); + list_del_rcu(&ris->msc_list); + list_del_rcu(&ris->vmsc_list); + add_to_garbage(ris); + + if (list_empty(&vmsc->ris)) + mpam_vmsc_destroy(vmsc); +} + +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id) +{ + int err; + + mutex_lock(&mpam_list_lock); + err = mpam_ris_create_locked(msc, ris_idx, type, class_id, + component_id); + mutex_unlock(&mpam_list_lock); + if (err) + mpam_free_garbage(); + + return err; +} + +static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, + u8 ris_idx) +{ + int err; + struct mpam_msc_ris *ris; + + lockdep_assert_held(&mpam_list_lock); + + if (!test_bit(ris_idx, &msc->ris_idxs)) { + err = mpam_ris_create_locked(msc, ris_idx, MPAM_CLASS_UNKNOWN, + 0, 0); + if (err) + return ERR_PTR(err); + } + + list_for_each_entry(ris, &msc->ris, msc_list) { + if (ris->ris_idx == ris_idx) + return ris; + } + + return ERR_PTR(-ENOENT); +} + +/* + * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour + * of NRDY, software can use this bit for any purpose" - so hardware might not + * implement this - but it isn't RES0. + * + * Try and see what values stick in this bit. If we can write either value, + * its probably not implemented by hardware. + */ +static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) +{ + u32 now; + u64 mon_sel; + bool can_set, can_clear; + struct mpam_msc *msc = ris->vmsc->msc; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + return false; + + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + _mpam_write_monsel_reg(msc, mon_reg, mon_sel); + + _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); + now = _mpam_read_monsel_reg(msc, mon_reg); + can_set = now & MSMON___NRDY; + + _mpam_write_monsel_reg(msc, mon_reg, 0); + now = _mpam_read_monsel_reg(msc, mon_reg); + can_clear = !(now & MSMON___NRDY); + mpam_mon_sel_unlock(msc); + + return (!can_set || !can_clear); +} + +#define mpam_ris_hw_probe_hw_nrdy(_ris, _mon_reg) \ + _mpam_ris_hw_probe_hw_nrdy(_ris, MSMON_##_mon_reg) + +static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) +{ + int err; + struct mpam_msc *msc = ris->vmsc->msc; + struct device *dev = &msc->pdev->dev; + struct mpam_props *props = &ris->props; + struct mpam_class *class = ris->vmsc->comp->class; + + lockdep_assert_held(&msc->probe_lock); + lockdep_assert_held(&msc->part_sel_lock); + + /* Cache Capacity Partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_CCAP_PART, ris->idr)) { + u32 ccap_features = mpam_read_partsel_reg(msc, CCAP_IDR); + + props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ccap_features); + if (props->cmax_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ccap_features)) + mpam_set_feature(mpam_feat_cmax_softlim, props); + + if (props->cmax_wd && + !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cmax, props); + + if (props->cmax_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cmin, props); + + props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ccap_features); + if (props->cassoc_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cassoc, props); + } + + /* Cache Portion partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { + 
u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); + + props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, cpor_features); + if (props->cpbm_wd) + mpam_set_feature(mpam_feat_cpor_part, props); + } + + /* Memory bandwidth partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_MBW_PART, ris->idr)) { + u32 mbw_features = mpam_read_partsel_reg(msc, MBW_IDR); + + /* portion bitmap resolution */ + props->mbw_pbm_bits = FIELD_GET(MPAMF_MBW_IDR_BWPBM_WD, mbw_features); + if (props->mbw_pbm_bits && + FIELD_GET(MPAMF_MBW_IDR_HAS_PBM, mbw_features)) + mpam_set_feature(mpam_feat_mbw_part, props); + + props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) + mpam_set_feature(mpam_feat_mbw_max, props); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MIN, mbw_features)) + mpam_set_feature(mpam_feat_mbw_min, props); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_PROP, mbw_features)) + mpam_set_feature(mpam_feat_mbw_prop, props); + } + + /* Priority partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_PRI_PART, ris->idr)) { + u32 pri_features = mpam_read_partsel_reg(msc, PRI_IDR); + + props->intpri_wd = FIELD_GET(MPAMF_PRI_IDR_INTPRI_WD, pri_features); + if (props->intpri_wd && FIELD_GET(MPAMF_PRI_IDR_HAS_INTPRI, pri_features)) { + mpam_set_feature(mpam_feat_intpri_part, props); + if (FIELD_GET(MPAMF_PRI_IDR_INTPRI_0_IS_LOW, pri_features)) + mpam_set_feature(mpam_feat_intpri_part_0_low, props); + } + + props->dspri_wd = FIELD_GET(MPAMF_PRI_IDR_DSPRI_WD, pri_features); + if (props->dspri_wd && FIELD_GET(MPAMF_PRI_IDR_HAS_DSPRI, pri_features)) { + mpam_set_feature(mpam_feat_dspri_part, props); + if (FIELD_GET(MPAMF_PRI_IDR_DSPRI_0_IS_LOW, pri_features)) + mpam_set_feature(mpam_feat_dspri_part_0_low, props); + } + } + + /* Performance Monitoring */ + if (FIELD_GET(MPAMF_IDR_HAS_MSMON, ris->idr)) { + u32 msmon_features = mpam_read_partsel_reg(msc, MSMON_IDR); + + /* + * If the firmware max-nrdy-us property is missing, the + * CSU counters can't be used. Should we wait forever? + */ + err = device_property_read_u32(&msc->pdev->dev, + "arm,not-ready-us", + &msc->nrdy_usec); + + if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_CSU, msmon_features)) { + u32 csumonidr; + + csumonidr = mpam_read_partsel_reg(msc, CSUMON_IDR); + props->num_csu_mon = FIELD_GET(MPAMF_CSUMON_IDR_NUM_MON, csumonidr); + if (props->num_csu_mon) { + bool hw_managed; + + mpam_set_feature(mpam_feat_msmon_csu, props); + + if (FIELD_GET(MPAMF_CSUMON_IDR_HAS_XCL, csumonidr)) + mpam_set_feature(mpam_feat_msmon_csu_xcl, props); + + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); + } + + /* + * Accept the missing firmware property if NRDY appears + * un-implemented. 
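+ * Without the timeout there is no way to know how long to wait for a
+ * valid counter value, so only complain when NRDY is hardware managed.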
+ */ + if (err && mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, props)) + dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); + } + if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_MBWU, msmon_features)) { + bool has_long, hw_managed; + u32 mbwumon_idr = mpam_read_partsel_reg(msc, MBWUMON_IDR); + + props->num_mbwu_mon = FIELD_GET(MPAMF_MBWUMON_IDR_NUM_MON, mbwumon_idr); + if (props->num_mbwu_mon) { + mpam_set_feature(mpam_feat_msmon_mbwu, props); + + if (FIELD_GET(MPAMF_MBWUMON_IDR_HAS_RWBW, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_rwbw, props); + + has_long = FIELD_GET(MPAMF_MBWUMON_IDR_HAS_LONG, mbwumon_idr); + if (has_long) { + if (FIELD_GET(MPAMF_MBWUMON_IDR_LWD, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_63counter, props); + else + mpam_set_feature(mpam_feat_msmon_mbwu_44counter, props); + } else { + mpam_set_feature(mpam_feat_msmon_mbwu_31counter, props); + } + + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); + + /* + * Don't warn about any missing firmware property for + * MBWU NRDY - it doesn't make any sense! + */ + } + } + } + + /* + * RIS with PARTID narrowing don't have enough storage for one + * configuration per PARTID. If these are in a class we could use, + * reduce the supported partid_max to match the number of intpartid. + * If the class is unknown, just ignore it. + */ + if (FIELD_GET(MPAMF_IDR_HAS_PARTID_NRW, ris->idr) && + class->type != MPAM_CLASS_UNKNOWN) { + u32 nrwidr = mpam_read_partsel_reg(msc, PARTID_NRW_IDR); + u16 partid_max = FIELD_GET(MPAMF_PARTID_NRW_IDR_INTPARTID_MAX, nrwidr); + + mpam_set_feature(mpam_feat_partid_nrw, props); + msc->partid_max = min(msc->partid_max, partid_max); + } +} + +static int mpam_msc_hw_probe(struct mpam_msc *msc) +{ + u64 idr; + u16 partid_max; + u8 ris_idx, pmg_max; + struct mpam_msc_ris *ris; + struct device *dev = &msc->pdev->dev; + + lockdep_assert_held(&msc->probe_lock); + + idr = __mpam_read_reg(msc, MPAMF_AIDR); + if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) { + dev_err_once(dev, "MSC does not match MPAM architecture v1.x\n"); + return -EIO; + } + + /* Grab an IDR value to find out how many RIS there are */ + mutex_lock(&msc->part_sel_lock); + idr = mpam_msc_read_idr(msc); + mutex_unlock(&msc->part_sel_lock); + + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); + + /* Use these values so partid/pmg always starts with a valid value */ + msc->partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr); + msc->pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); + + for (ris_idx = 0; ris_idx <= msc->ris_max; ris_idx++) { + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris_idx, 0, msc); + idr = mpam_msc_read_idr(msc); + mutex_unlock(&msc->part_sel_lock); + + partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr); + pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); + msc->partid_max = min(msc->partid_max, partid_max); + msc->pmg_max = min(msc->pmg_max, pmg_max); + msc->has_extd_esr = FIELD_GET(MPAMF_IDR_HAS_EXTD_ESR, idr); + + mutex_lock(&mpam_list_lock); + ris = mpam_get_or_create_ris(msc, ris_idx); + mutex_unlock(&mpam_list_lock); + if (IS_ERR(ris)) + return PTR_ERR(ris); + ris->idr = idr; + + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris_idx, 0, msc); + mpam_ris_hw_probe(ris); + mutex_unlock(&msc->part_sel_lock); + } + + /* Clear any stale errors */ + mpam_msc_clear_esr(msc); + + spin_lock(&partid_max_lock); + mpam_partid_max = min(mpam_partid_max, 
msc->partid_max); + mpam_pmg_max = min(mpam_pmg_max, msc->pmg_max); + spin_unlock(&partid_max_lock); + + msc->probed = true; + + return 0; +} + +struct mon_read { + struct mpam_msc_ris *ris; + struct mon_cfg *ctx; + enum mpam_device_features type; + u64 *val; + int err; +}; + +static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) +{ + return (mpam_has_feature(mpam_feat_msmon_mbwu_63counter, &ris->props) || + mpam_has_feature(mpam_feat_msmon_mbwu_44counter, &ris->props)); +} + +static u64 mpam_msc_read_mbwu_l(struct mpam_msc *msc) +{ + int retry = 3; + u32 mbwu_l_low; + u64 mbwu_l_high1, mbwu_l_high2; + + mpam_mon_sel_lock_held(msc); + + WARN_ON_ONCE((MSMON_MBWU_L + sizeof(u64)) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + mbwu_l_high2 = __mpam_read_reg(msc, MSMON_MBWU_L + 4); + do { + mbwu_l_high1 = mbwu_l_high2; + mbwu_l_low = __mpam_read_reg(msc, MSMON_MBWU_L); + mbwu_l_high2 = __mpam_read_reg(msc, MSMON_MBWU_L + 4); + + retry--; + } while (mbwu_l_high1 != mbwu_l_high2 && retry > 0); + + if (mbwu_l_high1 == mbwu_l_high2) + return (mbwu_l_high1 << 32) | mbwu_l_low; + + pr_warn("Failed to read a stable value\n"); + return MSMON___L_NRDY; +} + +static void mpam_msc_zero_mbwu_l(struct mpam_msc *msc) +{ + mpam_mon_sel_lock_held(msc); + + WARN_ON_ONCE((MSMON_MBWU_L + sizeof(u64)) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + __mpam_write_reg(msc, MSMON_MBWU_L, 0); + __mpam_write_reg(msc, MSMON_MBWU_L + 4, 0); +} + +static void gen_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, + u32 *flt_val) +{ + struct mon_cfg *ctx = m->ctx; + + /* + * For CSU counters its implementation-defined what happens when not + * filtering by partid. + */ + *ctl_val = MSMON_CFG_x_CTL_MATCH_PARTID; + + *flt_val = FIELD_PREP(MSMON_CFG_x_FLT_PARTID, ctx->partid); + + if (m->ctx->match_pmg) { + *ctl_val |= MSMON_CFG_x_CTL_MATCH_PMG; + *flt_val |= FIELD_PREP(MSMON_CFG_x_FLT_PMG, ctx->pmg); + } + + switch (m->type) { + case mpam_feat_msmon_csu: + *ctl_val |= MSMON_CFG_CSU_CTL_TYPE_CSU; + + if (mpam_has_feature(mpam_feat_msmon_csu_xcl, &m->ris->props)) + *flt_val |= FIELD_PREP(MSMON_CFG_CSU_FLT_XCL, ctx->csu_exclude_clean); + + break; + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + *ctl_val |= MSMON_CFG_MBWU_CTL_TYPE_MBWU; + + if (mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, &m->ris->props)) + *flt_val |= FIELD_PREP(MSMON_CFG_MBWU_FLT_RWBW, ctx->opts); + + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +static void read_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, + u32 *flt_val) +{ + struct mpam_msc *msc = m->ris->vmsc->msc; + + switch (m->type) { + case mpam_feat_msmon_csu: + *ctl_val = mpam_read_monsel_reg(msc, CFG_CSU_CTL); + *flt_val = mpam_read_monsel_reg(msc, CFG_CSU_FLT); + break; + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + *ctl_val = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); + *flt_val = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +/* Remove values set by the hardware to prevent apparent mismatches. 
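+ * e.g. the overflow-status bits are set by hardware, and would make a
+ * control value read back differ from a freshly generated one.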
*/ +static inline void clean_msmon_ctl_val(u32 *cur_ctl) +{ + *cur_ctl &= ~MSMON_CFG_x_CTL_OFLOW_STATUS; + + if (FIELD_GET(MSMON_CFG_x_CTL_TYPE, *cur_ctl) == MSMON_CFG_MBWU_CTL_TYPE_MBWU) + *cur_ctl &= ~MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L; +} + +static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, + u32 flt_val) +{ + struct mpam_msc *msc = m->ris->vmsc->msc; + + /* + * Write the ctl_val with the enable bit cleared, reset the counter, + * then enable counter. + */ + switch (m->type) { + case mpam_feat_msmon_csu: + mpam_write_monsel_reg(msc, CFG_CSU_FLT, flt_val); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val); + mpam_write_monsel_reg(msc, CSU, 0); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + break; + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + mpam_write_monsel_reg(msc, CFG_MBWU_FLT, flt_val); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + /* Counting monitors require NRDY to be reset by software */ + if (m->type == mpam_feat_msmon_mbwu_31counter) + mpam_write_monsel_reg(msc, MBWU, 0); + else + mpam_msc_zero_mbwu_l(m->ris->vmsc->msc); + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +static u64 mpam_msmon_overflow_val(enum mpam_device_features type) +{ + /* TODO: implement scaling counters */ + switch (type) { + case mpam_feat_msmon_mbwu_63counter: + return BIT_ULL(hweight_long(MSMON___LWD_VALUE)); + case mpam_feat_msmon_mbwu_44counter: + return BIT_ULL(hweight_long(MSMON___L_VALUE)); + case mpam_feat_msmon_mbwu_31counter: + return BIT_ULL(hweight_long(MSMON___VALUE)); + default: + return 0; + } +} + +static void __ris_msmon_read(void *arg) +{ + u64 now; + bool nrdy = false; + bool config_mismatch; + bool overflow; + struct mon_read *m = arg; + struct mon_cfg *ctx = m->ctx; + bool reset_on_next_read = false; + struct mpam_msc_ris *ris = m->ris; + struct msmon_mbwu_state *mbwu_state; + struct mpam_props *rprops = &ris->props; + struct mpam_msc *msc = m->ris->vmsc->msc; + u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; + + if (!mpam_mon_sel_lock(msc)) { + m->err = -EIO; + return; + } + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, ctx->mon) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + + switch (m->type) { + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + mbwu_state = &ris->mbwu_state[ctx->mon]; + if (mbwu_state) { + reset_on_next_read = mbwu_state->reset_on_next_read; + mbwu_state->reset_on_next_read = false; + } + break; + default: + break; + } + + /* + * Read the existing configuration to avoid re-writing the same values. + * This saves waiting for 'nrdy' on subsequent reads. 
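+ * (Rewriting the configuration resets the counter, which would mean
+ * another wait for 'nrdy'.)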
+ */ + read_msmon_ctl_flt_vals(m, &cur_ctl, &cur_flt); + + if (mpam_feat_msmon_mbwu_31counter == m->type) + overflow = cur_ctl & MSMON_CFG_x_CTL_OFLOW_STATUS; + else if (mpam_feat_msmon_mbwu_44counter == m->type || + mpam_feat_msmon_mbwu_63counter == m->type) + overflow = cur_ctl & MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L; + + clean_msmon_ctl_val(&cur_ctl); + gen_msmon_ctl_flt_vals(m, &ctl_val, &flt_val); + config_mismatch = cur_flt != flt_val || + cur_ctl != (ctl_val | MSMON_CFG_x_CTL_EN); + + if (config_mismatch || reset_on_next_read) { + write_msmon_ctl_flt_vals(m, ctl_val, flt_val); + overflow = false; + } else if (overflow) { + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, + cur_ctl & + ~(MSMON_CFG_x_CTL_OFLOW_STATUS | + MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L)); + } + + switch (m->type) { + case mpam_feat_msmon_csu: + now = mpam_read_monsel_reg(msc, CSU); + if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); + break; + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + if (m->type != mpam_feat_msmon_mbwu_31counter) { + now = mpam_msc_read_mbwu_l(msc); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___L_NRDY; + + if (m->type == mpam_feat_msmon_mbwu_63counter) + now = FIELD_GET(MSMON___LWD_VALUE, now); + else + now = FIELD_GET(MSMON___L_VALUE, now); + } else { + now = mpam_read_monsel_reg(msc, MBWU); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); + } + + if (nrdy) + break; + + mbwu_state = &ris->mbwu_state[ctx->mon]; + + if (overflow) + mbwu_state->correction += mpam_msmon_overflow_val(m->type); + + /* + * Include bandwidth consumed before the last hardware reset and + * a counter size increment for each overflow. + */ + now += mbwu_state->correction; + break; + default: + m->err = -EINVAL; + } + mpam_mon_sel_unlock(msc); + + if (nrdy) { + m->err = -EBUSY; + return; + } + + *m->val += now; +} + +static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) +{ + int err, any_err = 0; + struct mpam_vmsc *vmsc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_msc *msc = vmsc->msc; + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + arg->ris = ris; + + err = smp_call_function_any(&msc->accessibility, + __ris_msmon_read, arg, + true); + if (!err && arg->err) + err = arg->err; + + /* + * Save one error to be returned to the caller, but + * keep reading counters so that they get reprogrammed. On + * platforms with NRDY this lets us wait once.
+ */ + if (err) + any_err = err; + } + } + + return any_err; +} + +static enum mpam_device_features mpam_msmon_choose_counter(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (mpam_has_feature(mpam_feat_msmon_mbwu_63counter, cprops)) + return mpam_feat_msmon_mbwu_63counter; + if (mpam_has_feature(mpam_feat_msmon_mbwu_44counter, cprops)) + return mpam_feat_msmon_mbwu_44counter; + + return mpam_feat_msmon_mbwu_31counter; +} + +int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, + enum mpam_device_features type, u64 *val) +{ + int err; + struct mon_read arg; + u64 wait_jiffies = 0; + struct mpam_class *class = comp->class; + struct mpam_props *cprops = &class->props; + + might_sleep(); + + if (!mpam_is_enabled()) + return -EIO; + + if (!mpam_has_feature(type, cprops)) + return -EOPNOTSUPP; + + if (type == mpam_feat_msmon_mbwu) + type = mpam_msmon_choose_counter(class); + + arg = (struct mon_read) { + .ctx = ctx, + .type = type, + .val = val, + }; + *val = 0; + + err = _msmon_read(comp, &arg); + if (err == -EBUSY && class->nrdy_usec) + wait_jiffies = usecs_to_jiffies(class->nrdy_usec); + + while (wait_jiffies) + wait_jiffies = schedule_timeout_uninterruptible(wait_jiffies); + + if (err == -EBUSY) { + arg = (struct mon_read) { + .ctx = ctx, + .type = type, + .val = val, + }; + *val = 0; + + err = _msmon_read(comp, &arg); + } + + return err; +} + +void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) +{ + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + if (!mpam_is_enabled()) + return; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &vmsc->props)) + continue; + + msc = vmsc->msc; + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) + continue; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + continue; + + ris->mbwu_state[ctx->mon].correction = 0; + ris->mbwu_state[ctx->mon].reset_on_next_read = true; + mpam_mon_sel_unlock(msc); + } + } +} + +static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) +{ + u32 num_words, msb; + u32 bm = ~0; + int i; + + lockdep_assert_held(&msc->part_sel_lock); + + if (wd == 0) + return; + + /* + * Write all ~0 to all but the last 32bit-word, which may + * have fewer bits... + */ + num_words = DIV_ROUND_UP(wd, 32); + for (i = 0; i < num_words - 1; i++, reg += sizeof(bm)) + __mpam_write_reg(msc, reg, bm); + + /* + * ....and then the last (maybe) partial 32bit word. When wd is a + * multiple of 32, msb should be 31 to write a full 32bit word. + */ + msb = (wd - 1) % 32; + bm = GENMASK(msb, 0); + __mpam_write_reg(msc, reg, bm); +} + +/* Called via IPI. 
Call while holding an SRCU reference */ +static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) +{ + u32 pri_val = 0; + u16 cmax = MPAMCFG_CMAX_CMAX; + struct mpam_msc *msc = ris->vmsc->msc; + struct mpam_props *rprops = &ris->props; + u16 dspri = GENMASK(rprops->dspri_wd, 0); + u16 intpri = GENMASK(rprops->intpri_wd, 0); + + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris->ris_idx, partid, msc); + + if (mpam_has_feature(mpam_feat_partid_nrw, rprops)) { + /* Update the intpartid mapping */ + mpam_write_partsel_reg(msc, INTPARTID, + MPAMCFG_INTPARTID_INTERNAL | partid); + + /* + * Then switch to the 'internal' partid to update the + * configuration. + */ + __mpam_intpart_sel(ris->ris_idx, partid, msc); + } + + if (mpam_has_feature(mpam_feat_cpor_part, rprops) && + mpam_has_feature(mpam_feat_cpor_part, cfg)) { + if (cfg->reset_cpbm) + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); + else + mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); + } + + if (mpam_has_feature(mpam_feat_mbw_part, rprops) && + mpam_has_feature(mpam_feat_mbw_part, cfg)) { + if (cfg->reset_mbw_pbm) + mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); + else + mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); + } + + if (mpam_has_feature(mpam_feat_mbw_min, rprops) && + mpam_has_feature(mpam_feat_mbw_min, cfg)) + mpam_write_partsel_reg(msc, MBW_MIN, 0); + + if (mpam_has_feature(mpam_feat_mbw_max, rprops) && + mpam_has_feature(mpam_feat_mbw_max, cfg)) { + if (cfg->reset_mbw_max) + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + else + mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); + } + + if (mpam_has_feature(mpam_feat_mbw_prop, rprops) && + mpam_has_feature(mpam_feat_mbw_prop, cfg)) + mpam_write_partsel_reg(msc, MBW_PROP, 0); + + if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) + mpam_write_partsel_reg(msc, CMAX, cmax); + + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) + mpam_write_partsel_reg(msc, CMIN, 0); + + if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) + mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); + + if (mpam_has_feature(mpam_feat_intpri_part, rprops) || + mpam_has_feature(mpam_feat_dspri_part, rprops)) { + /* aces high? 
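+ * i.e. pick whichever value means 'highest priority' in the encoding the
+ * hardware uses: the all-ones value when 0 is the lowest priority,
+ * 0 otherwise.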
*/ + if (!mpam_has_feature(mpam_feat_intpri_part_0_low, rprops)) + intpri = 0; + if (!mpam_has_feature(mpam_feat_dspri_part_0_low, rprops)) + dspri = 0; + + if (mpam_has_feature(mpam_feat_intpri_part, rprops)) + pri_val |= FIELD_PREP(MPAMCFG_PRI_INTPRI, intpri); + if (mpam_has_feature(mpam_feat_dspri_part, rprops)) + pri_val |= FIELD_PREP(MPAMCFG_PRI_DSPRI, dspri); + + mpam_write_partsel_reg(msc, PRI, pri_val); + } + + mutex_unlock(&msc->part_sel_lock); +} + +/* Call with msc cfg_lock held */ +static int mpam_restore_mbwu_state(void *_ris) +{ + int i; + struct mon_read mwbu_arg; + struct mpam_msc_ris *ris = _ris; + struct mpam_class *class = ris->vmsc->comp->class; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + if (ris->mbwu_state[i].enabled) { + mwbu_arg.ris = ris; + mwbu_arg.ctx = &ris->mbwu_state[i].cfg; + mwbu_arg.type = mpam_msmon_choose_counter(class); + + __ris_msmon_read(&mwbu_arg); + } + } + + return 0; +} + +/* Call with MSC cfg_lock held */ +static int mpam_save_mbwu_state(void *arg) +{ + int i; + u64 val; + struct mon_cfg *cfg; + u32 cur_flt, cur_ctl, mon_sel; + struct mpam_msc_ris *ris = arg; + struct msmon_mbwu_state *mbwu_state; + struct mpam_msc *msc = ris->vmsc->msc; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + mbwu_state = &ris->mbwu_state[i]; + cfg = &mbwu_state->cfg; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + return -EIO; + + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, i) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + + cur_flt = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); + cur_ctl = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, 0); + + if (mpam_ris_has_mbwu_long_counter(ris)) { + val = mpam_msc_read_mbwu_l(msc); + mpam_msc_zero_mbwu_l(msc); + } else { + val = mpam_read_monsel_reg(msc, MBWU); + mpam_write_monsel_reg(msc, MBWU, 0); + } + + cfg->mon = i; + cfg->pmg = FIELD_GET(MSMON_CFG_x_FLT_PMG, cur_flt); + cfg->match_pmg = FIELD_GET(MSMON_CFG_x_CTL_MATCH_PMG, cur_ctl); + cfg->partid = FIELD_GET(MSMON_CFG_x_FLT_PARTID, cur_flt); + mbwu_state->correction += val; + mbwu_state->enabled = FIELD_GET(MSMON_CFG_x_CTL_EN, cur_ctl); + mpam_mon_sel_unlock(msc); + } + + return 0; +} + +static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) +{ + *reset_cfg = (struct mpam_config) { + .reset_cpbm = true, + .reset_mbw_pbm = true, + .reset_mbw_max = true, + }; + bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); +} + +/* + * Called via smp_call_on_cpu() to prevent migration, while still being + * pre-emptible. Caller must hold mpam_srcu. + */ +static int mpam_reset_ris(void *arg) +{ + u16 partid, partid_max; + struct mpam_config reset_cfg; + struct mpam_msc_ris *ris = arg; + + if (ris->in_reset_state) + return 0; + + mpam_init_reset_cfg(&reset_cfg); + + spin_lock(&partid_max_lock); + partid_max = mpam_partid_max; + spin_unlock(&partid_max_lock); + for (partid = 0; partid <= partid_max; partid++) + mpam_reprogram_ris_partid(ris, partid, &reset_cfg); + + return 0; +} + +/* + * Get the preferred CPU for this MSC. If it is accessible from this CPU, + * this CPU is preferred. This can be preempted/migrated, it will only result + * in more work. 
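+ * (mpam_touch_msc() below re-evaluates the choice on every call, so a
+ * stale answer is harmless.)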
+ */ +static int mpam_get_msc_preferred_cpu(struct mpam_msc *msc) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_cpu(cpu, &msc->accessibility)) + return cpu; + + return cpumask_first_and(&msc->accessibility, cpu_online_mask); +} + +static int mpam_touch_msc(struct mpam_msc *msc, int (*fn)(void *a), void *arg) +{ + lockdep_assert_irqs_enabled(); + lockdep_assert_cpus_held(); + WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); + + return smp_call_on_cpu(mpam_get_msc_preferred_cpu(msc), fn, arg, true); +} + +struct mpam_write_config_arg { + struct mpam_msc_ris *ris; + struct mpam_component *comp; + u16 partid; +}; + +static int __write_config(void *arg) +{ + struct mpam_write_config_arg *c = arg; + + mpam_reprogram_ris_partid(c->ris, c->partid, &c->comp->cfg[c->partid]); + + return 0; +} + +static void mpam_reprogram_msc(struct mpam_msc *msc) +{ + u16 partid; + bool reset; + struct mpam_config *cfg; + struct mpam_msc_ris *ris; + struct mpam_write_config_arg arg; + + /* + * No lock for mpam_partid_max as partid_max_published has been + * set by mpam_enabled(), so the values can no longer change. + */ + mpam_assert_partid_sizes_fixed(); + + mutex_lock(&msc->cfg_lock); + list_for_each_entry_srcu(ris, &msc->ris, msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_is_enabled() && !ris->in_reset_state) { + mpam_touch_msc(msc, &mpam_reset_ris, ris); + ris->in_reset_state = true; + continue; + } + + arg.comp = ris->vmsc->comp; + arg.ris = ris; + reset = true; + for (partid = 0; partid <= mpam_partid_max; partid++) { + cfg = &ris->vmsc->comp->cfg[partid]; + if (!bitmap_empty(cfg->features, MPAM_FEATURE_LAST)) + reset = false; + + arg.partid = partid; + mpam_touch_msc(msc, __write_config, &arg); + } + ris->in_reset_state = reset; + + if (mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) + mpam_touch_msc(msc, &mpam_restore_mbwu_state, ris); + } + mutex_unlock(&msc->cfg_lock); +} + +static void _enable_percpu_irq(void *_irq) +{ + int *irq = _irq; + + enable_percpu_irq(*irq, IRQ_TYPE_NONE); +} + +static int mpam_cpu_online(unsigned int cpu) +{ + struct mpam_msc *msc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + if (msc->reenable_error_ppi) + _enable_percpu_irq(&msc->reenable_error_ppi); + + if (atomic_fetch_inc(&msc->online_refs) == 0) + mpam_reprogram_msc(msc); + } + + return 0; +} + +/* Before mpam is enabled, try to probe new MSC */ +static int mpam_discovery_cpu_online(unsigned int cpu) +{ + int err = 0; + struct mpam_msc *msc; + bool new_device_probed = false; + + if (mpam_is_enabled()) + return 0; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + mutex_lock(&msc->probe_lock); + if (!msc->probed) + err = mpam_msc_hw_probe(msc); + mutex_unlock(&msc->probe_lock); + + if (err) + break; + new_device_probed = true; + } + + if (new_device_probed && !err) + schedule_work(&mpam_enable_work); + if (err) { + mpam_disable_reason = "error during probing"; + schedule_work(&mpam_broken_work); + } + + return err; +} + +static int mpam_cpu_offline(unsigned int cpu) +{ + struct mpam_msc *msc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + if 
(msc->reenable_error_ppi) + disable_percpu_irq(msc->reenable_error_ppi); + + if (atomic_dec_and_test(&msc->online_refs)) { + struct mpam_msc_ris *ris; + + mutex_lock(&msc->cfg_lock); + list_for_each_entry_srcu(ris, &msc->ris, msc_list, + srcu_read_lock_held(&mpam_srcu)) { + mpam_touch_msc(msc, &mpam_reset_ris, ris); + + /* + * The reset state for non-zero partid may be + * lost while the CPUs are offline. + */ + ris->in_reset_state = false; + + if (mpam_is_enabled()) + mpam_touch_msc(msc, &mpam_save_mbwu_state, ris); + } + mutex_unlock(&msc->cfg_lock); + } + } + + return 0; +} + +static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online), + int (*offline)(unsigned int offline), + char *name) +{ + mutex_lock(&mpam_cpuhp_state_lock); + if (mpam_cpuhp_state) { + cpuhp_remove_state(mpam_cpuhp_state); + mpam_cpuhp_state = 0; + } + + mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, name, online, + offline); + if (mpam_cpuhp_state <= 0) { + pr_err("Failed to register cpuhp callbacks"); + mpam_cpuhp_state = 0; + } + mutex_unlock(&mpam_cpuhp_state_lock); +} + +static int __setup_ppi(struct mpam_msc *msc) +{ + int cpu; + + msc->error_dev_id = alloc_percpu(struct mpam_msc *); + if (!msc->error_dev_id) + return -ENOMEM; + + for_each_cpu(cpu, &msc->accessibility) + *per_cpu_ptr(msc->error_dev_id, cpu) = msc; + + return 0; +} + +static int mpam_msc_setup_error_irq(struct mpam_msc *msc) +{ + int irq; + + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + return 0; + + /* Allocate and initialise the percpu device pointer for PPI */ + if (irq_is_percpu(irq)) + return __setup_ppi(msc); + + /* sanity check: shared interrupts can be routed anywhere? */ + if (!cpumask_equal(&msc->accessibility, cpu_possible_mask)) { + pr_err_once("msc:%u is a private resource with a shared error interrupt", + msc->id); + return -EINVAL; + } + + return 0; +} + +/* + * An MSC can control traffic from a set of CPUs, but may only be accessible + * from a (hopefully wider) set of CPUs. The common reason for this is power + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the + * corresponding cache may also be powered off. By making accesses from + * one of those CPUs, we ensure we don't access a cache that's powered off. + */ +static void update_msc_accessibility(struct mpam_msc *msc) +{ + u32 affinity_id; + int err; + + err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity", + &affinity_id); + if (err) + cpumask_copy(&msc->accessibility, cpu_possible_mask); + else + acpi_pptt_get_cpus_from_container(affinity_id, &msc->accessibility); +} + +/* + * There are two ways of reaching a struct mpam_msc_ris. Via the + * class->component->vmsc->ris, or via the msc. + * When destroying the msc, the other side needs unlinking and cleaning up too. 
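+ * Callers must hold mpam_list_lock, and must call mpam_free_garbage()
+ * once the lock has been dropped.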
+ */ +static void mpam_msc_destroy(struct mpam_msc *msc) +{ + struct platform_device *pdev = msc->pdev; + struct mpam_msc_ris *ris, *tmp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry_safe(ris, tmp, &msc->ris, msc_list) + mpam_ris_destroy(ris); + + list_del_rcu(&msc->all_msc_list); + platform_set_drvdata(pdev, NULL); + + add_to_garbage(msc); +} + +static void mpam_msc_drv_remove(struct platform_device *pdev) +{ + struct mpam_msc *msc = platform_get_drvdata(pdev); + + mutex_lock(&mpam_list_lock); + mpam_msc_destroy(msc); + mutex_unlock(&mpam_list_lock); + + mpam_free_garbage(); +} + +static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) +{ + int err; + u32 tmp; + struct mpam_msc *msc; + struct resource *msc_res; + struct device *dev = &pdev->dev; + + lockdep_assert_held(&mpam_list_lock); + + msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL); + if (!msc) + return ERR_PTR(-ENOMEM); + init_garbage(&msc->garbage); + msc->garbage.pdev = pdev; + + err = devm_mutex_init(dev, &msc->probe_lock); + if (err) + return ERR_PTR(err); + + err = devm_mutex_init(dev, &msc->part_sel_lock); + if (err) + return ERR_PTR(err); + + err = devm_mutex_init(dev, &msc->error_irq_lock); + if (err) + return ERR_PTR(err); + + err = devm_mutex_init(dev, &msc->cfg_lock); + if (err) + return ERR_PTR(err); + + mpam_mon_sel_lock_init(msc); + msc->id = pdev->id; + msc->pdev = pdev; + INIT_LIST_HEAD_RCU(&msc->all_msc_list); + INIT_LIST_HEAD_RCU(&msc->ris); + + update_msc_accessibility(msc); + if (cpumask_empty(&msc->accessibility)) { + dev_err_once(dev, "MSC is not accessible from any CPU!"); + return ERR_PTR(-EINVAL); + } + + err = mpam_msc_setup_error_irq(msc); + if (err) + return ERR_PTR(err); + + if (device_property_read_u32(&pdev->dev, "pcc-channel", &tmp)) + msc->iface = MPAM_IFACE_MMIO; + else + msc->iface = MPAM_IFACE_PCC; + + if (msc->iface == MPAM_IFACE_MMIO) { + void __iomem *io; + + io = devm_platform_get_and_ioremap_resource(pdev, 0, + &msc_res); + if (IS_ERR(io)) { + dev_err_once(dev, "Failed to map MSC base address\n"); + return ERR_CAST(io); + } + msc->mapped_hwpage_sz = msc_res->end - msc_res->start; + msc->mapped_hwpage = io; + } else { + return ERR_PTR(-EINVAL); + } + + list_add_rcu(&msc->all_msc_list, &mpam_all_msc); + platform_set_drvdata(pdev, msc); + + return msc; +} + +static int fw_num_msc; + +static int mpam_msc_drv_probe(struct platform_device *pdev) +{ + int err; + struct mpam_msc *msc = NULL; + void *plat_data = pdev->dev.platform_data; + + mutex_lock(&mpam_list_lock); + msc = do_mpam_msc_drv_probe(pdev); + mutex_unlock(&mpam_list_lock); + + if (IS_ERR(msc)) + return PTR_ERR(msc); + + /* Create RIS entries described by firmware */ + err = acpi_mpam_parse_resources(msc, plat_data); + if (err) { + mpam_msc_drv_remove(pdev); + return err; + } + + if (atomic_add_return(1, &mpam_num_msc) == fw_num_msc) + mpam_register_cpuhp_callbacks(mpam_discovery_cpu_online, NULL, + "mpam:drv_probe"); + + return 0; +} + +static struct platform_driver mpam_msc_driver = { + .driver = { + .name = "mpam_msc", + }, + .probe = mpam_msc_drv_probe, + .remove = mpam_msc_drv_remove, +}; + +/* Any of these features mean the BWA_WD field is valid. 
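+ * (BWA_WD is the number of implemented bits in the bandwidth allocation
+ * fields, so it is meaningful whenever any bandwidth control is present.)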
*/ +static bool mpam_has_bwa_wd_feature(struct mpam_props *props) +{ + if (mpam_has_feature(mpam_feat_mbw_min, props)) + return true; + if (mpam_has_feature(mpam_feat_mbw_max, props)) + return true; + if (mpam_has_feature(mpam_feat_mbw_prop, props)) + return true; + return false; +} + +/* Any of these features mean the CMAX_WD field is valid. */ +static bool mpam_has_cmax_wd_feature(struct mpam_props *props) +{ + if (mpam_has_feature(mpam_feat_cmax_cmax, props)) + return true; + if (mpam_has_feature(mpam_feat_cmax_cmin, props)) + return true; + return false; +} + +#define MISMATCHED_HELPER(parent, child, helper, field, alias) \ + helper(parent) && \ + ((helper(child) && (parent)->field != (child)->field) || \ + (!helper(child) && !(alias))) + +#define MISMATCHED_FEAT(parent, child, feat, field, alias) \ + mpam_has_feature((feat), (parent)) && \ + ((mpam_has_feature((feat), (child)) && (parent)->field != (child)->field) || \ + (!mpam_has_feature((feat), (child)) && !(alias))) + +#define CAN_MERGE_FEAT(parent, child, feat, alias) \ + (alias) && !mpam_has_feature((feat), (parent)) && \ + mpam_has_feature((feat), (child)) + +/* + * Combine two props fields. + * If this is for controls that alias the same resource, it is safe to just + * copy the values over. If two aliasing controls implement the same scheme + * a safe value must be picked. + * For non-aliasing controls, these control different resources, and the + * resulting safe value must be compatible with both. When merging values in + * the tree, all the aliasing resources must be handled first. + * On mismatch, parent is modified. + */ +static void __props_mismatch(struct mpam_props *parent, + struct mpam_props *child, bool alias) +{ + if (CAN_MERGE_FEAT(parent, child, mpam_feat_cpor_part, alias)) { + parent->cpbm_wd = child->cpbm_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_cpor_part, + cpbm_wd, alias)) { + pr_debug("cleared cpor_part\n"); + mpam_clear_feature(mpam_feat_cpor_part, parent); + parent->cpbm_wd = 0; + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_mbw_part, alias)) { + parent->mbw_pbm_bits = child->mbw_pbm_bits; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_mbw_part, + mbw_pbm_bits, alias)) { + pr_debug("cleared mbw_part\n"); + mpam_clear_feature(mpam_feat_mbw_part, parent); + parent->mbw_pbm_bits = 0; + } + + /* bwa_wd is a count of bits, fewer bits means less precision */ + if (alias && !mpam_has_bwa_wd_feature(parent) && + mpam_has_bwa_wd_feature(child)) { + parent->bwa_wd = child->bwa_wd; + } else if (MISMATCHED_HELPER(parent, child, mpam_has_bwa_wd_feature, + bwa_wd, alias)) { + pr_debug("took the min bwa_wd\n"); + parent->bwa_wd = min(parent->bwa_wd, child->bwa_wd); + } + + if (alias && !mpam_has_cmax_wd_feature(parent) && mpam_has_cmax_wd_feature(child)) { + parent->cmax_wd = child->cmax_wd; + } else if (MISMATCHED_HELPER(parent, child, mpam_has_cmax_wd_feature, + cmax_wd, alias)) { + pr_debug("%s took the min cmax_wd\n", __func__); + parent->cmax_wd = min(parent->cmax_wd, child->cmax_wd); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_cmax_cassoc, alias)) { + parent->cassoc_wd = child->cassoc_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_cmax_cassoc, + cassoc_wd, alias)) { + pr_debug("%s cleared cassoc_wd\n", __func__); + mpam_clear_feature(mpam_feat_cmax_cassoc, parent); + parent->cassoc_wd = 0; + } + + /* For num properties, take the minimum */ + if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_csu, alias)) { + parent->num_csu_mon = child->num_csu_mon; + } else if 
(MISMATCHED_FEAT(parent, child, mpam_feat_msmon_csu, + num_csu_mon, alias)) { + pr_debug("took the min num_csu_mon\n"); + parent->num_csu_mon = min(parent->num_csu_mon, + child->num_csu_mon); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_mbwu, alias)) { + parent->num_mbwu_mon = child->num_mbwu_mon; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_msmon_mbwu, + num_mbwu_mon, alias)) { + pr_debug("took the min num_mbwu_mon\n"); + parent->num_mbwu_mon = min(parent->num_mbwu_mon, + child->num_mbwu_mon); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_intpri_part, alias)) { + parent->intpri_wd = child->intpri_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_intpri_part, + intpri_wd, alias)) { + pr_debug("%s took the min intpri_wd\n", __func__); + parent->intpri_wd = min(parent->intpri_wd, child->intpri_wd); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_dspri_part, alias)) { + parent->dspri_wd = child->dspri_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_dspri_part, + dspri_wd, alias)) { + pr_debug("%s took the min dspri_wd\n", __func__); + parent->dspri_wd = min(parent->dspri_wd, child->dspri_wd); + } + + /* TODO: alias support for these two */ + /* {int,ds}pri may not have differing 0-low behaviour */ + if (mpam_has_feature(mpam_feat_intpri_part, parent) && + (!mpam_has_feature(mpam_feat_intpri_part, child) || + mpam_has_feature(mpam_feat_intpri_part_0_low, parent) != + mpam_has_feature(mpam_feat_intpri_part_0_low, child))) { + pr_debug("%s cleared intpri_part\n", __func__); + mpam_clear_feature(mpam_feat_intpri_part, parent); + mpam_clear_feature(mpam_feat_intpri_part_0_low, parent); + } + if (mpam_has_feature(mpam_feat_dspri_part, parent) && + (!mpam_has_feature(mpam_feat_dspri_part, child) || + mpam_has_feature(mpam_feat_dspri_part_0_low, parent) != + mpam_has_feature(mpam_feat_dspri_part_0_low, child))) { + pr_debug("%s cleared dspri_part\n", __func__); + mpam_clear_feature(mpam_feat_dspri_part, parent); + mpam_clear_feature(mpam_feat_dspri_part_0_low, parent); + } + + if (alias) { + /* Merge features for aliased resources */ + bitmap_or(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); + } else { + /* Clear missing features for non aliasing */ + bitmap_and(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); + } +} + +/* + * If a vmsc doesn't match class feature/configuration, do the right thing(tm). + * For 'num' properties we can just take the minimum. + * For properties where the mismatched unused bits would make a difference, we + * nobble the class feature, as we can't configure all the resources. + * e.g. The L3 cache is composed of two resources with 13 and 17 portion + * bitmaps respectively. 
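+ * A 13-bit portion bitmap cannot describe all 17 portions, so in that case
+ * the portion bitmap feature is cleared from the class.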
+ */ +static void +__class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) +{ + struct mpam_props *cprops = &class->props; + struct mpam_props *vprops = &vmsc->props; + struct device *dev = &vmsc->msc->pdev->dev; + + lockdep_assert_held(&mpam_list_lock); /* we modify class */ + + dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", + (long)cprops->features, (long)vprops->features); + + /* Take the safe value for any common features */ + __props_mismatch(cprops, vprops, false); +} + +static void +__vmsc_props_mismatch(struct mpam_vmsc *vmsc, struct mpam_msc_ris *ris) +{ + struct mpam_props *rprops = &ris->props; + struct mpam_props *vprops = &vmsc->props; + struct device *dev = &vmsc->msc->pdev->dev; + + lockdep_assert_held(&mpam_list_lock); /* we modify vmsc */ + + dev_dbg(dev, "Merging features for vmsc:0x%lx |= ris:0x%lx\n", + (long)vprops->features, (long)rprops->features); + + /* + * Merge mismatched features - Copy any features that aren't common, + * but take the safe value for any common features. + */ + __props_mismatch(vprops, rprops, true); +} + +/* + * Copy the first component's first vMSC's properties and features to the + * class. __class_props_mismatch() will remove conflicts. + * It is not possible to have a class with no components, or a component with + * no resources. The vMSC properties have already been built. + */ +static void mpam_enable_init_class_features(struct mpam_class *class) +{ + struct mpam_vmsc *vmsc; + struct mpam_component *comp; + + comp = list_first_entry(&class->components, + struct mpam_component, class_list); + vmsc = list_first_entry(&comp->vmsc, + struct mpam_vmsc, comp_list); + + class->props = vmsc->props; +} + +static void mpam_enable_merge_vmsc_features(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + struct mpam_class *class = comp->class; + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + list_for_each_entry(ris, &vmsc->ris, vmsc_list) { + __vmsc_props_mismatch(vmsc, ris); + class->nrdy_usec = max(class->nrdy_usec, + vmsc->msc->nrdy_usec); + } + } +} + +static void mpam_enable_merge_class_features(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + struct mpam_class *class = comp->class; + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) + __class_props_mismatch(class, vmsc); +} + +/* + * Merge all the common resource features into class. + * vmsc features are bitwise-or'd together by mpam_enable_merge_vmsc_features() + * as the first step so that mpam_enable_init_class_features() can initialise + * the class with a representative set of features. + * Next the mpam_enable_merge_class_features() bitwise-and's all the vmsc + * features to form the class features. + * Other features are the min/max as appropriate. + * + * To avoid walking the whole tree twice, the class->nrdy_usec property is + * updated when working with the vmsc as it is a max(), and doesn't need + * initialising first. 
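+ * In outline:
+ *   for each class:
+ *     for each component: OR each RIS's features into its vMSC
+ *     copy the first component's first vMSC's properties to the class
+ *     for each component: AND each vMSC's features into the class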
+ */ +static void mpam_enable_merge_features(struct list_head *all_classes_list) +{ + struct mpam_class *class; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, all_classes_list, classes_list) { + list_for_each_entry(comp, &class->components, class_list) + mpam_enable_merge_vmsc_features(comp); + + mpam_enable_init_class_features(class); + + list_for_each_entry(comp, &class->components, class_list) + mpam_enable_merge_class_features(comp); + } +} + +static char *mpam_errcode_names[16] = { + [MPAM_ERRCODE_NONE] = "No error", + [MPAM_ERRCODE_PARTID_SEL_RANGE] = "PARTID_SEL_Range", + [MPAM_ERRCODE_REQ_PARTID_RANGE] = "Req_PARTID_Range", + [MPAM_ERRCODE_MSMONCFG_ID_RANGE] = "MSMONCFG_ID_RANGE", + [MPAM_ERRCODE_REQ_PMG_RANGE] = "Req_PMG_Range", + [MPAM_ERRCODE_MONITOR_RANGE] = "Monitor_Range", + [MPAM_ERRCODE_INTPARTID_RANGE] = "intPARTID_Range", + [MPAM_ERRCODE_UNEXPECTED_INTERNAL] = "Unexpected_INTERNAL", + [MPAM_ERRCODE_UNDEFINED_RIS_PART_SEL] = "Undefined_RIS_PART_SEL", + [MPAM_ERRCODE_RIS_NO_CONTROL] = "RIS_No_Control", + [MPAM_ERRCODE_UNDEFINED_RIS_MON_SEL] = "Undefined_RIS_MON_SEL", + [MPAM_ERRCODE_RIS_NO_MONITOR] = "RIS_No_Monitor", + [12 ... 15] = "Reserved" +}; + +static int mpam_enable_msc_ecr(void *_msc) +{ + struct mpam_msc *msc = _msc; + + __mpam_write_reg(msc, MPAMF_ECR, MPAMF_ECR_INTEN); + + return 0; +} + +/* This can run in mpam_disable(), and the interrupt handler on the same CPU */ +static int mpam_disable_msc_ecr(void *_msc) +{ + struct mpam_msc *msc = _msc; + + __mpam_write_reg(msc, MPAMF_ECR, 0); + + return 0; +} + +static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) +{ + u64 reg; + u16 partid; + u8 errcode, pmg, ris; + + if (WARN_ON_ONCE(!msc) || + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &msc->accessibility))) + return IRQ_NONE; + + reg = mpam_msc_read_esr(msc); + + errcode = FIELD_GET(MPAMF_ESR_ERRCODE, reg); + if (!errcode) + return IRQ_NONE; + + /* Clear level triggered irq */ + mpam_msc_clear_esr(msc); + + partid = FIELD_GET(MPAMF_ESR_PARTID_MON, reg); + pmg = FIELD_GET(MPAMF_ESR_PMG, reg); + ris = FIELD_GET(MPAMF_ESR_RIS, reg); + + pr_err_ratelimited("error irq from msc:%u '%s', partid:%u, pmg: %u, ris: %u\n", + msc->id, mpam_errcode_names[errcode], partid, pmg, + ris); + + /* Disable this interrupt. */ + mpam_disable_msc_ecr(msc); + + /* Are we racing with the thread disabling MPAM? */ + if (!mpam_is_enabled()) + return IRQ_HANDLED; + + /* + * Schedule the teardown work. Don't use a threaded IRQ as we can't + * unregister the interrupt from the threaded part of the handler. 
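+	 * (Freeing an interrupt from within its own handler would deadlock,
+	 * as free_irq() waits for any running handler to complete.)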
+ */ + mpam_disable_reason = "hardware error interrupt"; + schedule_work(&mpam_broken_work); + + return IRQ_HANDLED; +} + +static irqreturn_t mpam_ppi_handler(int irq, void *dev_id) +{ + struct mpam_msc *msc = *(struct mpam_msc **)dev_id; + + return __mpam_irq_handler(irq, msc); +} + +static irqreturn_t mpam_spi_handler(int irq, void *dev_id) +{ + struct mpam_msc *msc = dev_id; + + return __mpam_irq_handler(irq, msc); +} + +static int mpam_register_irqs(void) +{ + int err, irq; + struct mpam_msc *msc; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + continue; + + /* The MPAM spec says the interrupt can be SPI, PPI or LPI */ + /* We anticipate sharing the interrupt with other MSCs */ + if (irq_is_percpu(irq)) { + err = request_percpu_irq(irq, &mpam_ppi_handler, + "mpam:msc:error", + msc->error_dev_id); + if (err) + return err; + + msc->reenable_error_ppi = irq; + smp_call_function_many(&msc->accessibility, + &_enable_percpu_irq, &irq, + true); + } else { + err = devm_request_irq(&msc->pdev->dev, irq, + &mpam_spi_handler, IRQF_SHARED, + "mpam:msc:error", msc); + if (err) + return err; + } + + mutex_lock(&msc->error_irq_lock); + msc->error_irq_req = true; + mpam_touch_msc(msc, mpam_enable_msc_ecr, msc); + msc->error_irq_hw_enabled = true; + mutex_unlock(&msc->error_irq_lock); + } + + return 0; +} + +static void mpam_unregister_irqs(void) +{ + int irq; + struct mpam_msc *msc; + + guard(cpus_read_lock)(); + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + continue; + + mutex_lock(&msc->error_irq_lock); + if (msc->error_irq_hw_enabled) { + mpam_touch_msc(msc, mpam_disable_msc_ecr, msc); + msc->error_irq_hw_enabled = false; + } + + if (msc->error_irq_req) { + if (irq_is_percpu(irq)) { + msc->reenable_error_ppi = 0; + free_percpu_irq(irq, msc->error_dev_id); + } else { + devm_free_irq(&msc->pdev->dev, irq, msc); + } + msc->error_irq_req = false; + } + mutex_unlock(&msc->error_irq_lock); + } +} + +static void __destroy_component_cfg(struct mpam_component *comp) +{ + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + lockdep_assert_held(&mpam_list_lock); + + add_to_garbage(comp->cfg); + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + msc = vmsc->msc; + + if (mpam_mon_sel_lock(msc)) { + list_for_each_entry(ris, &vmsc->ris, vmsc_list) + add_to_garbage(ris->mbwu_state); + mpam_mon_sel_unlock(msc); + } + } +} + +static void mpam_reset_component_cfg(struct mpam_component *comp) +{ + int i; + struct mpam_props *cprops = &comp->class->props; + + mpam_assert_partid_sizes_fixed(); + + if (!comp->cfg) + return; + + for (i = 0; i <= mpam_partid_max; i++) { + comp->cfg[i] = (struct mpam_config) {}; + if (cprops->cpbm_wd) + comp->cfg[i].cpbm = GENMASK(cprops->cpbm_wd - 1, 0); + if (cprops->mbw_pbm_bits) + comp->cfg[i].mbw_pbm = GENMASK(cprops->mbw_pbm_bits - 1, 0); + if (cprops->bwa_wd) + comp->cfg[i].mbw_max = GENMASK(15, 16 - cprops->bwa_wd); + } +} + +static int __allocate_component_cfg(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + + mpam_assert_partid_sizes_fixed(); + + if (comp->cfg) + return 0; + + comp->cfg = kcalloc(mpam_partid_max + 1, sizeof(*comp->cfg), GFP_KERNEL); + if (!comp->cfg) + return -ENOMEM; + + /* + * The 
array is free()d in one go, so only cfg[0]'s structure needs + * to be initialised. + */ + init_garbage(&comp->cfg[0].garbage); + + mpam_reset_component_cfg(comp); + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + struct mpam_msc *msc; + struct mpam_msc_ris *ris; + struct msmon_mbwu_state *mbwu_state; + + if (!vmsc->props.num_mbwu_mon) + continue; + + msc = vmsc->msc; + list_for_each_entry(ris, &vmsc->ris, vmsc_list) { + if (!ris->props.num_mbwu_mon) + continue; + + mbwu_state = kcalloc(ris->props.num_mbwu_mon, + sizeof(*ris->mbwu_state), + GFP_KERNEL); + if (!mbwu_state) { + __destroy_component_cfg(comp); + return -ENOMEM; + } + + init_garbage(&mbwu_state[0].garbage); + + if (mpam_mon_sel_lock(msc)) { + ris->mbwu_state = mbwu_state; + mpam_mon_sel_unlock(msc); + } + } + } + + return 0; +} + +static int mpam_allocate_config(void) +{ + struct mpam_class *class; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, &mpam_classes, classes_list) { + list_for_each_entry(comp, &class->components, class_list) { + int err = __allocate_component_cfg(comp); + if (err) + return err; + } + } + + return 0; +} + +static void mpam_enable_once(void) +{ + int err; + + /* + * Once the cpuhp callbacks have been changed, mpam_partid_max can no + * longer change. + */ + spin_lock(&partid_max_lock); + partid_max_published = true; + spin_unlock(&partid_max_lock); + + /* + * If all the MSC have been probed, enabling the IRQs happens next. + * That involves cross-calling to a CPU that can reach the MSC, and + * the locks must be taken in this order: + */ + cpus_read_lock(); + mutex_lock(&mpam_list_lock); + do { + mpam_enable_merge_features(&mpam_classes); + + err = mpam_register_irqs(); + if (err) { + pr_warn("Failed to register irqs: %d\n", err); + break; + } + + err = mpam_allocate_config(); + if (err) { + pr_err("Failed to allocate configuration arrays.\n"); + break; + } + } while (0); + mutex_unlock(&mpam_list_lock); + cpus_read_unlock(); + + if (err) { + mpam_disable_reason = "Failed to enable."; + schedule_work(&mpam_broken_work); + return; + } + + static_branch_enable(&mpam_enabled); + mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, + "mpam:online"); + + /* Use printk() to avoid the pr_fmt adding the function name. */ + printk(KERN_INFO "MPAM enabled with %u PARTIDs and %u PMGs\n", + mpam_partid_max + 1, mpam_pmg_max + 1); +} + +static void mpam_reset_component_locked(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_cpus_held(); + mpam_assert_partid_sizes_fixed(); + + mpam_reset_component_cfg(comp); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_msc *msc = vmsc->msc; + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!ris->in_reset_state) + mpam_touch_msc(msc, mpam_reset_ris, ris); + ris->in_reset_state = true; + } + } +} + +static void mpam_reset_class_locked(struct mpam_class *class) +{ + struct mpam_component *comp; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) + mpam_reset_component_locked(comp); +} + +static void mpam_reset_class(struct mpam_class *class) +{ + cpus_read_lock(); + mpam_reset_class_locked(class); + cpus_read_unlock(); +} + +/* + * Called in response to an error IRQ. 
+ * All of MPAM's errors indicate a software bug; restore any modified
+ * controls to their reset values.
+ */
+void mpam_disable(struct work_struct *ignored)
+{
+	int idx;
+	struct mpam_class *class;
+	struct mpam_msc *msc, *tmp;
+
+	mutex_lock(&mpam_cpuhp_state_lock);
+	if (mpam_cpuhp_state) {
+		cpuhp_remove_state(mpam_cpuhp_state);
+		mpam_cpuhp_state = 0;
+	}
+	mutex_unlock(&mpam_cpuhp_state_lock);
+
+	static_branch_disable(&mpam_enabled);
+
+	mpam_unregister_irqs();
+
+	idx = srcu_read_lock(&mpam_srcu);
+	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+				 srcu_read_lock_held(&mpam_srcu))
+		mpam_reset_class(class);
+	srcu_read_unlock(&mpam_srcu, idx);
+
+	mutex_lock(&mpam_list_lock);
+	list_for_each_entry_safe(msc, tmp, &mpam_all_msc, all_msc_list)
+		mpam_msc_destroy(msc);
+	mutex_unlock(&mpam_list_lock);
+	mpam_free_garbage();
+
+	pr_err_once("MPAM disabled due to %s\n", mpam_disable_reason);
+}
+
+/*
+ * Enable mpam once all devices have been probed.
+ * Scheduled by mpam_discovery_cpu_online() once all devices have been created.
+ * Also scheduled when new devices are probed as new CPUs come online.
+ */
+void mpam_enable(struct work_struct *work)
+{
+	static atomic_t once;
+	struct mpam_msc *msc;
+	bool all_devices_probed = true;
+
+	/* Have we probed all the hw devices? */
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		mutex_lock(&msc->probe_lock);
+		if (!msc->probed)
+			all_devices_probed = false;
+		mutex_unlock(&msc->probe_lock);
+
+		if (!all_devices_probed)
+			break;
+	}
+
+	if (all_devices_probed && !atomic_fetch_inc(&once))
+		mpam_enable_once();
+}
+
+#define maybe_update_config(cfg, feature, newcfg, member, changes) do {	\
+	if (mpam_has_feature(feature, newcfg) &&				\
+	    (newcfg)->member != (cfg)->member) {				\
+		(cfg)->member = (newcfg)->member;				\
+		mpam_set_feature(feature, cfg);					\
+										\
+		(changes) = true;						\
+	}									\
+} while (0)
+
+static bool mpam_update_config(struct mpam_config *cfg,
+			       const struct mpam_config *newcfg)
+{
+	bool has_changes = false;
+
+	maybe_update_config(cfg, mpam_feat_cpor_part, newcfg, cpbm, has_changes);
+	maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes);
+	maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes);
+
+	return has_changes;
+}
+
+int mpam_apply_config(struct mpam_component *comp, u16 partid,
+		      struct mpam_config *cfg)
+{
+	struct mpam_write_config_arg arg;
+	struct mpam_msc_ris *ris;
+	struct mpam_vmsc *vmsc;
+	struct mpam_msc *msc;
+
+	lockdep_assert_cpus_held();
+
+	/* Don't pass in the current config!
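+	 * comp->cfg[partid] is the cached state that mpam_update_config()
+	 * compares against and updates, so cfg must never alias it.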
+	 */
+	WARN_ON_ONCE(&comp->cfg[partid] == cfg);
+
+	if (!mpam_update_config(&comp->cfg[partid], cfg))
+		return 0;
+
+	arg.comp = comp;
+	arg.partid = partid;
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		msc = vmsc->msc;
+
+		mutex_lock(&msc->cfg_lock);
+		list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list,
+					 srcu_read_lock_held(&mpam_srcu)) {
+			arg.ris = ris;
+			mpam_touch_msc(msc, __write_config, &arg);
+		}
+		mutex_unlock(&msc->cfg_lock);
+	}
+
+	return 0;
+}
+
+static int __init mpam_msc_driver_init(void)
+{
+	if (!system_supports_mpam())
+		return -EOPNOTSUPP;
+
+	init_srcu_struct(&mpam_srcu);
+
+	fw_num_msc = acpi_mpam_count_msc();
+	if (fw_num_msc <= 0) {
+		pr_err("No MSC devices found in firmware\n");
+		return -EINVAL;
+	}
+
+	return platform_driver_register(&mpam_msc_driver);
+}
+
+/* Must occur after arm64_mpam_register_cpus() from arch_initcall() */
+subsys_initcall(mpam_msc_driver_init);
+
+#ifdef CONFIG_MPAM_KUNIT_TEST
+#include "test_mpam_devices.c"
+#endif
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
new file mode 100644
index 000000000000..e79c3c47259c
--- /dev/null
+++ b/drivers/resctrl/mpam_internal.h
@@ -0,0 +1,657 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (C) 2025 Arm Ltd.
+
+#ifndef MPAM_INTERNAL_H
+#define MPAM_INTERNAL_H
+
+#include <linux/arm_mpam.h>
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <linux/io.h>
+#include <linux/jump_label.h>
+#include <linux/llist.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+
+#define MPAM_MSC_MAX_NUM_RIS	16
+
+struct platform_device;
+
+DECLARE_STATIC_KEY_FALSE(mpam_enabled);
+
+#ifdef CONFIG_MPAM_KUNIT_TEST
+#define PACKED_FOR_KUNIT __packed
+#else
+#define PACKED_FOR_KUNIT
+#endif
+
+static inline bool mpam_is_enabled(void)
+{
+	return static_branch_likely(&mpam_enabled);
+}
+
+/*
+ * Structures protected by SRCU may not be freed for a surprising amount of
+ * time (especially if perf is running). To ensure the MPAM error interrupt can
+ * tear down all the structures, build a list of objects that can be garbage
+ * collected once synchronize_srcu() has returned.
+ * If pdev is non-NULL, use devm_kfree().
+ */
+struct mpam_garbage {
+	/* member of mpam_garbage */
+	struct llist_node llist;
+
+	void *to_free;
+	struct platform_device *pdev;
+};
+
+struct mpam_msc {
+	/* member of mpam_all_msc */
+	struct list_head all_msc_list;
+
+	int id;
+	struct platform_device *pdev;
+
+	/* Not modified after mpam_is_enabled() becomes true */
+	enum mpam_msc_iface iface;
+	u32 nrdy_usec;
+	cpumask_t accessibility;
+	bool has_extd_esr;
+
+	int reenable_error_ppi;
+	struct mpam_msc * __percpu *error_dev_id;
+
+	atomic_t online_refs;
+
+	/*
+	 * probe_lock is only taken during discovery. After discovery these
+	 * properties become read-only and the lists are protected by SRCU.
+	 */
+	struct mutex probe_lock;
+	bool probed;
+	u16 partid_max;
+	u8 pmg_max;
+	unsigned long ris_idxs;
+	u32 ris_max;
+
+	/*
+	 * error_irq_lock is taken when registering/unregistering the error
+	 * interrupt and manipulating the flags below.
+	 */
+	struct mutex error_irq_lock;
+	bool error_irq_req;
+	bool error_irq_hw_enabled;
+
+	/* mpam_msc_ris of this MSC */
+	struct list_head ris;
+
+	/*
+	 * part_sel_lock protects access to the MSC hardware registers that are
+	 * affected by MPAMCFG_PART_SEL.
(including the ID registers that vary + * by RIS). + * If needed, take msc->probe_lock first. + */ + struct mutex part_sel_lock; + + /* + * cfg_lock protects the msc configuration and guards against mbwu_state + * save and restore racing. + */ + struct mutex cfg_lock; + + /* + * mon_sel_lock protects access to the MSC hardware registers that are + * affected by MPAMCFG_MON_SEL, and the mbwu_state. + * Access to mon_sel is needed from both process and interrupt contexts, + * but is complicated by firmware-backed platforms that can't make any + * access unless they can sleep. + * Always use the mpam_mon_sel_lock() helpers. + * Accesses to mon_sel need to be able to fail if they occur in the wrong + * context. + * If needed, take msc->probe_lock first. + */ + raw_spinlock_t _mon_sel_lock; + unsigned long _mon_sel_flags; + + void __iomem *mapped_hwpage; + size_t mapped_hwpage_sz; + + struct mpam_garbage garbage; +}; + +/* Returning false here means accesses to mon_sel must fail and report an error. */ +static inline bool __must_check mpam_mon_sel_lock(struct mpam_msc *msc) +{ + /* Locking will require updating to support a firmware backed interface */ + if (WARN_ON_ONCE(msc->iface != MPAM_IFACE_MMIO)) + return false; + + raw_spin_lock_irqsave(&msc->_mon_sel_lock, msc->_mon_sel_flags); + return true; +} + +static inline void mpam_mon_sel_unlock(struct mpam_msc *msc) +{ + raw_spin_unlock_irqrestore(&msc->_mon_sel_lock, msc->_mon_sel_flags); +} + +static inline void mpam_mon_sel_lock_held(struct mpam_msc *msc) +{ + lockdep_assert_held_once(&msc->_mon_sel_lock); +} + +static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) +{ + raw_spin_lock_init(&msc->_mon_sel_lock); +} + +/* Bits for mpam features bitmaps */ +enum mpam_device_features { + mpam_feat_cpor_part, + mpam_feat_cmax_softlim, + mpam_feat_cmax_cmax, + mpam_feat_cmax_cmin, + mpam_feat_cmax_cassoc, + mpam_feat_mbw_part, + mpam_feat_mbw_min, + mpam_feat_mbw_max, + mpam_feat_mbw_prop, + mpam_feat_intpri_part, + mpam_feat_intpri_part_0_low, + mpam_feat_dspri_part, + mpam_feat_dspri_part_0_low, + mpam_feat_msmon, + mpam_feat_msmon_csu, + mpam_feat_msmon_csu_capture, + mpam_feat_msmon_csu_xcl, + mpam_feat_msmon_csu_hw_nrdy, + mpam_feat_msmon_mbwu, + mpam_feat_msmon_mbwu_31counter, + mpam_feat_msmon_mbwu_44counter, + mpam_feat_msmon_mbwu_63counter, + mpam_feat_msmon_mbwu_capture, + mpam_feat_msmon_mbwu_rwbw, + mpam_feat_msmon_mbwu_hw_nrdy, + mpam_feat_partid_nrw, + MPAM_FEATURE_LAST +}; + +struct mpam_props { + DECLARE_BITMAP(features, MPAM_FEATURE_LAST); + + u16 cpbm_wd; + u16 mbw_pbm_bits; + u16 bwa_wd; + u16 cmax_wd; + u16 cassoc_wd; + u16 intpri_wd; + u16 dspri_wd; + u16 num_csu_mon; + u16 num_mbwu_mon; + +/* + * Kunit tests use memset() to set up feature combinations that should be + * removed, and will false-positive if the compiler introduces padding that + * isn't cleared during sanitisation. + */ +} PACKED_FOR_KUNIT; + +#define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) +#define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) +#define mpam_clear_feature(_feat, x) clear_bit(_feat, (x)->features) + +/* The values for MSMON_CFG_MBWU_FLT.RWBW */ +enum mon_filter_options { + COUNT_BOTH = 0, + COUNT_WRITE = 1, + COUNT_READ = 2, +}; + +struct mon_cfg { + u16 mon; + u8 pmg; + bool match_pmg; + bool csu_exclude_clean; + u32 partid; + enum mon_filter_options opts; +}; + +/* Changes to msmon_mbwu_state are protected by the msc's mon_sel_lock. 
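+ * The state is saved by mpam_save_mbwu_state() when the last CPU that can
+ * reach the MSC goes offline, and restored by mpam_restore_mbwu_state()
+ * when the MSC comes back online.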
*/ +struct msmon_mbwu_state { + bool enabled; + bool reset_on_next_read; + struct mon_cfg cfg; + + /* + * The value to add to the new reading to account for power management, + * and overflow. + */ + u64 correction; + + struct mpam_garbage garbage; +}; + +struct mpam_class { + /* mpam_components in this class */ + struct list_head components; + + cpumask_t affinity; + + struct mpam_props props; + u32 nrdy_usec; + u8 level; + enum mpam_class_types type; + + /* member of mpam_classes */ + struct list_head classes_list; + + struct ida ida_csu_mon; + struct ida ida_mbwu_mon; + + struct mpam_garbage garbage; +}; + +struct mpam_config { + /* Which configuration values are valid. */ + DECLARE_BITMAP(features, MPAM_FEATURE_LAST); + + u32 cpbm; + u32 mbw_pbm; + u16 mbw_max; + + bool reset_cpbm; + bool reset_mbw_pbm; + bool reset_mbw_max; + + struct mpam_garbage garbage; +}; + +struct mpam_component { + u32 comp_id; + + /* mpam_vmsc in this component */ + struct list_head vmsc; + + cpumask_t affinity; + + /* + * Array of configuration values, indexed by partid. + * Read from cpuhp callbacks, hold the cpuhp lock when writing. + */ + struct mpam_config *cfg; + + /* member of mpam_class:components */ + struct list_head class_list; + + /* parent: */ + struct mpam_class *class; + + struct mpam_garbage garbage; +}; + +struct mpam_vmsc { + /* member of mpam_component:vmsc_list */ + struct list_head comp_list; + + /* mpam_msc_ris in this vmsc */ + struct list_head ris; + + struct mpam_props props; + + /* All RIS in this vMSC are members of this MSC */ + struct mpam_msc *msc; + + /* parent: */ + struct mpam_component *comp; + + struct mpam_garbage garbage; +}; + +struct mpam_msc_ris { + u8 ris_idx; + u64 idr; + struct mpam_props props; + bool in_reset_state; + + cpumask_t affinity; + + /* member of mpam_vmsc:ris */ + struct list_head vmsc_list; + + /* member of mpam_msc:ris */ + struct list_head msc_list; + + /* parent: */ + struct mpam_vmsc *vmsc; + + /* msmon mbwu configuration is preserved over reset */ + struct msmon_mbwu_state *mbwu_state; + + struct mpam_garbage garbage; +}; + +static inline int mpam_alloc_csu_mon(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return -EOPNOTSUPP; + + return ida_alloc_max(&class->ida_csu_mon, cprops->num_csu_mon - 1, + GFP_KERNEL); +} + +static inline void mpam_free_csu_mon(struct mpam_class *class, int csu_mon) +{ + ida_free(&class->ida_csu_mon, csu_mon); +} + +static inline int mpam_alloc_mbwu_mon(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_mbwu, cprops)) + return -EOPNOTSUPP; + + return ida_alloc_max(&class->ida_mbwu_mon, cprops->num_mbwu_mon - 1, + GFP_KERNEL); +} + +static inline void mpam_free_mbwu_mon(struct mpam_class *class, int mbwu_mon) +{ + ida_free(&class->ida_mbwu_mon, mbwu_mon); +} + +/* List of all classes - protected by srcu*/ +extern struct srcu_struct mpam_srcu; +extern struct list_head mpam_classes; + +/* System wide partid/pmg values */ +extern u16 mpam_partid_max; +extern u8 mpam_pmg_max; + +/* Scheduled work callback to enable mpam once all MSC have been probed */ +void mpam_enable(struct work_struct *work); +void mpam_disable(struct work_struct *work); + +int mpam_apply_config(struct mpam_component *comp, u16 partid, + struct mpam_config *cfg); + +int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, + enum mpam_device_features, u64 *val); +void mpam_msmon_reset_mbwu(struct 
mpam_component *comp, struct mon_cfg *ctx); + +int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, + cpumask_t *affinity); + +/* + * MPAM MSCs have the following register layout. See: + * Arm Memory System Resource Partitioning and Monitoring (MPAM) System + * Component Specification. + * https://developer.arm.com/documentation/ihi0099/aa/ + */ +#define MPAM_ARCHITECTURE_V1 0x10 + +/* Memory mapped control pages */ +/* ID Register offsets in the memory mapped page */ +#define MPAMF_IDR 0x0000 /* features id register */ +#define MPAMF_IIDR 0x0018 /* implementer id register */ +#define MPAMF_AIDR 0x0020 /* architectural id register */ +#define MPAMF_IMPL_IDR 0x0028 /* imp-def partitioning */ +#define MPAMF_CPOR_IDR 0x0030 /* cache-portion partitioning */ +#define MPAMF_CCAP_IDR 0x0038 /* cache-capacity partitioning */ +#define MPAMF_MBW_IDR 0x0040 /* mem-bw partitioning */ +#define MPAMF_PRI_IDR 0x0048 /* priority partitioning */ +#define MPAMF_MSMON_IDR 0x0080 /* performance monitoring features */ +#define MPAMF_CSUMON_IDR 0x0088 /* cache-usage monitor */ +#define MPAMF_MBWUMON_IDR 0x0090 /* mem-bw usage monitor */ +#define MPAMF_PARTID_NRW_IDR 0x0050 /* partid-narrowing */ + +/* Configuration and Status Register offsets in the memory mapped page */ +#define MPAMCFG_PART_SEL 0x0100 /* partid to configure */ +#define MPAMCFG_CPBM 0x1000 /* cache-portion config */ +#define MPAMCFG_CMAX 0x0108 /* cache-capacity config */ +#define MPAMCFG_CMIN 0x0110 /* cache-capacity config */ +#define MPAMCFG_CASSOC 0x0118 /* cache-associativity config */ +#define MPAMCFG_MBW_MIN 0x0200 /* min mem-bw config */ +#define MPAMCFG_MBW_MAX 0x0208 /* max mem-bw config */ +#define MPAMCFG_MBW_WINWD 0x0220 /* mem-bw accounting window config */ +#define MPAMCFG_MBW_PBM 0x2000 /* mem-bw portion bitmap config */ +#define MPAMCFG_PRI 0x0400 /* priority partitioning config */ +#define MPAMCFG_MBW_PROP 0x0500 /* mem-bw stride config */ +#define MPAMCFG_INTPARTID 0x0600 /* partid-narrowing config */ + +#define MSMON_CFG_MON_SEL 0x0800 /* monitor selector */ +#define MSMON_CFG_CSU_FLT 0x0810 /* cache-usage monitor filter */ +#define MSMON_CFG_CSU_CTL 0x0818 /* cache-usage monitor config */ +#define MSMON_CFG_MBWU_FLT 0x0820 /* mem-bw monitor filter */ +#define MSMON_CFG_MBWU_CTL 0x0828 /* mem-bw monitor config */ +#define MSMON_CSU 0x0840 /* current cache-usage */ +#define MSMON_CSU_CAPTURE 0x0848 /* last cache-usage value captured */ +#define MSMON_MBWU 0x0860 /* current mem-bw usage value */ +#define MSMON_MBWU_CAPTURE 0x0868 /* last mem-bw value captured */ +#define MSMON_MBWU_L 0x0880 /* current long mem-bw usage value */ +#define MSMON_MBWU_L_CAPTURE 0x0890 /* last long mem-bw value captured */ +#define MSMON_CAPT_EVNT 0x0808 /* signal a capture event */ +#define MPAMF_ESR 0x00F8 /* error status register */ +#define MPAMF_ECR 0x00F0 /* error control register */ + +/* MPAMF_IDR - MPAM features ID register */ +#define MPAMF_IDR_PARTID_MAX GENMASK(15, 0) +#define MPAMF_IDR_PMG_MAX GENMASK(23, 16) +#define MPAMF_IDR_HAS_CCAP_PART BIT(24) +#define MPAMF_IDR_HAS_CPOR_PART BIT(25) +#define MPAMF_IDR_HAS_MBW_PART BIT(26) +#define MPAMF_IDR_HAS_PRI_PART BIT(27) +#define MPAMF_IDR_EXT BIT(28) +#define MPAMF_IDR_HAS_IMPL_IDR BIT(29) +#define MPAMF_IDR_HAS_MSMON BIT(30) +#define MPAMF_IDR_HAS_PARTID_NRW BIT(31) +#define MPAMF_IDR_HAS_RIS BIT(32) +#define MPAMF_IDR_HAS_EXTD_ESR BIT(38) +#define MPAMF_IDR_HAS_ESR BIT(39) +#define MPAMF_IDR_RIS_MAX GENMASK(59, 56) + +/* MPAMF_MSMON_IDR - MPAM performance 
monitoring ID register */ +#define MPAMF_MSMON_IDR_MSMON_CSU BIT(16) +#define MPAMF_MSMON_IDR_MSMON_MBWU BIT(17) +#define MPAMF_MSMON_IDR_HAS_LOCAL_CAPT_EVNT BIT(31) + +/* MPAMF_CPOR_IDR - MPAM features cache portion partitioning ID register */ +#define MPAMF_CPOR_IDR_CPBM_WD GENMASK(15, 0) + +/* MPAMF_CCAP_IDR - MPAM features cache capacity partitioning ID register */ +#define MPAMF_CCAP_IDR_CMAX_WD GENMASK(5, 0) +#define MPAMF_CCAP_IDR_CASSOC_WD GENMASK(12, 8) +#define MPAMF_CCAP_IDR_HAS_CASSOC BIT(28) +#define MPAMF_CCAP_IDR_HAS_CMIN BIT(29) +#define MPAMF_CCAP_IDR_NO_CMAX BIT(30) +#define MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM BIT(31) + +/* MPAMF_MBW_IDR - MPAM features memory bandwidth partitioning ID register */ +#define MPAMF_MBW_IDR_BWA_WD GENMASK(5, 0) +#define MPAMF_MBW_IDR_HAS_MIN BIT(10) +#define MPAMF_MBW_IDR_HAS_MAX BIT(11) +#define MPAMF_MBW_IDR_HAS_PBM BIT(12) +#define MPAMF_MBW_IDR_HAS_PROP BIT(13) +#define MPAMF_MBW_IDR_WINDWR BIT(14) +#define MPAMF_MBW_IDR_BWPBM_WD GENMASK(28, 16) + +/* MPAMF_PRI_IDR - MPAM features priority partitioning ID register */ +#define MPAMF_PRI_IDR_HAS_INTPRI BIT(0) +#define MPAMF_PRI_IDR_INTPRI_0_IS_LOW BIT(1) +#define MPAMF_PRI_IDR_INTPRI_WD GENMASK(9, 4) +#define MPAMF_PRI_IDR_HAS_DSPRI BIT(16) +#define MPAMF_PRI_IDR_DSPRI_0_IS_LOW BIT(17) +#define MPAMF_PRI_IDR_DSPRI_WD GENMASK(25, 20) + +/* MPAMF_CSUMON_IDR - MPAM cache storage usage monitor ID register */ +#define MPAMF_CSUMON_IDR_NUM_MON GENMASK(15, 0) +#define MPAMF_CSUMON_IDR_HAS_OFLOW_CAPT BIT(24) +#define MPAMF_CSUMON_IDR_HAS_CEVNT_OFLW BIT(25) +#define MPAMF_CSUMON_IDR_HAS_OFSR BIT(26) +#define MPAMF_CSUMON_IDR_HAS_OFLOW_LNKG BIT(27) +#define MPAMF_CSUMON_IDR_HAS_XCL BIT(29) +#define MPAMF_CSUMON_IDR_CSU_RO BIT(30) +#define MPAMF_CSUMON_IDR_HAS_CAPTURE BIT(31) + +/* MPAMF_MBWUMON_IDR - MPAM memory bandwidth usage monitor ID register */ +#define MPAMF_MBWUMON_IDR_NUM_MON GENMASK(15, 0) +#define MPAMF_MBWUMON_IDR_HAS_RWBW BIT(28) +#define MPAMF_MBWUMON_IDR_LWD BIT(29) +#define MPAMF_MBWUMON_IDR_HAS_LONG BIT(30) +#define MPAMF_MBWUMON_IDR_HAS_CAPTURE BIT(31) + +/* MPAMF_PARTID_NRW_IDR - MPAM PARTID narrowing ID register */ +#define MPAMF_PARTID_NRW_IDR_INTPARTID_MAX GENMASK(15, 0) + +/* MPAMF_IIDR - MPAM implementation ID register */ +#define MPAMF_IIDR_IMPLEMENTER GENMASK(11, 0) +#define MPAMF_IIDR_REVISION GENMASK(15, 12) +#define MPAMF_IIDR_VARIANT GENMASK(19, 16) +#define MPAMF_IIDR_PRODUCTID GENMASK(31, 20) + +/* MPAMF_AIDR - MPAM architecture ID register */ +#define MPAMF_AIDR_ARCH_MINOR_REV GENMASK(3, 0) +#define MPAMF_AIDR_ARCH_MAJOR_REV GENMASK(7, 4) + +/* MPAMCFG_PART_SEL - MPAM partition configuration selection register */ +#define MPAMCFG_PART_SEL_PARTID_SEL GENMASK(15, 0) +#define MPAMCFG_PART_SEL_INTERNAL BIT(16) +#define MPAMCFG_PART_SEL_RIS GENMASK(27, 24) + +/* MPAMCFG_CASSOC - MPAM cache maximum associativity partition configuration register */ +#define MPAMCFG_CASSOC_CASSOC GENMASK(15, 0) + +/* MPAMCFG_CMAX - MPAM cache capacity configuration register */ +#define MPAMCFG_CMAX_SOFTLIM BIT(31) +#define MPAMCFG_CMAX_CMAX GENMASK(15, 0) + +/* MPAMCFG_CMIN - MPAM cache capacity configuration register */ +#define MPAMCFG_CMIN_CMIN GENMASK(15, 0) + +/* + * MPAMCFG_MBW_MIN - MPAM memory minimum bandwidth partitioning configuration + * register + */ +#define MPAMCFG_MBW_MIN_MIN GENMASK(15, 0) + +/* + * MPAMCFG_MBW_MAX - MPAM memory maximum bandwidth partitioning configuration + * register + */ +#define MPAMCFG_MBW_MAX_MAX GENMASK(15, 0) +#define MPAMCFG_MBW_MAX_HARDLIM BIT(31) + +/* 
+ * MPAMCFG_MBW_WINWD - MPAM memory bandwidth partitioning window width + * register + */ +#define MPAMCFG_MBW_WINWD_US_FRAC GENMASK(7, 0) +#define MPAMCFG_MBW_WINWD_US_INT GENMASK(23, 8) + +/* MPAMCFG_PRI - MPAM priority partitioning configuration register */ +#define MPAMCFG_PRI_INTPRI GENMASK(15, 0) +#define MPAMCFG_PRI_DSPRI GENMASK(31, 16) + +/* + * MPAMCFG_MBW_PROP - Memory bandwidth proportional stride partitioning + * configuration register + */ +#define MPAMCFG_MBW_PROP_STRIDEM1 GENMASK(15, 0) +#define MPAMCFG_MBW_PROP_EN BIT(31) + +/* + * MPAMCFG_INTPARTID - MPAM internal partition narrowing configuration register + */ +#define MPAMCFG_INTPARTID_INTPARTID GENMASK(15, 0) +#define MPAMCFG_INTPARTID_INTERNAL BIT(16) + +/* MSMON_CFG_MON_SEL - Memory system performance monitor selection register */ +#define MSMON_CFG_MON_SEL_MON_SEL GENMASK(15, 0) +#define MSMON_CFG_MON_SEL_RIS GENMASK(27, 24) + +/* MPAMF_ESR - MPAM Error Status Register */ +#define MPAMF_ESR_PARTID_MON GENMASK(15, 0) +#define MPAMF_ESR_PMG GENMASK(23, 16) +#define MPAMF_ESR_ERRCODE GENMASK(27, 24) +#define MPAMF_ESR_OVRWR BIT(31) +#define MPAMF_ESR_RIS GENMASK(35, 32) + +/* MPAMF_ECR - MPAM Error Control Register */ +#define MPAMF_ECR_INTEN BIT(0) + +/* Error conditions in accessing memory mapped registers */ +#define MPAM_ERRCODE_NONE 0 +#define MPAM_ERRCODE_PARTID_SEL_RANGE 1 +#define MPAM_ERRCODE_REQ_PARTID_RANGE 2 +#define MPAM_ERRCODE_MSMONCFG_ID_RANGE 3 +#define MPAM_ERRCODE_REQ_PMG_RANGE 4 +#define MPAM_ERRCODE_MONITOR_RANGE 5 +#define MPAM_ERRCODE_INTPARTID_RANGE 6 +#define MPAM_ERRCODE_UNEXPECTED_INTERNAL 7 +#define MPAM_ERRCODE_UNDEFINED_RIS_PART_SEL 8 +#define MPAM_ERRCODE_RIS_NO_CONTROL 9 +#define MPAM_ERRCODE_UNDEFINED_RIS_MON_SEL 10 +#define MPAM_ERRCODE_RIS_NO_MONITOR 11 + +/* + * MSMON_CFG_CSU_CTL - Memory system performance monitor configure cache storage + * usage monitor control register + * MSMON_CFG_MBWU_CTL - Memory system performance monitor configure memory + * bandwidth usage monitor control register + */ +#define MSMON_CFG_x_CTL_TYPE GENMASK(7, 0) +#define MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L BIT(15) +#define MSMON_CFG_x_CTL_MATCH_PARTID BIT(16) +#define MSMON_CFG_x_CTL_MATCH_PMG BIT(17) +#define MSMON_CFG_MBWU_CTL_SCLEN BIT(19) +#define MSMON_CFG_x_CTL_SUBTYPE GENMASK(22, 20) +#define MSMON_CFG_x_CTL_OFLOW_FRZ BIT(24) +#define MSMON_CFG_x_CTL_OFLOW_INTR BIT(25) +#define MSMON_CFG_x_CTL_OFLOW_STATUS BIT(26) +#define MSMON_CFG_x_CTL_CAPT_RESET BIT(27) +#define MSMON_CFG_x_CTL_CAPT_EVNT GENMASK(30, 28) +#define MSMON_CFG_x_CTL_EN BIT(31) + +#define MSMON_CFG_MBWU_CTL_TYPE_MBWU 0x42 +#define MSMON_CFG_CSU_CTL_TYPE_CSU 0x43 + +/* + * MSMON_CFG_CSU_FLT - Memory system performance monitor configure cache storage + * usage monitor filter register + * MSMON_CFG_MBWU_FLT - Memory system performance monitor configure memory + * bandwidth usage monitor filter register + */ +#define MSMON_CFG_x_FLT_PARTID GENMASK(15, 0) +#define MSMON_CFG_x_FLT_PMG GENMASK(23, 16) + +#define MSMON_CFG_MBWU_FLT_RWBW GENMASK(31, 30) +#define MSMON_CFG_CSU_FLT_XCL BIT(31) + +/* + * MSMON_CSU - Memory system performance monitor cache storage usage monitor + * register + * MSMON_CSU_CAPTURE - Memory system performance monitor cache storage usage + * capture register + * MSMON_MBWU - Memory system performance monitor memory bandwidth usage + * monitor register + * MSMON_MBWU_CAPTURE - Memory system performance monitor memory bandwidth usage + * capture register + */ +#define MSMON___VALUE GENMASK(30, 0) +#define MSMON___NRDY 
BIT(31) +#define MSMON___L_NRDY BIT(63) +#define MSMON___L_VALUE GENMASK(43, 0) +#define MSMON___LWD_VALUE GENMASK(62, 0) + +/* + * MSMON_CAPT_EVNT - Memory system performance monitoring capture event + * generation register + */ +#define MSMON_CAPT_EVNT_NOW BIT(0) + +#endif /* MPAM_INTERNAL_H */ diff --git a/drivers/resctrl/test_mpam_devices.c b/drivers/resctrl/test_mpam_devices.c new file mode 100644 index 000000000000..3e8d564a0c64 --- /dev/null +++ b/drivers/resctrl/test_mpam_devices.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. +/* This file is intended to be included into mpam_devices.c */ + +#include <kunit/test.h> + +/* + * This test catches fields that aren't being sanitised - but can't tell you + * which one... + */ +static void test__props_mismatch(struct kunit *test) +{ + struct mpam_props parent = { 0 }; + struct mpam_props child; + + memset(&child, 0xff, sizeof(child)); + __props_mismatch(&parent, &child, false); + + memset(&child, 0, sizeof(child)); + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); + + memset(&child, 0xff, sizeof(child)); + __props_mismatch(&parent, &child, true); + + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); +} + +static struct list_head fake_classes_list; +static struct mpam_class fake_class = { 0 }; +static struct mpam_component fake_comp1 = { 0 }; +static struct mpam_component fake_comp2 = { 0 }; +static struct mpam_vmsc fake_vmsc1 = { 0 }; +static struct mpam_vmsc fake_vmsc2 = { 0 }; +static struct mpam_msc fake_msc1 = { 0 }; +static struct mpam_msc fake_msc2 = { 0 }; +static struct mpam_msc_ris fake_ris1 = { 0 }; +static struct mpam_msc_ris fake_ris2 = { 0 }; +static struct platform_device fake_pdev = { 0 }; + +static inline void reset_fake_hierarchy(void) +{ + INIT_LIST_HEAD(&fake_classes_list); + + memset(&fake_class, 0, sizeof(fake_class)); + fake_class.level = 3; + fake_class.type = MPAM_CLASS_CACHE; + INIT_LIST_HEAD_RCU(&fake_class.components); + INIT_LIST_HEAD(&fake_class.classes_list); + + memset(&fake_comp1, 0, sizeof(fake_comp1)); + memset(&fake_comp2, 0, sizeof(fake_comp2)); + fake_comp1.comp_id = 1; + fake_comp2.comp_id = 2; + INIT_LIST_HEAD(&fake_comp1.vmsc); + INIT_LIST_HEAD(&fake_comp1.class_list); + INIT_LIST_HEAD(&fake_comp2.vmsc); + INIT_LIST_HEAD(&fake_comp2.class_list); + + memset(&fake_vmsc1, 0, sizeof(fake_vmsc1)); + memset(&fake_vmsc2, 0, sizeof(fake_vmsc2)); + INIT_LIST_HEAD(&fake_vmsc1.ris); + INIT_LIST_HEAD(&fake_vmsc1.comp_list); + fake_vmsc1.msc = &fake_msc1; + INIT_LIST_HEAD(&fake_vmsc2.ris); + INIT_LIST_HEAD(&fake_vmsc2.comp_list); + fake_vmsc2.msc = &fake_msc2; + + memset(&fake_ris1, 0, sizeof(fake_ris1)); + memset(&fake_ris2, 0, sizeof(fake_ris2)); + fake_ris1.ris_idx = 1; + INIT_LIST_HEAD(&fake_ris1.msc_list); + fake_ris2.ris_idx = 2; + INIT_LIST_HEAD(&fake_ris2.msc_list); + + fake_msc1.pdev = &fake_pdev; + fake_msc2.pdev = &fake_pdev; + + list_add(&fake_class.classes_list, &fake_classes_list); +} + +static void test_mpam_enable_merge_features(struct kunit *test) +{ + reset_fake_hierarchy(); + + mutex_lock(&mpam_list_lock); + + /* One Class+Comp, two RIS in one vMSC with common features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = NULL; + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc1; + 
list_add(&fake_ris2.vmsc_list, &fake_vmsc1.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two RIS in one vMSC with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = NULL; + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc1; + list_add(&fake_ris2.vmsc_list, &fake_vmsc1.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* Multiple RIS within one MSC controlling the same resource can be mismatched */ + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_vmsc1.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + KUNIT_EXPECT_EQ(test, fake_vmsc1.props.cmax_wd, 4); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple RIS in different MSC can't control the same resource, + * mismatched features can not be supported. 
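+	 * Here cpor_part only exists on ris1, and cmax_cmin only on ris2, so
+	 * neither feature survives at the class.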
+ */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 0); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with incompatible overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + mpam_set_feature(mpam_feat_mbw_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_mbw_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 5; + fake_ris2.props.cpbm_wd = 3; + fake_ris1.props.mbw_pbm_bits = 5; + fake_ris2.props.mbw_pbm_bits = 3; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple RIS in different MSC can't control the same resource, + * mismatched features can not be supported. + */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_mbw_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.mbw_pbm_bits, 0); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with overlapping features that need tweaking */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_mbw_min, &fake_ris1.props); + mpam_set_feature(mpam_feat_mbw_min, &fake_ris2.props); + mpam_set_feature(mpam_feat_cmax_cmax, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmax, &fake_ris2.props); + fake_ris1.props.bwa_wd = 5; + fake_ris2.props.bwa_wd = 3; + fake_ris1.props.cmax_wd = 5; + fake_ris2.props.cmax_wd = 3; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * RIS with different control properties need to be sanitised so the + * class has the common set of properties. 
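+	 * Here bwa_wd and cmax_wd are merged as min(5, 3) == 3.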
+ */ + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmax, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.bwa_wd, 3); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 3); + + reset_fake_hierarchy(); + + /* One Class Two Comp with overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = &fake_class; + list_add(&fake_comp2.class_list, &fake_class.components); + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp2; + list_add(&fake_vmsc2.comp_list, &fake_comp2.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class Two Comp with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = &fake_class; + list_add(&fake_comp2.class_list, &fake_class.components); + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp2; + list_add(&fake_vmsc2.comp_list, &fake_comp2.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple components can't control the same resource, mismatched features can + * not be supported. 
+ */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 0); + + mutex_unlock(&mpam_list_lock); +} + +static void test_mpam_reset_msc_bitmap(struct kunit *test) +{ + char __iomem *buf = kunit_kzalloc(test, SZ_16K, GFP_KERNEL); + struct mpam_msc fake_msc = {}; + u32 *test_result; + + if (!buf) + return; + + fake_msc.mapped_hwpage = buf; + fake_msc.mapped_hwpage_sz = SZ_16K; + cpumask_copy(&fake_msc.accessibility, cpu_possible_mask); + + /* Satisfy lockdep checks */ + mutex_init(&fake_msc.part_sel_lock); + mutex_lock(&fake_msc.part_sel_lock); + + test_result = (u32 *)(buf + MPAMCFG_CPBM); + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 0); + KUNIT_EXPECT_EQ(test, test_result[0], 0); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 1); + KUNIT_EXPECT_EQ(test, test_result[0], 1); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 16); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffff); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 32); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffffffff); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 33); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffffffff); + KUNIT_EXPECT_EQ(test, test_result[1], 1); + test_result[0] = 0; + test_result[1] = 0; + + mutex_unlock(&fake_msc.part_sel_lock); +} + +static struct kunit_case mpam_devices_test_cases[] = { + KUNIT_CASE(test_mpam_reset_msc_bitmap), + KUNIT_CASE(test_mpam_enable_merge_features), + KUNIT_CASE(test__props_mismatch), + {} +}; + +static struct kunit_suite mpam_devices_test_suite = { + .name = "mpam_devices_test_suite", + .test_cases = mpam_devices_test_cases, +}; + +kunit_test_suites(&mpam_devices_test_suite); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 7765e40f7cea..2f3039cca6f2 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3350,7 +3350,6 @@ dasd_device_operations = { .open = dasd_open, .release = dasd_release, .ioctl = dasd_ioctl, - .compat_ioctl = dasd_ioctl, .getgeo = dasd_getgeo, .set_read_only = dasd_set_read_only, }; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 88fa17aea2ec..687396703788 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -16,7 +16,6 @@ #include <linux/hdreg.h> /* HDIO_GETGEO */ #include <linux/bio.h> #include <linux/module.h> -#include <linux/compat.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/uaccess.h> @@ -5389,16 +5388,6 @@ static int dasd_symm_io(struct dasd_device *device, void __user *argp) rc = -EFAULT; if (copy_from_user(&usrparm, argp, sizeof(usrparm))) goto out; - if (is_compat_task()) { - /* Make sure pointers are sane even on 31 bit. 
*/ - rc = -EINVAL; - if ((usrparm.psf_data >> 32) != 0) - goto out; - if ((usrparm.rssd_result >> 32) != 0) - goto out; - usrparm.psf_data &= 0x7fffffffULL; - usrparm.rssd_result &= 0x7fffffffULL; - } /* at least 2 bytes are accessed and should be allocated */ if (usrparm.psf_data_len < 2) { DBF_DEV_EVENT(DBF_WARNING, device, diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index a2216795591d..c2a87201c153 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -5,7 +5,6 @@ * Copyright IBM Corp. 1999, 2009 */ -#define KMSG_COMPONENT "dasd-fba" #include <linux/stddef.h> #include <linux/kernel.h> diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 8308046a9f8f..f883990be626 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -11,7 +11,6 @@ */ #include <linux/interrupt.h> -#include <linux/compat.h> #include <linux/export.h> #include <linux/major.h> #include <linux/fs.h> @@ -616,10 +615,7 @@ int dasd_ioctl(struct block_device *bdev, blk_mode_t mode, void __user *argp; int rc; - if (is_compat_task()) - argp = compat_ptr(arg); - else - argp = (void __user *)arg; + argp = (void __user *)arg; if ((_IOC_DIR(cmd) != _IOC_NONE) && !arg) return -EINVAL; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 86fef4b15015..38e1df8f8a82 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -5,8 +5,7 @@ * Authors: Carsten Otte, Stefan Weinhuber, Gerald Schaefer */ -#define KMSG_COMPONENT "dcssblk" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "dcssblk: " fmt #include <linux/module.h> #include <linux/moduleparam.h> @@ -674,8 +673,8 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char rc = dcssblk_assign_free_minor(dev_info); if (rc) goto release_gd; - sprintf(dev_info->gd->disk_name, "dcssblk%d", - dev_info->gd->first_minor); + scnprintf(dev_info->gd->disk_name, sizeof(dev_info->gd->disk_name), + "dcssblk%d", dev_info->gd->first_minor); list_add_tail(&dev_info->lh, &dcssblk_devices); if (!try_module_get(THIS_MODULE)) { diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 91bbe9d2e5ac..04e84f45dcc9 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -6,8 +6,7 @@ * Author(s): Sebastian Ott <sebott@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "scm_block" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "scm_block: " fmt #include <linux/interrupt.h> #include <linux/spinlock.h> diff --git a/drivers/s390/block/scm_drv.c b/drivers/s390/block/scm_drv.c index 69a845eb8b1f..6cffbbe83f89 100644 --- a/drivers/s390/block/scm_drv.c +++ b/drivers/s390/block/scm_drv.c @@ -6,8 +6,7 @@ * Author(s): Sebastian Ott <sebott@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "scm_block" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "scm_block: " fmt #include <linux/module.h> #include <linux/slab.h> diff --git a/drivers/s390/char/con3270.c b/drivers/s390/char/con3270.c index a367f95c7c53..4a7c084e68a6 100644 --- a/drivers/s390/char/con3270.c +++ b/drivers/s390/char/con3270.c @@ -21,7 +21,6 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/memblock.h> -#include <linux/compat.h> #include <asm/machine.h> #include <asm/ccwdev.h> @@ -1662,7 +1661,7 @@ static void tty3270_escape_sequence(struct tty3270 *tp, u8 ch) else if (tp->esc_par[0] == 6) { /* Cursor report. 
*/ char buf[40]; - sprintf(buf, "\033[%d;%dR", tp->cy + 1, tp->cx + 1); + scnprintf(buf, sizeof(buf), "\033[%d;%dR", tp->cy + 1, tp->cx + 1); kbd_puts_queue(&tp->port, buf); } return; @@ -1947,21 +1946,6 @@ static int tty3270_ioctl(struct tty_struct *tty, unsigned int cmd, return kbd_ioctl(tp->kbd, cmd, arg); } -#ifdef CONFIG_COMPAT -static long tty3270_compat_ioctl(struct tty_struct *tty, - unsigned int cmd, unsigned long arg) -{ - struct tty3270 *tp; - - tp = tty->driver_data; - if (!tp) - return -ENODEV; - if (tty_io_error(tty)) - return -EIO; - return kbd_ioctl(tp->kbd, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct tty_operations tty3270_ops = { .install = tty3270_install, .cleanup = tty3270_cleanup, @@ -1976,9 +1960,6 @@ static const struct tty_operations tty3270_ops = { .hangup = tty3270_hangup, .wait_until_sent = tty3270_wait_until_sent, .ioctl = tty3270_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = tty3270_compat_ioctl, -#endif .set_termios = tty3270_set_termios }; diff --git a/drivers/s390/char/diag_ftp.c b/drivers/s390/char/diag_ftp.c index f41b39c9d267..a1e110c96f74 100644 --- a/drivers/s390/char/diag_ftp.c +++ b/drivers/s390/char/diag_ftp.c @@ -7,8 +7,7 @@ * */ -#define KMSG_COMPONENT "hmcdrv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hmcdrv: " fmt #include <linux/kernel.h> #include <linux/mm.h> diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c index cfe7efd5b5da..73555dbe30d0 100644 --- a/drivers/s390/char/fs3270.c +++ b/drivers/s390/char/fs3270.c @@ -12,7 +12,6 @@ #include <linux/console.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <linux/compat.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/list.h> @@ -330,10 +329,7 @@ static long fs3270_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) fp = filp->private_data; if (!fp) return -ENODEV; - if (is_compat_task()) - argp = compat_ptr(arg); - else - argp = (char __user *)arg; + argp = (char __user *)arg; rc = 0; mutex_lock(&fs3270_mutex); switch (cmd) { @@ -512,7 +508,6 @@ static const struct file_operations fs3270_fops = { .read = fs3270_read, /* read */ .write = fs3270_write, /* write */ .unlocked_ioctl = fs3270_ioctl, /* ioctl */ - .compat_ioctl = fs3270_ioctl, /* ioctl */ .open = fs3270_open, /* open */ .release = fs3270_close, /* release */ }; diff --git a/drivers/s390/char/hmcdrv_cache.c b/drivers/s390/char/hmcdrv_cache.c index 43df27ceec11..85fb689594ca 100644 --- a/drivers/s390/char/hmcdrv_cache.c +++ b/drivers/s390/char/hmcdrv_cache.c @@ -7,8 +7,7 @@ * */ -#define KMSG_COMPONENT "hmcdrv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hmcdrv: " fmt #include <linux/kernel.h> #include <linux/mm.h> diff --git a/drivers/s390/char/hmcdrv_dev.c b/drivers/s390/char/hmcdrv_dev.c index b26fcf6849f2..04b938c5357f 100644 --- a/drivers/s390/char/hmcdrv_dev.c +++ b/drivers/s390/char/hmcdrv_dev.c @@ -14,8 +14,7 @@ * end read() the response. 
*/ -#define KMSG_COMPONENT "hmcdrv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hmcdrv: " fmt #include <linux/kernel.h> #include <linux/module.h> diff --git a/drivers/s390/char/hmcdrv_ftp.c b/drivers/s390/char/hmcdrv_ftp.c index 4e3c7ec6749b..3312b2ac00a9 100644 --- a/drivers/s390/char/hmcdrv_ftp.c +++ b/drivers/s390/char/hmcdrv_ftp.c @@ -6,8 +6,7 @@ * Author(s): Ralf Hoppe (rhoppe@de.ibm.com) */ -#define KMSG_COMPONENT "hmcdrv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hmcdrv: " fmt #include <linux/kernel.h> #include <linux/slab.h> diff --git a/drivers/s390/char/hmcdrv_mod.c b/drivers/s390/char/hmcdrv_mod.c index 1447d0887225..b1cc5ba9fed8 100644 --- a/drivers/s390/char/hmcdrv_mod.c +++ b/drivers/s390/char/hmcdrv_mod.c @@ -6,8 +6,7 @@ * Author(s): Ralf Hoppe (rhoppe@de.ibm.com) */ -#define KMSG_COMPONENT "hmcdrv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hmcdrv: " fmt #include <linux/kernel.h> #include <linux/module.h> diff --git a/drivers/s390/char/monreader.c b/drivers/s390/char/monreader.c index 2d9886651d9b..3d84f84b4cbd 100644 --- a/drivers/s390/char/monreader.c +++ b/drivers/s390/char/monreader.c @@ -7,8 +7,7 @@ * Author: Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "monreader" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "monreader: " fmt #include <linux/module.h> #include <linux/moduleparam.h> diff --git a/drivers/s390/char/monwriter.c b/drivers/s390/char/monwriter.c index 0fab1f025a94..cf2e51061422 100644 --- a/drivers/s390/char/monwriter.c +++ b/drivers/s390/char/monwriter.c @@ -7,8 +7,7 @@ * Author(s): Melissa Howland <Melissa.Howland@us.ibm.com> */ -#define KMSG_COMPONENT "monwriter" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "monwriter: " fmt #include <linux/module.h> #include <linux/moduleparam.h> diff --git a/drivers/s390/char/sclp_ap.c b/drivers/s390/char/sclp_ap.c index 0dd1ca712795..18bb018b4e0c 100644 --- a/drivers/s390/char/sclp_ap.c +++ b/drivers/s390/char/sclp_ap.c @@ -4,8 +4,7 @@ * * Copyright IBM Corp. 2020 */ -#define KMSG_COMPONENT "sclp_cmd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_cmd: " fmt #include <linux/export.h> #include <linux/slab.h> diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 3480198eac02..be4730936f5c 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -5,8 +5,7 @@ * Author(s): Peter Oberparleiter <peter.oberparleiter@de.ibm.com> */ -#define KMSG_COMPONENT "sclp_cmd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_cmd: " fmt #include <linux/completion.h> #include <linux/err.h> diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c index 356d26a09af0..9cfbe3fc3dca 100644 --- a/drivers/s390/char/sclp_config.c +++ b/drivers/s390/char/sclp_config.c @@ -3,8 +3,7 @@ * Copyright IBM Corp. 
2007 */ -#define KMSG_COMPONENT "sclp_config" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_config: " fmt #include <linux/init.h> #include <linux/errno.h> diff --git a/drivers/s390/char/sclp_cpi_sys.c b/drivers/s390/char/sclp_cpi_sys.c index d8f91aab11e8..8e1636bcf8b5 100644 --- a/drivers/s390/char/sclp_cpi_sys.c +++ b/drivers/s390/char/sclp_cpi_sys.c @@ -7,8 +7,7 @@ * Michael Ernst <mernst@de.ibm.com> */ -#define KMSG_COMPONENT "sclp_cpi" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_cpi: " fmt #include <linux/kernel.h> #include <linux/init.h> diff --git a/drivers/s390/char/sclp_ctl.c b/drivers/s390/char/sclp_ctl.c index dd6051602070..e23a97359286 100644 --- a/drivers/s390/char/sclp_ctl.c +++ b/drivers/s390/char/sclp_ctl.c @@ -7,7 +7,6 @@ * Author: Michael Holzheu <holzheu@linux.vnet.ibm.com> */ -#include <linux/compat.h> #include <linux/uaccess.h> #include <linux/miscdevice.h> #include <linux/gfp.h> @@ -43,10 +42,7 @@ static int sclp_ctl_cmdw_supported(unsigned int cmdw) static void __user *u64_to_uptr(u64 value) { - if (is_compat_task()) - return compat_ptr(value); - else - return (void __user *)(unsigned long)value; + return (void __user *)(unsigned long)value; } /* @@ -95,10 +91,7 @@ static long sclp_ctl_ioctl(struct file *filp, unsigned int cmd, { void __user *argp; - if (is_compat_task()) - argp = compat_ptr(arg); - else - argp = (void __user *) arg; + argp = (void __user *)arg; switch (cmd) { case SCLP_CTL_SCCB: return sclp_ctl_ioctl_sccb(argp); @@ -114,7 +107,6 @@ static const struct file_operations sclp_ctl_fops = { .owner = THIS_MODULE, .open = nonseekable_open, .unlocked_ioctl = sclp_ctl_ioctl, - .compat_ioctl = sclp_ctl_ioctl, }; /* diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index bd5e5ba50c0a..6bf501ad8ff0 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2013 */ -#define KMSG_COMPONENT "sclp_early" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_early: " fmt #include <linux/export.h> #include <linux/errno.h> diff --git a/drivers/s390/char/sclp_ftp.c b/drivers/s390/char/sclp_ftp.c index d27e2cbfbccb..2a1c4b2cafc8 100644 --- a/drivers/s390/char/sclp_ftp.c +++ b/drivers/s390/char/sclp_ftp.c @@ -7,8 +7,7 @@ * */ -#define KMSG_COMPONENT "hmcdrv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hmcdrv: " fmt #include <linux/kernel.h> #include <linux/mm.h> diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c index 27f49f5fd358..676c085b4f8a 100644 --- a/drivers/s390/char/sclp_mem.c +++ b/drivers/s390/char/sclp_mem.c @@ -5,13 +5,15 @@ * Copyright IBM Corp. 
2025 */ -#define KMSG_COMPONENT "sclp_mem" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_mem: " fmt #include <linux/cpufeature.h> +#include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/init.h> +#include <linux/kobject.h> +#include <linux/kstrtox.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> #include <linux/mm.h> @@ -27,7 +29,6 @@ #define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 #define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 -static DEFINE_MUTEX(sclp_mem_mutex); static LIST_HEAD(sclp_mem_list); static u8 sclp_max_storage_id; static DECLARE_BITMAP(sclp_storage_ids, 256); @@ -38,6 +39,18 @@ struct memory_increment { int standby; }; +struct sclp_mem { + struct kobject kobj; + unsigned int id; + unsigned int memmap_on_memory; + unsigned int config; +}; + +struct sclp_mem_arg { + struct sclp_mem *sclp_mems; + struct kset *kset; +}; + struct assign_storage_sccb { struct sccb_header header; u16 rn; @@ -163,92 +176,168 @@ static int sclp_mem_change_state(unsigned long start, unsigned long size, return rc ? -EIO : 0; } -static bool contains_standby_increment(unsigned long start, unsigned long end) +static ssize_t sclp_config_mem_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct memory_increment *incr; - unsigned long istart; + struct sclp_mem *sclp_mem = container_of(kobj, struct sclp_mem, kobj); - list_for_each_entry(incr, &sclp_mem_list, list) { - istart = rn2addr(incr->rn); - if (end - 1 < istart) - continue; - if (start > istart + sclp.rzm - 1) - continue; - if (incr->standby) - return true; - } - return false; + return sysfs_emit(buf, "%u\n", READ_ONCE(sclp_mem->config)); } -static int sclp_mem_notifier(struct notifier_block *nb, - unsigned long action, void *data) +static ssize_t sclp_config_mem_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) { - unsigned long start, size; - struct memory_notify *arg; + unsigned long addr, block_size; + struct sclp_mem *sclp_mem; + struct memory_block *mem; unsigned char id; - int rc = 0; + bool value; + int rc; - arg = data; - start = arg->start_pfn << PAGE_SHIFT; - size = arg->nr_pages << PAGE_SHIFT; - mutex_lock(&sclp_mem_mutex); + rc = kstrtobool(buf, &value); + if (rc) + return rc; + sclp_mem = container_of(kobj, struct sclp_mem, kobj); + block_size = memory_block_size_bytes(); + addr = sclp_mem->id * block_size; + /* + * Hold device_hotplug_lock when adding/removing memory blocks. + * Additionally, also protect calls to find_memory_block() and + * sclp_attach_storage(). + */ + rc = lock_device_hotplug_sysfs(); + if (rc) + goto out; for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1) sclp_attach_storage(id); - switch (action) { - case MEM_GOING_OFFLINE: - /* - * Do not allow to set memory blocks offline that contain - * standby memory. This is done to simplify the "memory online" - * case. - */ - if (contains_standby_increment(start, start + size)) - rc = -EPERM; - break; - case MEM_PREPARE_ONLINE: + if (value) { + if (sclp_mem->config) + goto out_unlock; + rc = sclp_mem_change_state(addr, block_size, 1); + if (rc) + goto out_unlock; /* - * Access the altmap_start_pfn and altmap_nr_pages fields - * within the struct memory_notify specifically when dealing - * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. - * - * When altmap is in use, take the specified memory range - * online, which includes the altmap. + * Set entire memory block CMMA state to nodat. 
Later, when + * page tables pages are allocated via __add_memory(), those + * regions are marked __arch_set_page_dat(). */ - if (arg->altmap_nr_pages) { - start = PFN_PHYS(arg->altmap_start_pfn); - size += PFN_PHYS(arg->altmap_nr_pages); + __arch_set_page_nodat((void *)__va(addr), block_size >> PAGE_SHIFT); + rc = __add_memory(0, addr, block_size, + sclp_mem->memmap_on_memory ? + MHP_MEMMAP_ON_MEMORY : MHP_NONE); + if (rc) { + sclp_mem_change_state(addr, block_size, 0); + goto out_unlock; } - rc = sclp_mem_change_state(start, size, 1); - if (rc || !arg->altmap_nr_pages) - break; - /* - * Set CMMA state to nodat here, since the struct page memory - * at the beginning of the memory block will not go through the - * buddy allocator later. - */ - __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); - break; - case MEM_FINISH_OFFLINE: - /* - * When altmap is in use, take the specified memory range - * offline, which includes the altmap. - */ - if (arg->altmap_nr_pages) { - start = PFN_PHYS(arg->altmap_start_pfn); - size += PFN_PHYS(arg->altmap_nr_pages); + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr))); + put_device(&mem->dev); + WRITE_ONCE(sclp_mem->config, 1); + } else { + if (!sclp_mem->config) + goto out_unlock; + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr))); + if (mem->state != MEM_OFFLINE) { + put_device(&mem->dev); + rc = -EBUSY; + goto out_unlock; } - sclp_mem_change_state(start, size, 0); - break; - default: - break; + /* drop the ref just got via find_memory_block() */ + put_device(&mem->dev); + sclp_mem_change_state(addr, block_size, 0); + __remove_memory(addr, block_size); + WRITE_ONCE(sclp_mem->config, 0); + } +out_unlock: + unlock_device_hotplug(); +out: + return rc ? rc : count; +} + +static struct kobj_attribute sclp_config_mem_attr = + __ATTR(config, 0644, sclp_config_mem_show, sclp_config_mem_store); + +static ssize_t sclp_memmap_on_memory_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct sclp_mem *sclp_mem = container_of(kobj, struct sclp_mem, kobj); + + return sysfs_emit(buf, "%u\n", READ_ONCE(sclp_mem->memmap_on_memory)); +} + +static ssize_t sclp_memmap_on_memory_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct sclp_mem *sclp_mem; + unsigned long block_size; + struct memory_block *mem; + bool value; + int rc; + + rc = kstrtobool(buf, &value); + if (rc) + return rc; + if (value && !mhp_supports_memmap_on_memory()) + return -EOPNOTSUPP; + rc = lock_device_hotplug_sysfs(); + if (rc) + return rc; + block_size = memory_block_size_bytes(); + sclp_mem = container_of(kobj, struct sclp_mem, kobj); + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(sclp_mem->id * block_size))); + if (!mem) { + WRITE_ONCE(sclp_mem->memmap_on_memory, value); + } else { + put_device(&mem->dev); + rc = -EBUSY; } - mutex_unlock(&sclp_mem_mutex); - return rc ? NOTIFY_BAD : NOTIFY_OK; + unlock_device_hotplug(); + return rc ? 
rc : count; } -static struct notifier_block sclp_mem_nb = { - .notifier_call = sclp_mem_notifier, +static const struct kobj_type ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static struct kobj_attribute sclp_memmap_attr = + __ATTR(memmap_on_memory, 0644, sclp_memmap_on_memory_show, sclp_memmap_on_memory_store); + +static struct attribute *sclp_mem_attrs[] = { + &sclp_config_mem_attr.attr, + &sclp_memmap_attr.attr, + NULL, }; +static struct attribute_group sclp_mem_attr_group = { + .attrs = sclp_mem_attrs, +}; + +static int sclp_create_mem(struct sclp_mem *sclp_mem, struct kset *kset, + unsigned int id, bool config, bool memmap_on_memory) +{ + int rc; + + sclp_mem->memmap_on_memory = memmap_on_memory; + sclp_mem->config = config; + sclp_mem->id = id; + kobject_init(&sclp_mem->kobj, &ktype); + rc = kobject_add(&sclp_mem->kobj, &kset->kobj, "memory%d", id); + if (rc) + return rc; + return sysfs_create_group(&sclp_mem->kobj, &sclp_mem_attr_group); +} + +static int sclp_create_configured_mem(struct memory_block *mem, void *argument) +{ + struct sclp_mem *sclp_mems; + struct sclp_mem_arg *arg; + struct kset *kset; + unsigned int id; + + id = mem->dev.id; + arg = (struct sclp_mem_arg *)argument; + sclp_mems = arg->sclp_mems; + kset = arg->kset; + return sclp_create_mem(&sclp_mems[id], kset, id, true, false); +} + static void __init align_to_block_size(unsigned long *start, unsigned long *size, unsigned long alignment) @@ -264,14 +353,17 @@ static void __init align_to_block_size(unsigned long *start, *size = size_align; } -static void __init add_memory_merged(u16 rn) +static int __init sclp_create_standby_mems_merged(struct sclp_mem *sclp_mems, + struct kset *kset, u16 rn) { unsigned long start, size, addr, block_size; static u16 first_rn, num; + unsigned int id; + int rc = 0; if (rn && first_rn && (first_rn + num == rn)) { num++; - return; + return rc; } if (!first_rn) goto skip_add; @@ -286,24 +378,57 @@ static void __init add_memory_merged(u16 rn) if (!size) goto skip_add; for (addr = start; addr < start + size; addr += block_size) { - add_memory(0, addr, block_size, - cpu_has_edat1() ? - MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); + id = addr / block_size; + rc = sclp_create_mem(&sclp_mems[id], kset, id, false, + mhp_supports_memmap_on_memory()); + if (rc) + break; } skip_add: first_rn = rn; num = 1; + return rc; } -static void __init sclp_add_standby_memory(void) +static int __init sclp_create_standby_mems(struct sclp_mem *sclp_mems, struct kset *kset) { struct memory_increment *incr; + int rc = 0; list_for_each_entry(incr, &sclp_mem_list, list) { if (incr->standby) - add_memory_merged(incr->rn); + rc = sclp_create_standby_mems_merged(sclp_mems, kset, incr->rn); + if (rc) + return rc; } - add_memory_merged(0); + return sclp_create_standby_mems_merged(sclp_mems, kset, 0); +} + +static int __init sclp_init_mem(void) +{ + const unsigned long block_size = memory_block_size_bytes(); + unsigned int max_sclp_mems; + struct sclp_mem *sclp_mems; + struct sclp_mem_arg arg; + struct kset *kset; + int rc; + + max_sclp_mems = roundup(sclp.rnmax * sclp.rzm, block_size) / block_size; + /* Allocate memory for all blocks ahead of time. */ + sclp_mems = kcalloc(max_sclp_mems, sizeof(struct sclp_mem), GFP_KERNEL); + if (!sclp_mems) + return -ENOMEM; + kset = kset_create_and_add("memory", NULL, firmware_kobj); + if (!kset) + return -ENOMEM; + /* Initial memory is in the "configured" state already. 
*/ + arg.sclp_mems = sclp_mems; + arg.kset = kset; + rc = for_each_memory_block(&arg, sclp_create_configured_mem); + if (rc) + return rc; + /* Standby memory is "deconfigured". */ + return sclp_create_standby_mems(sclp_mems, kset); } static void __init insert_increment(u16 rn, int standby, int assigned) @@ -336,7 +461,7 @@ static void __init insert_increment(u16 rn, int standby, int assigned) list_add(&new_incr->list, prev); } -static int __init sclp_detect_standby_memory(void) +static int __init sclp_setup_memory(void) { struct read_storage_sccb *sccb; int i, id, assigned, rc; @@ -388,12 +513,9 @@ static int __init sclp_detect_standby_memory(void) goto out; for (i = 1; i <= sclp.rnmax - assigned; i++) insert_increment(0, 1, 0); - rc = register_memory_notifier(&sclp_mem_nb); - if (rc) - goto out; - sclp_add_standby_memory(); + rc = sclp_init_mem(); out: free_page((unsigned long)sccb); return rc; } -__initcall(sclp_detect_standby_memory); +__initcall(sclp_setup_memory); diff --git a/drivers/s390/char/sclp_ocf.c b/drivers/s390/char/sclp_ocf.c index ae2479b804d8..35f3a4a08b12 100644 --- a/drivers/s390/char/sclp_ocf.c +++ b/drivers/s390/char/sclp_ocf.c @@ -6,8 +6,7 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#define KMSG_COMPONENT "sclp_ocf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_ocf: " fmt #include <linux/export.h> #include <linux/kernel.h> diff --git a/drivers/s390/char/sclp_pci.c b/drivers/s390/char/sclp_pci.c index 56400886f7fc..899063e64aef 100644 --- a/drivers/s390/char/sclp_pci.c +++ b/drivers/s390/char/sclp_pci.c @@ -4,8 +4,7 @@ * * Copyright IBM Corp. 2016 */ -#define KMSG_COMPONENT "sclp_cmd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_cmd: " fmt #include <linux/completion.h> #include <linux/export.h> diff --git a/drivers/s390/char/sclp_sd.c b/drivers/s390/char/sclp_sd.c index 129b89fe40a3..bb1bce70ec00 100644 --- a/drivers/s390/char/sclp_sd.c +++ b/drivers/s390/char/sclp_sd.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2017 */ -#define KMSG_COMPONENT "sclp_sd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_sd: " fmt #include <linux/completion.h> #include <linux/jiffies.h> diff --git a/drivers/s390/char/sclp_sdias.c b/drivers/s390/char/sclp_sdias.c index e915a343fcf5..ab8f1b758a1a 100644 --- a/drivers/s390/char/sclp_sdias.c +++ b/drivers/s390/char/sclp_sdias.c @@ -6,8 +6,7 @@ * Author(s): Michael Holzheu */ -#define KMSG_COMPONENT "sclp_sdias" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "sclp_sdias: " fmt #include <linux/completion.h> #include <linux/sched.h> diff --git a/drivers/s390/char/tape.h b/drivers/s390/char/tape.h index 0aba30efb483..3953b31b0c55 100644 --- a/drivers/s390/char/tape.h +++ b/drivers/s390/char/tape.h @@ -130,6 +130,7 @@ struct tape_request { int retries; /* retry counter for error recovery. */ int rescnt; /* residual count from devstat. */ struct timer_list timer; /* timer for std_assign_timeout(). */ + struct irb irb; /* device status */ /* Callback for delivering final status. 
*/ void (*callback)(struct tape_request *, void *); @@ -151,8 +152,8 @@ struct tape_discipline { int (*setup_device)(struct tape_device *); void (*cleanup_device)(struct tape_device *); int (*irq)(struct tape_device *, struct tape_request *, struct irb *); - struct tape_request *(*read_block)(struct tape_device *, size_t); - struct tape_request *(*write_block)(struct tape_device *, size_t); + struct tape_request *(*read_block)(struct tape_device *); + struct tape_request *(*write_block)(struct tape_device *); void (*process_eov)(struct tape_device*); /* ioctl function for additional ioctls. */ int (*ioctl_fn)(struct tape_device *, unsigned int, unsigned long); @@ -172,7 +173,7 @@ struct tape_discipline { /* Char Frontend Data */ struct tape_char_data { - struct idal_buffer *idal_buf; /* idal buffer for user char data */ + struct idal_buffer **ibs; /* idal buffer array for user char data */ int block_size; /* of size block_size. */ }; @@ -234,6 +235,7 @@ struct tape_device { /* Externals from tape_core.c */ extern struct tape_request *tape_alloc_request(int cplength, int datasize); extern void tape_free_request(struct tape_request *); +extern int tape_check_idalbuffer(struct tape_device *device, size_t size); extern int tape_do_io(struct tape_device *, struct tape_request *); extern int tape_do_io_async(struct tape_device *, struct tape_request *); extern int tape_do_io_interruptible(struct tape_device *, struct tape_request *); @@ -347,12 +349,21 @@ tape_ccw_repeat(struct ccw1 *ccw, __u8 cmd_code, int count) } static inline struct ccw1 * +tape_ccw_dc_idal(struct ccw1 *ccw, __u8 cmd_code, struct idal_buffer *idal) +{ + ccw->cmd_code = cmd_code; + ccw->flags = CCW_FLAG_DC; + idal_buffer_set_cda(idal, ccw); + return ccw + 1; +} + +static inline struct ccw1 * tape_ccw_cc_idal(struct ccw1 *ccw, __u8 cmd_code, struct idal_buffer *idal) { ccw->cmd_code = cmd_code; ccw->flags = CCW_FLAG_CC; idal_buffer_set_cda(idal, ccw); - return ccw++; + return ccw + 1; } static inline struct ccw1 * @@ -361,7 +372,7 @@ tape_ccw_end_idal(struct ccw1 *ccw, __u8 cmd_code, struct idal_buffer *idal) ccw->cmd_code = cmd_code; ccw->flags = 0; idal_buffer_set_cda(idal, ccw); - return ccw++; + return ccw + 1; } /* Global vars */ diff --git a/drivers/s390/char/tape_34xx.c b/drivers/s390/char/tape_34xx.c index 1e4984acb648..a13e0ac1a4e2 100644 --- a/drivers/s390/char/tape_34xx.c +++ b/drivers/s390/char/tape_34xx.c @@ -8,8 +8,7 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#define KMSG_COMPONENT "tape_34xx" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "tape_34xx: " fmt #include <linux/export.h> #include <linux/module.h> @@ -234,31 +233,6 @@ tape_34xx_unsolicited_irq(struct tape_device *device, struct irb *irb) return TAPE_IO_SUCCESS; } -/* - * Read Opposite Error Recovery Function: - * Used, when Read Forward does not work - */ -static int -tape_34xx_erp_read_opposite(struct tape_device *device, - struct tape_request *request) -{ - if (request->op == TO_RFO) { - /* - * We did read forward, but the data could not be read - * *correctly*. We transform the request to a read backward - * and try again. - */ - tape_std_read_backward(device, request); - return tape_34xx_erp_retry(request); - } - - /* - * We tried to read forward and backward, but hat no - * success -> failed. 
- */ - return tape_34xx_erp_failed(request, -EIO); -} - static int tape_34xx_erp_bug(struct tape_device *device, struct tape_request *request, struct irb *irb, int no) @@ -440,9 +414,6 @@ tape_34xx_unit_check(struct tape_device *device, struct tape_request *request, dev_warn (&device->cdev->dev, "A write error on the " "tape cannot be recovered\n"); return tape_34xx_erp_failed(request, -EIO); - case 0x26: - /* Data Check (read opposite) occurred. */ - return tape_34xx_erp_read_opposite(device, request); case 0x28: /* ID-Mark at tape start couldn't be written */ dev_warn (&device->cdev->dev, "Writing the ID-mark " diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c index 2a2931d303cb..0d80f43b175d 100644 --- a/drivers/s390/char/tape_3590.c +++ b/drivers/s390/char/tape_3590.c @@ -8,8 +8,7 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#define KMSG_COMPONENT "tape_3590" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "tape_3590: " fmt #include <linux/export.h> #include <linux/module.h> @@ -551,31 +550,6 @@ tape_3590_mtseek(struct tape_device *device, int count) } /* - * Read Opposite Error Recovery Function: - * Used, when Read Forward does not work - */ -static void -tape_3590_read_opposite(struct tape_device *device, - struct tape_request *request) -{ - struct tape_3590_disc_data *data; - - /* - * We have allocated 4 ccws in tape_std_read, so we can now - * transform the request to a read backward, followed by a - * forward space block. - */ - request->op = TO_RBA; - tape_ccw_cc(request->cpaddr, MODE_SET_DB, 1, device->modeset_byte); - data = device->discdata; - tape_ccw_cc_idal(request->cpaddr + 1, data->read_back_op, - device->char_data.idal_buf); - tape_ccw_cc(request->cpaddr + 2, FORSPACEBLOCK, 0, NULL); - tape_ccw_end(request->cpaddr + 3, NOP, 0, NULL); - DBF_EVENT(6, "xrop ccwg\n"); -} - -/* * Read Attention Msg * This should be done after an interrupt with attention bit (0x80) * in device state. @@ -897,60 +871,6 @@ tape_3590_erp_special_interrupt(struct tape_device *device, } /* - * RDA: Read Alternate - */ -static int -tape_3590_erp_read_alternate(struct tape_device *device, - struct tape_request *request, struct irb *irb) -{ - struct tape_3590_disc_data *data; - - /* - * The issued Read Backward or Read Previous command is not - * supported by the device - * The recovery action should be to issue another command: - * Read Revious: if Read Backward is not supported - * Read Backward: if Read Previous is not supported - */ - data = device->discdata; - if (data->read_back_op == READ_PREVIOUS) { - DBF_EVENT(2, "(%08x): No support for READ_PREVIOUS command\n", - device->cdev_id); - data->read_back_op = READ_BACKWARD; - } else { - DBF_EVENT(2, "(%08x): No support for READ_BACKWARD command\n", - device->cdev_id); - data->read_back_op = READ_PREVIOUS; - } - tape_3590_read_opposite(device, request); - return tape_3590_erp_retry(device, request, irb); -} - -/* - * Error Recovery read opposite - */ -static int -tape_3590_erp_read_opposite(struct tape_device *device, - struct tape_request *request, struct irb *irb) -{ - switch (request->op) { - case TO_RFO: - /* - * We did read forward, but the data could not be read. - * We will read backward and then skip forward again. 
- */ - tape_3590_read_opposite(device, request); - return tape_3590_erp_retry(device, request, irb); - case TO_RBA: - /* We tried to read forward and backward, but hat no success */ - return tape_3590_erp_failed(device, request, irb, -EIO); - break; - default: - return tape_3590_erp_failed(device, request, irb, -EIO); - } -} - -/* * Print an MIM (Media Information Message) (message code f0) */ static void @@ -1348,10 +1268,6 @@ tape_3590_unit_check(struct tape_device *device, struct tape_request *request, tape_3590_print_era_msg(device, irb); return tape_3590_erp_read_buf_log(device, request, irb); - case 0x2011: - tape_3590_print_era_msg(device, irb); - return tape_3590_erp_read_alternate(device, request, irb); - case 0x2230: case 0x2231: tape_3590_print_era_msg(device, irb); @@ -1405,12 +1321,6 @@ tape_3590_unit_check(struct tape_device *device, struct tape_request *request, tape_3590_print_era_msg(device, irb); return tape_3590_erp_swap(device, request, irb); } - if (sense->rac == 0x26) { - /* Read Opposite */ - tape_3590_print_era_msg(device, irb); - return tape_3590_erp_read_opposite(device, request, - irb); - } return tape_3590_erp_basic(device, request, irb, -EIO); case 0x5020: case 0x5021: diff --git a/drivers/s390/char/tape_char.c b/drivers/s390/char/tape_char.c index 89778d922d9f..c5d3c303c15c 100644 --- a/drivers/s390/char/tape_char.c +++ b/drivers/s390/char/tape_char.c @@ -10,14 +10,12 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#define KMSG_COMPONENT "tape" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "tape: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/proc_fs.h> #include <linux/mtio.h> -#include <linux/compat.h> #include <linux/uaccess.h> @@ -37,9 +35,6 @@ static ssize_t tapechar_write(struct file *, const char __user *, size_t, loff_t static int tapechar_open(struct inode *,struct file *); static int tapechar_release(struct inode *,struct file *); static long tapechar_ioctl(struct file *, unsigned int, unsigned long); -#ifdef CONFIG_COMPAT -static long tapechar_compat_ioctl(struct file *, unsigned int, unsigned long); -#endif static const struct file_operations tape_fops = { @@ -47,9 +42,6 @@ static const struct file_operations tape_fops = .read = tapechar_read, .write = tapechar_write, .unlocked_ioctl = tapechar_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = tapechar_compat_ioctl, -#endif .open = tapechar_open, .release = tapechar_release, }; @@ -64,7 +56,7 @@ tapechar_setup_device(struct tape_device * device) { char device_name[20]; - sprintf(device_name, "ntibm%i", device->first_minor / 2); + scnprintf(device_name, sizeof(device_name), "ntibm%i", device->first_minor / 2); device->nt = register_tape_dev( &device->cdev->dev, MKDEV(tapechar_major, device->first_minor), @@ -93,33 +85,6 @@ tapechar_cleanup_device(struct tape_device *device) device->nt = NULL; } -static int -tapechar_check_idalbuffer(struct tape_device *device, size_t block_size) -{ - struct idal_buffer *new; - - if (device->char_data.idal_buf != NULL && - device->char_data.idal_buf->size == block_size) - return 0; - - if (block_size > MAX_BLOCKSIZE) { - DBF_EVENT(3, "Invalid blocksize (%zd > %d)\n", - block_size, MAX_BLOCKSIZE); - return -EINVAL; - } - - /* The current idal buffer is not correct. Allocate a new one. 
*/ - new = idal_buffer_alloc(block_size, 0); - if (IS_ERR(new)) - return -ENOMEM; - - if (device->char_data.idal_buf != NULL) - idal_buffer_free(device->char_data.idal_buf); - - device->char_data.idal_buf = new; - - return 0; -} /* * Tape device read function @@ -127,9 +92,12 @@ tapechar_check_idalbuffer(struct tape_device *device, size_t block_size) static ssize_t tapechar_read(struct file *filp, char __user *data, size_t count, loff_t *ppos) { - struct tape_device *device; struct tape_request *request; + struct ccw1 *ccw, *last_ccw; + struct tape_device *device; + struct idal_buffer **ibs; size_t block_size; + size_t read = 0; int rc; DBF_EVENT(6, "TCHAR:read\n"); @@ -156,24 +124,37 @@ tapechar_read(struct file *filp, char __user *data, size_t count, loff_t *ppos) block_size = count; } - rc = tapechar_check_idalbuffer(device, block_size); + rc = tape_check_idalbuffer(device, block_size); if (rc) return rc; DBF_EVENT(6, "TCHAR:nbytes: %lx\n", block_size); /* Let the discipline build the ccw chain. */ - request = device->discipline->read_block(device, block_size); + request = device->discipline->read_block(device); if (IS_ERR(request)) return PTR_ERR(request); /* Execute it. */ rc = tape_do_io(device, request); if (rc == 0) { - rc = block_size - request->rescnt; DBF_EVENT(6, "TCHAR:rbytes: %x\n", rc); - /* Copy data from idal buffer to user space. */ - if (idal_buffer_to_user(device->char_data.idal_buf, - data, rc) != 0) - rc = -EFAULT; + /* Channel Program Address (cpa) points to last CCW + 8 */ + last_ccw = dma32_to_virt(request->irb.scsw.cmd.cpa); + ccw = request->cpaddr; + ibs = device->char_data.ibs; + while (++ccw < last_ccw) { + /* Copy data from idal buffer to user space. */ + if (idal_buffer_to_user(*ibs++, data, ccw->count) != 0) { + rc = -EFAULT; + break; + } + read += ccw->count; + data += ccw->count; + } + if (&last_ccw[-1] == &request->cpaddr[1] && + request->rescnt == last_ccw[-1].count) + rc = 0; + else + rc = read - request->rescnt; } tape_free_request(request); return rc; @@ -185,10 +166,12 @@ tapechar_read(struct file *filp, char __user *data, size_t count, loff_t *ppos) static ssize_t tapechar_write(struct file *filp, const char __user *data, size_t count, loff_t *ppos) { - struct tape_device *device; struct tape_request *request; + struct ccw1 *ccw, *last_ccw; + struct tape_device *device; + struct idal_buffer **ibs; + size_t written = 0; size_t block_size; - size_t written; int nblocks; int i, rc; @@ -208,35 +191,45 @@ tapechar_write(struct file *filp, const char __user *data, size_t count, loff_t nblocks = 1; } - rc = tapechar_check_idalbuffer(device, block_size); + rc = tape_check_idalbuffer(device, block_size); if (rc) return rc; - DBF_EVENT(6,"TCHAR:nbytes: %lx\n", block_size); + DBF_EVENT(6, "TCHAR:nbytes: %lx\n", block_size); DBF_EVENT(6, "TCHAR:nblocks: %x\n", nblocks); /* Let the discipline build the ccw chain. */ - request = device->discipline->write_block(device, block_size); + request = device->discipline->write_block(device); if (IS_ERR(request)) return PTR_ERR(request); - rc = 0; - written = 0; + for (i = 0; i < nblocks; i++) { - /* Copy data from user space to idal buffer. 
*/ - if (idal_buffer_from_user(device->char_data.idal_buf, - data, block_size)) { - rc = -EFAULT; - break; + size_t wbytes = 0; /* Used to trace written data in dbf */ + + ibs = device->char_data.ibs; + while (ibs && *ibs) { + if (idal_buffer_from_user(*ibs, data, (*ibs)->size)) { + rc = -EFAULT; + goto out; + } + data += (*ibs)->size; + ibs++; } rc = tape_do_io(device, request); if (rc) - break; - DBF_EVENT(6, "TCHAR:wbytes: %lx\n", - block_size - request->rescnt); - written += block_size - request->rescnt; + goto out; + + /* Channel Program Address (cpa) points to last CCW + 8 */ + last_ccw = dma32_to_virt(request->irb.scsw.cmd.cpa); + ccw = request->cpaddr; + while (++ccw < last_ccw) + wbytes += ccw->count; + DBF_EVENT(6, "TCHAR:wbytes: %lx\n", wbytes - request->rescnt); + written += wbytes - request->rescnt; if (request->rescnt != 0) break; - data += block_size; } + +out: tape_free_request(request); if (rc == -ENOSPC) { /* @@ -324,10 +317,8 @@ tapechar_release(struct inode *inode, struct file *filp) } } - if (device->char_data.idal_buf != NULL) { - idal_buffer_free(device->char_data.idal_buf); - device->char_data.idal_buf = NULL; - } + if (device->char_data.ibs) + idal_buffer_array_free(&device->char_data.ibs); tape_release(device); filp->private_data = NULL; tape_put_device(device); @@ -442,25 +433,6 @@ tapechar_ioctl(struct file *filp, unsigned int no, unsigned long data) return rc; } -#ifdef CONFIG_COMPAT -static long -tapechar_compat_ioctl(struct file *filp, unsigned int no, unsigned long data) -{ - struct tape_device *device = filp->private_data; - long rc; - - if (no == MTIOCPOS32) - no = MTIOCPOS; - else if (no == MTIOCGET32) - no = MTIOCGET; - - mutex_lock(&device->mutex); - rc = __tapechar_ioctl(device, no, compat_ptr(data)); - mutex_unlock(&device->mutex); - return rc; -} -#endif /* CONFIG_COMPAT */ - /* * Initialize character device frontend. */ diff --git a/drivers/s390/char/tape_class.c b/drivers/s390/char/tape_class.c index fb18adfb95b5..6fa7b7824856 100644 --- a/drivers/s390/char/tape_class.c +++ b/drivers/s390/char/tape_class.c @@ -8,8 +8,7 @@ * Based on simple class device code by Greg K-H */ -#define KMSG_COMPONENT "tape" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "tape: " fmt #include <linux/export.h> #include <linux/slab.h> diff --git a/drivers/s390/char/tape_core.c b/drivers/s390/char/tape_core.c index 6ec812280221..0250076a7d9f 100644 --- a/drivers/s390/char/tape_core.c +++ b/drivers/s390/char/tape_core.c @@ -11,8 +11,7 @@ * Stefan Bader <shbader@de.ibm.com> */ -#define KMSG_COMPONENT "tape" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "tape: " fmt #include <linux/export.h> #include <linux/module.h> @@ -726,6 +725,36 @@ tape_free_request (struct tape_request * request) kfree(request); } +int +tape_check_idalbuffer(struct tape_device *device, size_t size) +{ + struct idal_buffer **new; + size_t old_size = 0; + + old_size = idal_buffer_array_datasize(device->char_data.ibs); + if (old_size == size) + return 0; + + if (size > MAX_BLOCKSIZE) { + DBF_EVENT(3, "Invalid blocksize (%zd > %d)\n", + size, MAX_BLOCKSIZE); + return -EINVAL; + } + + /* The current idal buffer is not correct. Allocate a new one. 
*/ + new = idal_buffer_array_alloc(size, 0); + if (IS_ERR(new)) + return -ENOMEM; + + /* Free old idal buffer array */ + if (device->char_data.ibs) + idal_buffer_array_free(&device->char_data.ibs); + + device->char_data.ibs = new; + + return 0; +} + static int __tape_start_io(struct tape_device *device, struct tape_request *request) { @@ -1099,9 +1128,10 @@ __tape_do_irq (struct ccw_device *cdev, unsigned long intparm, struct irb *irb) } /* May be an unsolicited irq */ - if(request != NULL) + if (request != NULL) { request->rescnt = irb->scsw.cmd.count; - else if ((irb->scsw.cmd.dstat == 0x85 || irb->scsw.cmd.dstat == 0x80) && + memcpy(&request->irb, irb, sizeof(*irb)); + } else if ((irb->scsw.cmd.dstat == 0x85 || irb->scsw.cmd.dstat == 0x80) && !list_empty(&device->req_queue)) { /* Not Ready to Ready after long busy ? */ struct tape_request *req; diff --git a/drivers/s390/char/tape_proc.c b/drivers/s390/char/tape_proc.c index 2238d9df6c47..a1e5fab12af2 100644 --- a/drivers/s390/char/tape_proc.c +++ b/drivers/s390/char/tape_proc.c @@ -11,8 +11,7 @@ * PROCFS Functions */ -#define KMSG_COMPONENT "tape" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "tape: " fmt #include <linux/module.h> #include <linux/vmalloc.h> diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c index 176ae8e2eb6b..43a5586685ff 100644 --- a/drivers/s390/char/tape_std.c +++ b/drivers/s390/char/tape_std.c @@ -11,8 +11,7 @@ * Stefan Bader <shbader@de.ibm.com> */ -#define KMSG_COMPONENT "tape" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "tape: " fmt #include <linux/export.h> #include <linux/stddef.h> @@ -212,7 +211,7 @@ tape_std_mtload(struct tape_device *device, int count) int tape_std_mtsetblk(struct tape_device *device, int count) { - struct idal_buffer *new; + int rc; DBF_LH(6, "tape_std_mtsetblk(%d)\n", count); if (count <= 0) { @@ -224,26 +223,12 @@ tape_std_mtsetblk(struct tape_device *device, int count) device->char_data.block_size = 0; return 0; } - if (device->char_data.idal_buf != NULL && - device->char_data.idal_buf->size == count) - /* We already have a idal buffer of that size. */ - return 0; - if (count > MAX_BLOCKSIZE) { - DBF_EVENT(3, "Invalid block size (%d > %d) given.\n", - count, MAX_BLOCKSIZE); - return -EINVAL; - } + rc = tape_check_idalbuffer(device, count); + if (rc) + return rc; - /* Allocate a new idal buffer. */ - new = idal_buffer_alloc(count, 0); - if (IS_ERR(new)) - return -ENOMEM; - if (device->char_data.idal_buf != NULL) - idal_buffer_free(device->char_data.idal_buf); - device->char_data.idal_buf = new; device->char_data.block_size = count; - DBF_LH(6, "new blocksize is %d\n", device->char_data.block_size); return 0; @@ -641,63 +626,54 @@ tape_std_mtcompression(struct tape_device *device, int mt_count) * Read Block */ struct tape_request * -tape_std_read_block(struct tape_device *device, size_t count) +tape_std_read_block(struct tape_device *device) { struct tape_request *request; + struct idal_buffer **ibs; + struct ccw1 *ccw; + size_t count; - /* - * We have to alloc 4 ccws in order to be able to transform request - * into a read backward request in error case. 
- */ - request = tape_alloc_request(4, 0); + ibs = device->char_data.ibs; + count = idal_buffer_array_size(ibs); + request = tape_alloc_request(count + 1 /* MODE_SET_DB */, 0); if (IS_ERR(request)) { DBF_EXCEPTION(6, "xrbl fail"); return request; } request->op = TO_RFO; - tape_ccw_cc(request->cpaddr, MODE_SET_DB, 1, device->modeset_byte); - tape_ccw_end_idal(request->cpaddr + 1, READ_FORWARD, - device->char_data.idal_buf); + ccw = tape_ccw_cc(request->cpaddr, MODE_SET_DB, 1, device->modeset_byte); + while (count-- > 1) + ccw = tape_ccw_dc_idal(ccw, READ_FORWARD, *ibs++); + tape_ccw_end_idal(ccw, READ_FORWARD, *ibs); + DBF_EVENT(6, "xrbl ccwg\n"); return request; } /* - * Read Block backward transformation function. - */ -void -tape_std_read_backward(struct tape_device *device, struct tape_request *request) -{ - /* - * We have allocated 4 ccws in tape_std_read, so we can now - * transform the request to a read backward, followed by a - * forward space block. - */ - request->op = TO_RBA; - tape_ccw_cc(request->cpaddr, MODE_SET_DB, 1, device->modeset_byte); - tape_ccw_cc_idal(request->cpaddr + 1, READ_BACKWARD, - device->char_data.idal_buf); - tape_ccw_cc(request->cpaddr + 2, FORSPACEBLOCK, 0, NULL); - tape_ccw_end(request->cpaddr + 3, NOP, 0, NULL); - DBF_EVENT(6, "xrop ccwg");} - -/* * Write Block */ struct tape_request * -tape_std_write_block(struct tape_device *device, size_t count) +tape_std_write_block(struct tape_device *device) { struct tape_request *request; + struct idal_buffer **ibs; + struct ccw1 *ccw; + size_t count; - request = tape_alloc_request(2, 0); + count = idal_buffer_array_size(device->char_data.ibs); + request = tape_alloc_request(count + 1 /* MODE_SET_DB */, 0); if (IS_ERR(request)) { DBF_EXCEPTION(6, "xwbl fail\n"); return request; } request->op = TO_WRI; - tape_ccw_cc(request->cpaddr, MODE_SET_DB, 1, device->modeset_byte); - tape_ccw_end_idal(request->cpaddr + 1, WRITE_CMD, - device->char_data.idal_buf); + ccw = tape_ccw_cc(request->cpaddr, MODE_SET_DB, 1, device->modeset_byte); + ibs = device->char_data.ibs; + while (count-- > 1) + ccw = tape_ccw_dc_idal(ccw, WRITE_CMD, *ibs++); + tape_ccw_end_idal(ccw, WRITE_CMD, *ibs); + DBF_EVENT(6, "xwbl ccwg\n"); return request; } @@ -741,6 +717,5 @@ EXPORT_SYMBOL(tape_std_mterase); EXPORT_SYMBOL(tape_std_mtunload); EXPORT_SYMBOL(tape_std_mtcompression); EXPORT_SYMBOL(tape_std_read_block); -EXPORT_SYMBOL(tape_std_read_backward); EXPORT_SYMBOL(tape_std_write_block); EXPORT_SYMBOL(tape_std_process_eov); diff --git a/drivers/s390/char/tape_std.h b/drivers/s390/char/tape_std.h index dcc63ff587f9..2cf9f725b3b3 100644 --- a/drivers/s390/char/tape_std.h +++ b/drivers/s390/char/tape_std.h @@ -14,10 +14,9 @@ #include <asm/tape390.h> /* - * Biggest block size to handle. Currently 64K because we only build - * channel programs without data chaining. + * Biggest block size of 256K to handle. */ -#define MAX_BLOCKSIZE 65535 +#define MAX_BLOCKSIZE 262144 /* * The CCW commands for the Tape type of command. @@ -97,10 +96,10 @@ #define SENSE_TAPE_POSITIONING 0x01 /* discipline functions */ -struct tape_request *tape_std_read_block(struct tape_device *, size_t); +struct tape_request *tape_std_read_block(struct tape_device *); void tape_std_read_backward(struct tape_device *device, struct tape_request *request); -struct tape_request *tape_std_write_block(struct tape_device *, size_t); +struct tape_request *tape_std_write_block(struct tape_device *); /* Some non-mtop commands. 
*/ int tape_std_assign(struct tape_device *); diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 69899bb86b3e..bde6c9e59166 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -14,7 +14,6 @@ #include <linux/fs.h> #include <linux/init.h> -#include <linux/compat.h> #include <linux/kernel.h> #include <linux/miscdevice.h> #include <linux/slab.h> @@ -204,10 +203,7 @@ static long vmcp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) int __user *argp; session = file->private_data; - if (is_compat_task()) - argp = compat_ptr(arg); - else - argp = (int __user *)arg; + argp = (int __user *)arg; if (mutex_lock_interruptible(&session->mutex)) return -ERESTARTSYS; switch (cmd) { @@ -241,7 +237,6 @@ static const struct file_operations vmcp_fops = { .read = vmcp_read, .write = vmcp_write, .unlocked_ioctl = vmcp_ioctl, - .compat_ioctl = vmcp_ioctl, }; static struct miscdevice vmcp_dev = { diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index e284eea331d7..383e7e2bd69f 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -11,8 +11,7 @@ * */ -#define KMSG_COMPONENT "vmlogrdr" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "vmlogrdr: " fmt #include <linux/module.h> #include <linux/init.h> diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index 0fd918769a4b..e3e0e9f36527 100644 --- a/drivers/s390/char/vmur.c +++ b/drivers/s390/char/vmur.c @@ -9,8 +9,7 @@ * Frank Munzert <munzert@de.ibm.com> */ -#define KMSG_COMPONENT "vmur" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "vmur: " fmt #include <linux/cdev.h> #include <linux/slab.h> diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 33cebb91b933..b26b5fca6ce8 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -9,8 +9,7 @@ * Author(s): Michael Holzheu */ -#define KMSG_COMPONENT "zdump" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zdump: " fmt #include <linux/init.h> #include <linux/slab.h> diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index 93695d535380..738d5e2d5304 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -8,8 +8,7 @@ * Arnd Bergmann (arndb@de.ibm.com) */ -#define KMSG_COMPONENT "cio" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cio: " fmt #include <linux/init.h> #include <linux/vmalloc.h> diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index 2fc2ea4b2e3b..185c99c5d4cc 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -41,7 +41,7 @@ static void __ccwgroup_remove_symlinks(struct ccwgroup_device *gdev) char str[16]; for (i = 0; i < gdev->count; i++) { - sprintf(str, "cdev%d", i); + scnprintf(str, sizeof(str), "cdev%d", i); sysfs_remove_link(&gdev->dev.kobj, str); sysfs_remove_link(&gdev->cdev[i]->dev.kobj, "group_device"); } @@ -249,12 +249,12 @@ static int __ccwgroup_create_symlinks(struct ccwgroup_device *gdev) } } for (i = 0; i < gdev->count; i++) { - sprintf(str, "cdev%d", i); + scnprintf(str, sizeof(str), "cdev%d", i); rc = sysfs_create_link(&gdev->dev.kobj, &gdev->cdev[i]->dev.kobj, str); if (rc) { while (i--) { - sprintf(str, "cdev%d", i); + scnprintf(str, sizeof(str), "cdev%d", i); sysfs_remove_link(&gdev->dev.kobj, str); } for (i = 0; i < gdev->count; i++) diff --git a/drivers/s390/cio/ccwreq.c b/drivers/s390/cio/ccwreq.c index 73582a0a2622..763f477cc431 100644 --- a/drivers/s390/cio/ccwreq.c +++ 
b/drivers/s390/cio/ccwreq.c @@ -6,8 +6,7 @@ * Author(s): Peter Oberparleiter <peter.oberparleiter@de.ibm.com> */ -#define KMSG_COMPONENT "cio" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cio: " fmt #include <linux/types.h> #include <linux/err.h> diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index caa300160b17..c10e2444507e 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -111,8 +111,9 @@ static int s390_vary_chpid(struct chp_id chpid, int on) char dbf_text[15]; int status; - sprintf(dbf_text, on?"varyon%x.%02x":"varyoff%x.%02x", chpid.cssid, - chpid.id); + scnprintf(dbf_text, sizeof(dbf_text), + on ? "varyon%x.%02x" : "varyoff%x.%02x", + chpid.cssid, chpid.id); CIO_TRACE_EVENT(2, dbf_text); status = chp_get_status(chpid); diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 239c92d4ec11..fbb58edd6274 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -8,8 +8,7 @@ * Arnd Bergmann (arndb@de.ibm.com) */ -#define KMSG_COMPONENT "cio" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cio: " fmt #include <linux/export.h> #include <linux/module.h> @@ -253,7 +252,7 @@ void chsc_chp_offline(struct chp_id chpid) struct chp_link link; char dbf_txt[15]; - sprintf(dbf_txt, "chpr%x.%02x", chpid.cssid, chpid.id); + scnprintf(dbf_txt, sizeof(dbf_txt), "chpr%x.%02x", chpid.cssid, chpid.id); CIO_TRACE_EVENT(2, dbf_txt); if (chp_get_status(chpid) <= 0) @@ -284,11 +283,11 @@ static void s390_process_res_acc(struct chp_link *link) { char dbf_txt[15]; - sprintf(dbf_txt, "accpr%x.%02x", link->chpid.cssid, - link->chpid.id); + scnprintf(dbf_txt, sizeof(dbf_txt), "accpr%x.%02x", link->chpid.cssid, + link->chpid.id); CIO_TRACE_EVENT( 2, dbf_txt); if (link->fla != 0) { - sprintf(dbf_txt, "fla%x", link->fla); + scnprintf(dbf_txt, sizeof(dbf_txt), "fla%x", link->fla); CIO_TRACE_EVENT( 2, dbf_txt); } /* Wait until previous actions have settled. 
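The many KMSG_COMPONENT removals in this series all follow one pattern: pr_fmt() is defined to the literal prefix so every pr_*() call in the file is tagged automatically. The define must come before the first include that pulls in printk.h, which is why it sits at the very top of each file. A minimal sketch, with an illustrative message:

    /* Must precede any include that drags in <linux/printk.h>. */
    #define pr_fmt(fmt) "cio: " fmt

    #include <linux/printk.h>

    static void report(void)
    {
            pr_warn("subchannel not operational\n");
            /* logs: "cio: subchannel not operational" */
    }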
*/ @@ -757,7 +756,7 @@ void chsc_chp_online(struct chp_id chpid) struct chp_link link; char dbf_txt[15]; - sprintf(dbf_txt, "cadd%x.%02x", chpid.cssid, chpid.id); + scnprintf(dbf_txt, sizeof(dbf_txt), "cadd%x.%02x", chpid.cssid, chpid.id); CIO_TRACE_EVENT(2, dbf_txt); if (chp_get_status(chpid) != 0) { diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c index 1e58ee3cc87d..ce992b2107cb 100644 --- a/drivers/s390/cio/chsc_sch.c +++ b/drivers/s390/cio/chsc_sch.c @@ -9,7 +9,6 @@ */ #include <linux/slab.h> -#include <linux/compat.h> #include <linux/device.h> #include <linux/io.h> #include <linux/module.h> @@ -845,10 +844,7 @@ static long chsc_ioctl(struct file *filp, unsigned int cmd, void __user *argp; CHSC_MSG(2, "chsc_ioctl called, cmd=%x\n", cmd); - if (is_compat_task()) - argp = compat_ptr(arg); - else - argp = (void __user *)arg; + argp = (void __user *)arg; switch (cmd) { case CHSC_START: return chsc_ioctl_start(argp); @@ -923,7 +919,6 @@ static const struct file_operations chsc_fops = { .open = chsc_open, .release = chsc_release, .unlocked_ioctl = chsc_ioctl, - .compat_ioctl = chsc_ioctl, }; static struct miscdevice chsc_misc_device = { diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 21508e4606d5..70dc8cc76594 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -9,8 +9,7 @@ * Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#define KMSG_COMPONENT "cio" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cio: " fmt #include <linux/export.h> #include <linux/ftrace.h> @@ -113,7 +112,7 @@ cio_start_handle_notoper(struct subchannel *sch, __u8 lpm) if (cio_update_schib(sch)) return -ENODEV; - sprintf(dbf_text, "no%s", dev_name(&sch->dev)); + scnprintf(dbf_text, sizeof(dbf_text), "no%s", dev_name(&sch->dev)); CIO_TRACE_EVENT(0, dbf_text); CIO_HEX_EVENT(0, &sch->schib, sizeof (struct schib)); diff --git a/drivers/s390/cio/cio_inject.c b/drivers/s390/cio/cio_inject.c index a2e771ebae8e..0e18cb921ef6 100644 --- a/drivers/s390/cio/cio_inject.c +++ b/drivers/s390/cio/cio_inject.c @@ -6,8 +6,7 @@ * Author(s): Vineeth Vijayan <vneethv@linux.ibm.com> */ -#define KMSG_COMPONENT "cio" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cio: " fmt #include <linux/slab.h> #include <linux/spinlock.h> diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index b7048f2b036e..7d035e4937ce 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -10,8 +10,7 @@ * original idea from Natarajan Krishnaswami <nkrishna@us.ibm.com> */ -#define KMSG_COMPONENT "cio" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cio: " fmt #include <linux/memblock.h> #include <linux/device.h> diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index be78a57f9bfd..4c85df7a548e 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -8,8 +8,7 @@ * Cornelia Huck (cornelia.huck@de.ibm.com) */ -#define KMSG_COMPONENT "cio" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cio: " fmt #include <linux/export.h> #include <linux/init.h> diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 4b2dae6eb376..602f36102c7c 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -8,8 +8,7 @@ * Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#define KMSG_COMPONENT "cio" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cio: " fmt #include <linux/export.h> #include <linux/init.h> diff --git a/drivers/s390/cio/device_status.c 
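The vmcp and chsc_sch hunks drop the is_compat_task()/compat_ptr() split and the .compat_ioctl registration: once no 31-bit compat tasks can exist, the raw ioctl argument is already a native user pointer. A sketch of the resulting handler shape (the handler name and payload are hypothetical):

    #include <linux/fs.h>
    #include <linux/uaccess.h>

    static long demo_ioctl(struct file *filp, unsigned int cmd,
                           unsigned long arg)
    {
            int __user *argp = (int __user *)arg;   /* no compat_ptr() */
            int val;

            if (get_user(val, argp))
                    return -EFAULT;
            return 0;
    }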
b/drivers/s390/cio/device_status.c index 0ff8482a7b15..f4096373c8c0 100644 --- a/drivers/s390/cio/device_status.c +++ b/drivers/s390/cio/device_status.c @@ -42,7 +42,7 @@ ccw_device_msg_control_check(struct ccw_device *cdev, struct irb *irb) cdev->private->dev_id.devno, sch->schid.ssid, sch->schid.sch_no, scsw_dstat(&irb->scsw), scsw_cstat(&irb->scsw)); - sprintf(dbf_text, "chk%x", sch->schid.sch_no); + scnprintf(dbf_text, sizeof(dbf_text), "chk%x", sch->schid.sch_no); CIO_TRACE_EVENT(0, dbf_text); CIO_HEX_EVENT(0, irb, sizeof(struct irb)); } diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 65f1a127cc3f..a445494fd2be 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -11,8 +11,7 @@ * Adjunct processor bus. */ -#define KMSG_COMPONENT "ap" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "ap: " fmt #include <linux/kernel_stat.h> #include <linux/moduleparam.h> @@ -86,8 +85,17 @@ DEFINE_SPINLOCK(ap_queues_lock); /* Default permissions (ioctl, card and domain masking) */ struct ap_perms ap_perms; EXPORT_SYMBOL(ap_perms); -DEFINE_MUTEX(ap_perms_mutex); -EXPORT_SYMBOL(ap_perms_mutex); +/* true if apmask and/or aqmask are NOT default */ +bool ap_apmask_aqmask_in_use; +/* counter for how many driver_overrides are currently active */ +int ap_driver_override_ctr; +/* + * Mutex for consistent read and write of the ap_perms struct, + * ap_apmask_aqmask_in_use, ap_driver_override_ctr + * and the ap bus sysfs attributes apmask and aqmask. + */ +DEFINE_MUTEX(ap_attr_mutex); +EXPORT_SYMBOL(ap_attr_mutex); /* # of bindings complete since init */ static atomic64_t ap_bindings_complete_count = ATOMIC64_INIT(0); @@ -853,20 +861,38 @@ static int __ap_revise_reserved(struct device *dev, void *dummy) int rc, card, queue, devres, drvres; if (is_queue_dev(dev)) { - card = AP_QID_CARD(to_ap_queue(dev)->qid); - queue = AP_QID_QUEUE(to_ap_queue(dev)->qid); - mutex_lock(&ap_perms_mutex); - devres = test_bit_inv(card, ap_perms.apm) && - test_bit_inv(queue, ap_perms.aqm); - mutex_unlock(&ap_perms_mutex); - drvres = to_ap_drv(dev->driver)->flags - & AP_DRIVER_FLAG_DEFAULT; - if (!!devres != !!drvres) { - pr_debug("reprobing queue=%02x.%04x\n", card, queue); - rc = device_reprobe(dev); - if (rc) - AP_DBF_WARN("%s reprobing queue=%02x.%04x failed\n", - __func__, card, queue); + struct ap_driver *ap_drv = to_ap_drv(dev->driver); + struct ap_queue *aq = to_ap_queue(dev); + struct ap_device *ap_dev = &aq->ap_dev; + + card = AP_QID_CARD(aq->qid); + queue = AP_QID_QUEUE(aq->qid); + + if (ap_dev->driver_override) { + if (strcmp(ap_dev->driver_override, + ap_drv->driver.name)) { + pr_debug("reprobing queue=%02x.%04x\n", card, queue); + rc = device_reprobe(dev); + if (rc) { + AP_DBF_WARN("%s reprobing queue=%02x.%04x failed\n", + __func__, card, queue); + } + } + } else { + mutex_lock(&ap_attr_mutex); + devres = test_bit_inv(card, ap_perms.apm) && + test_bit_inv(queue, ap_perms.aqm); + mutex_unlock(&ap_attr_mutex); + drvres = to_ap_drv(dev->driver)->flags + & AP_DRIVER_FLAG_DEFAULT; + if (!!devres != !!drvres) { + pr_debug("reprobing queue=%02x.%04x\n", card, queue); + rc = device_reprobe(dev); + if (rc) { + AP_DBF_WARN("%s reprobing queue=%02x.%04x failed\n", + __func__, card, queue); + } + } } } @@ -884,22 +910,37 @@ static void ap_bus_revise_bindings(void) * @card: the APID of the adapter card to check * @queue: the APQI of the queue to check * - * Note: the ap_perms_mutex must be locked by the caller of this function. 
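The reworked __ap_revise_reserved() gives an explicit driver_override precedence over the apmask/aqmask reservation: with an override set, only a mismatch against the bound driver's name forces a reprobe; without one, the old reserved-versus-default-driver comparison still decides. The decision condensed into one helper (the helper itself is illustrative, not in the patch):

    #include <linux/string.h>
    #include <linux/types.h>

    static bool needs_reprobe(const char *override, const char *bound_driver,
                              bool dev_reserved, bool drv_is_default)
    {
            if (override)
                    return strcmp(override, bound_driver) != 0;
            return dev_reserved != drv_is_default;
    }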
+ * Note: the ap_attr_mutex must be locked by the caller of this function. * * Return: an int specifying whether the AP adapter is reserved for the host (1) * or not (0). */ int ap_owned_by_def_drv(int card, int queue) { + struct ap_queue *aq; int rc = 0; if (card < 0 || card >= AP_DEVICES || queue < 0 || queue >= AP_DOMAINS) return -EINVAL; + aq = ap_get_qdev(AP_MKQID(card, queue)); + if (aq) { + const struct device_driver *drv = aq->ap_dev.device.driver; + const struct ap_driver *ap_drv = to_ap_drv(drv); + bool override = !!aq->ap_dev.driver_override; + + if (override && drv && ap_drv->flags & AP_DRIVER_FLAG_DEFAULT) + rc = 1; + put_device(&aq->ap_dev.device); + if (override) + goto out; + } + if (test_bit_inv(card, ap_perms.apm) && test_bit_inv(queue, ap_perms.aqm)) rc = 1; +out: return rc; } EXPORT_SYMBOL(ap_owned_by_def_drv); @@ -911,7 +952,7 @@ EXPORT_SYMBOL(ap_owned_by_def_drv); * @apm: a bitmap specifying a set of APIDs comprising the APQNs to check * @aqm: a bitmap specifying a set of APQIs comprising the APQNs to check * - * Note: the ap_perms_mutex must be locked by the caller of this function. + * Note: the ap_attr_mutex must be locked by the caller of this function. * * Return: an int specifying whether each APQN is reserved for the host (1) or * not (0) @@ -922,12 +963,10 @@ int ap_apqn_in_matrix_owned_by_def_drv(unsigned long *apm, int card, queue, rc = 0; for (card = 0; !rc && card < AP_DEVICES; card++) - if (test_bit_inv(card, apm) && - test_bit_inv(card, ap_perms.apm)) + if (test_bit_inv(card, apm)) for (queue = 0; !rc && queue < AP_DOMAINS; queue++) - if (test_bit_inv(queue, aqm) && - test_bit_inv(queue, ap_perms.aqm)) - rc = 1; + if (test_bit_inv(queue, aqm)) + rc = ap_owned_by_def_drv(card, queue); return rc; } @@ -951,13 +990,19 @@ static int ap_device_probe(struct device *dev) */ card = AP_QID_CARD(to_ap_queue(dev)->qid); queue = AP_QID_QUEUE(to_ap_queue(dev)->qid); - mutex_lock(&ap_perms_mutex); - devres = test_bit_inv(card, ap_perms.apm) && - test_bit_inv(queue, ap_perms.aqm); - mutex_unlock(&ap_perms_mutex); - drvres = ap_drv->flags & AP_DRIVER_FLAG_DEFAULT; - if (!!devres != !!drvres) - goto out; + if (ap_dev->driver_override) { + if (strcmp(ap_dev->driver_override, + ap_drv->driver.name)) + goto out; + } else { + mutex_lock(&ap_attr_mutex); + devres = test_bit_inv(card, ap_perms.apm) && + test_bit_inv(queue, ap_perms.aqm); + mutex_unlock(&ap_attr_mutex); + drvres = ap_drv->flags & AP_DRIVER_FLAG_DEFAULT; + if (!!devres != !!drvres) + goto out; + } } /* @@ -983,8 +1028,17 @@ static int ap_device_probe(struct device *dev) } out: - if (rc) + if (rc) { put_device(dev); + } else { + if (is_queue_dev(dev)) { + pr_debug("queue=%02x.%04x new driver=%s\n", + card, queue, ap_drv->driver.name); + } else { + pr_debug("card=%02x new driver=%s\n", + to_ap_card(dev)->id, ap_drv->driver.name); + } + } return rc; } @@ -1437,12 +1491,12 @@ static ssize_t apmask_show(const struct bus_type *bus, char *buf) { int rc; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; rc = sysfs_emit(buf, "0x%016lx%016lx%016lx%016lx\n", ap_perms.apm[0], ap_perms.apm[1], ap_perms.apm[2], ap_perms.apm[3]); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return rc; } @@ -1452,6 +1506,7 @@ static int __verify_card_reservations(struct device_driver *drv, void *data) int rc = 0; struct ap_driver *ap_drv = to_ap_drv(drv); unsigned long *newapm = (unsigned long *)data; + unsigned long aqm_any[BITS_TO_LONGS(AP_DOMAINS)]; /* * 
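ap_owned_by_def_drv() now consults the live queue device so a per-queue driver_override is honored, and ap_get_qdev() hands back a device reference that must be dropped before returning the verdict. The get/put skeleton, pulled out of the hunk into an illustrative helper (ap_get_qdev() and AP_MKQID() come from ap_bus.h):

    static bool queue_has_override(int card, int queue)
    {
            struct ap_queue *aq = ap_get_qdev(AP_MKQID(card, queue));
            bool override = false;

            if (aq) {
                    override = !!aq->ap_dev.driver_override;
                    put_device(&aq->ap_dev.device); /* drop lookup ref */
            }
            return override;
    }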
increase the driver's module refcounter to be sure it is not @@ -1461,7 +1516,8 @@ static int __verify_card_reservations(struct device_driver *drv, void *data) return 0; if (ap_drv->in_use) { - rc = ap_drv->in_use(newapm, ap_perms.aqm); + bitmap_fill(aqm_any, AP_DOMAINS); + rc = ap_drv->in_use(newapm, aqm_any); if (rc) rc = -EBUSY; } @@ -1490,18 +1546,31 @@ static int apmask_commit(unsigned long *newapm) memcpy(ap_perms.apm, newapm, APMASKSIZE); + /* + * Update ap_apmask_aqmask_in_use. Note that the + * ap_attr_mutex has to be obtained here. + */ + ap_apmask_aqmask_in_use = + bitmap_full(ap_perms.apm, AP_DEVICES) && + bitmap_full(ap_perms.aqm, AP_DOMAINS) ? + false : true; + return 0; } static ssize_t apmask_store(const struct bus_type *bus, const char *buf, size_t count) { - int rc, changes = 0; DECLARE_BITMAP(newapm, AP_DEVICES); + int rc = -EINVAL, changes = 0; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; + /* Do not allow apmask/aqmask if driver override is active */ + if (ap_driver_override_ctr) + goto done; + rc = ap_parse_bitmap_str(buf, ap_perms.apm, AP_DEVICES, newapm); if (rc) goto done; @@ -1511,7 +1580,7 @@ static ssize_t apmask_store(const struct bus_type *bus, const char *buf, rc = apmask_commit(newapm); done: - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); if (rc) return rc; @@ -1529,12 +1598,12 @@ static ssize_t aqmask_show(const struct bus_type *bus, char *buf) { int rc; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; rc = sysfs_emit(buf, "0x%016lx%016lx%016lx%016lx\n", ap_perms.aqm[0], ap_perms.aqm[1], ap_perms.aqm[2], ap_perms.aqm[3]); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return rc; } @@ -1544,6 +1613,7 @@ static int __verify_queue_reservations(struct device_driver *drv, void *data) int rc = 0; struct ap_driver *ap_drv = to_ap_drv(drv); unsigned long *newaqm = (unsigned long *)data; + unsigned long apm_any[BITS_TO_LONGS(AP_DEVICES)]; /* * increase the driver's module refcounter to be sure it is not @@ -1553,7 +1623,8 @@ static int __verify_queue_reservations(struct device_driver *drv, void *data) return 0; if (ap_drv->in_use) { - rc = ap_drv->in_use(ap_perms.apm, newaqm); + bitmap_fill(apm_any, AP_DEVICES); + rc = ap_drv->in_use(apm_any, newaqm); if (rc) rc = -EBUSY; } @@ -1582,18 +1653,31 @@ static int aqmask_commit(unsigned long *newaqm) memcpy(ap_perms.aqm, newaqm, AQMASKSIZE); + /* + * Update ap_apmask_aqmask_in_use. Note that the + * ap_attr_mutex has to be obtained here. + */ + ap_apmask_aqmask_in_use = + bitmap_full(ap_perms.apm, AP_DEVICES) && + bitmap_full(ap_perms.aqm, AP_DOMAINS) ? 
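apmask_commit() and aqmask_commit() both maintain the new ap_apmask_aqmask_in_use flag: the masks count as "in use" as soon as either deviates from the default all-ones state. The `bitmap_full(...) ? false : true` ternary used in the commit helpers is simply a negation, as this equivalent sketch shows (AP_DEVICES and AP_DOMAINS are the bit counts from ap_bus.h):

    #include <linux/bitmap.h>

    static bool masks_in_use(const unsigned long *apm,
                             const unsigned long *aqm)
    {
            return !(bitmap_full(apm, AP_DEVICES) &&
                     bitmap_full(aqm, AP_DOMAINS));
    }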
+ false : true; + return 0; } static ssize_t aqmask_store(const struct bus_type *bus, const char *buf, size_t count) { - int rc, changes = 0; DECLARE_BITMAP(newaqm, AP_DOMAINS); + int rc = -EINVAL, changes = 0; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; + /* Do not allow apmask/aqmask if driver override is active */ + if (ap_driver_override_ctr) + goto done; + rc = ap_parse_bitmap_str(buf, ap_perms.aqm, AP_DOMAINS, newaqm); if (rc) goto done; @@ -1603,7 +1687,7 @@ static ssize_t aqmask_store(const struct bus_type *bus, const char *buf, rc = aqmask_commit(newaqm); done: - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); if (rc) return rc; @@ -1650,6 +1734,15 @@ static ssize_t bindings_show(const struct bus_type *bus, char *buf) static BUS_ATTR_RO(bindings); +static ssize_t bindings_complete_count_show(const struct bus_type *bus, + char *buf) +{ + return sysfs_emit(buf, "%llu\n", + atomic64_read(&ap_bindings_complete_count)); +} + +static BUS_ATTR_RO(bindings_complete_count); + static ssize_t features_show(const struct bus_type *bus, char *buf) { int n = 0; @@ -1690,6 +1783,7 @@ static struct attribute *ap_bus_attrs[] = { &bus_attr_aqmask.attr, &bus_attr_scans.attr, &bus_attr_bindings.attr, + &bus_attr_bindings_complete_count.attr, &bus_attr_features.attr, NULL, }; @@ -2464,14 +2558,14 @@ static void __init ap_perms_init(void) if (apm_str) { memset(&ap_perms.apm, 0, sizeof(ap_perms.apm)); ap_parse_mask_str(apm_str, ap_perms.apm, AP_DEVICES, - &ap_perms_mutex); + &ap_attr_mutex); } /* aqm kernel parameter string */ if (aqm_str) { memset(&ap_perms.aqm, 0, sizeof(ap_perms.aqm)); ap_parse_mask_str(aqm_str, ap_perms.aqm, AP_DOMAINS, - &ap_perms_mutex); + &ap_attr_mutex); } } @@ -2484,15 +2578,15 @@ static int __init ap_module_init(void) { int rc; - rc = ap_debug_init(); - if (rc) - return rc; - if (!ap_instructions_available()) { pr_warn("The hardware system does not support AP instructions\n"); return -ENODEV; } + rc = ap_debug_init(); + if (rc) + return rc; + /* init ap_queue hashtable */ hash_init(ap_queues); diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 4b7ffa840563..51e08f27bd75 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -166,6 +166,7 @@ void ap_driver_unregister(struct ap_driver *); struct ap_device { struct device device; int device_type; /* AP device type. */ + const char *driver_override; }; #define to_ap_dev(x) container_of((x), struct ap_device, device) @@ -280,7 +281,9 @@ struct ap_perms { }; extern struct ap_perms ap_perms; -extern struct mutex ap_perms_mutex; +extern bool ap_apmask_aqmask_in_use; +extern int ap_driver_override_ctr; +extern struct mutex ap_attr_mutex; /* * Get ap_queue device for this qid. diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c index ce953cbbd564..8102c8134c49 100644 --- a/drivers/s390/crypto/ap_card.c +++ b/drivers/s390/crypto/ap_card.c @@ -6,8 +6,7 @@ * Adjunct processor bus, card related code. */ -#define KMSG_COMPONENT "ap" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "ap: " fmt #include <linux/init.h> #include <linux/slab.h> diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 8977866fab1b..4a32c1e19a1e 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -6,17 +6,22 @@ * Adjunct processor bus, queue related code. 
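Both mask stores follow the same locking skeleton: ap_attr_mutex is taken with mutex_lock_interruptible() so a signal aborts the write instead of blocking, and the new guard rejects mask updates while any driver_override is active. A condensed sketch (attribute name is illustrative; parsing elided):

    static ssize_t demo_mask_store(const struct bus_type *bus,
                                   const char *buf, size_t count)
    {
            int rc = -EINVAL;

            if (mutex_lock_interruptible(&ap_attr_mutex))
                    return -ERESTARTSYS;    /* signal aborts the write */
            if (ap_driver_override_ctr)     /* overrides active: refuse */
                    goto done;
            /* ... parse buf, commit the new mask ... */
            rc = count;
    done:
            mutex_unlock(&ap_attr_mutex);
            return rc;
    }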
*/ -#define KMSG_COMPONENT "ap" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "ap: " fmt #include <linux/export.h> #include <linux/init.h> #include <linux/slab.h> #include <asm/facility.h> +#define CREATE_TRACE_POINTS +#include <asm/trace/ap.h> + #include "ap_bus.h" #include "ap_debug.h" +EXPORT_TRACEPOINT_SYMBOL(s390_ap_nqap); +EXPORT_TRACEPOINT_SYMBOL(s390_ap_dqap); + static void __ap_flush_queue(struct ap_queue *aq); /* @@ -98,9 +103,17 @@ static inline struct ap_queue_status __ap_send(ap_qid_t qid, unsigned long psmid, void *msg, size_t msglen, int special) { + struct ap_queue_status status; + if (special) qid |= 0x400000UL; - return ap_nqap(qid, psmid, msg, msglen); + + status = ap_nqap(qid, psmid, msg, msglen); + + trace_s390_ap_nqap(AP_QID_CARD(qid), AP_QID_QUEUE(qid), + status.value, psmid); + + return status; } /* State machine definitions and helpers */ @@ -140,6 +153,9 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq) parts++; } while (status.response_code == 0xFF && resgr0 != 0); + trace_s390_ap_dqap(AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid), + status.value, aq->reply->psmid); + switch (status.response_code) { case AP_RESPONSE_NORMAL: print_hex_dump_debug("aprpl: ", DUMP_PREFIX_ADDRESS, 16, 1, @@ -714,6 +730,58 @@ static ssize_t ap_functions_show(struct device *dev, static DEVICE_ATTR_RO(ap_functions); +static ssize_t driver_override_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ap_queue *aq = to_ap_queue(dev); + struct ap_device *ap_dev = &aq->ap_dev; + int rc; + + device_lock(dev); + if (ap_dev->driver_override) + rc = sysfs_emit(buf, "%s\n", ap_dev->driver_override); + else + rc = sysfs_emit(buf, "\n"); + device_unlock(dev); + + return rc; +} + +static ssize_t driver_override_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ap_queue *aq = to_ap_queue(dev); + struct ap_device *ap_dev = &aq->ap_dev; + int rc = -EINVAL; + bool old_value; + + if (mutex_lock_interruptible(&ap_attr_mutex)) + return -ERESTARTSYS; + + /* Do not allow driver override if apmask/aqmask is in use */ + if (ap_apmask_aqmask_in_use) + goto out; + + old_value = ap_dev->driver_override ? 
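ap_queue.c becomes the one translation unit that instantiates the new AP tracepoints: defining CREATE_TRACE_POINTS before including the trace header turns the TRACE_EVENT() declarations into definitions, and EXPORT_TRACEPOINT_SYMBOL() makes them reachable from modules. The pattern, with an illustrative call-site wrapper:

    /* Exactly one .c file per trace header defines the events: */
    #define CREATE_TRACE_POINTS
    #include <asm/trace/ap.h>

    EXPORT_TRACEPOINT_SYMBOL(s390_ap_nqap);
    EXPORT_TRACEPOINT_SYMBOL(s390_ap_dqap);

    /* Fire the NQAP event with queue id, status word and message id: */
    static void trace_send(ap_qid_t qid, struct ap_queue_status status,
                           unsigned long psmid)
    {
            trace_s390_ap_nqap(AP_QID_CARD(qid), AP_QID_QUEUE(qid),
                               status.value, psmid);
    }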
true : false; + rc = driver_set_override(dev, &ap_dev->driver_override, buf, count); + if (rc) + goto out; + if (old_value && !ap_dev->driver_override) + --ap_driver_override_ctr; + else if (!old_value && ap_dev->driver_override) + ++ap_driver_override_ctr; + + rc = count; + +out: + mutex_unlock(&ap_attr_mutex); + return rc; +} + +static DEVICE_ATTR_RW(driver_override); + #ifdef CONFIG_AP_DEBUG static ssize_t states_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -826,6 +894,7 @@ static struct attribute *ap_queue_dev_attrs[] = { &dev_attr_config.attr, &dev_attr_chkstop.attr, &dev_attr_ap_functions.attr, + &dev_attr_driver_override.attr, #ifdef CONFIG_AP_DEBUG &dev_attr_states.attr, &dev_attr_last_err_rc.attr, diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 01549003a903..ad1cd699f53b 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -7,8 +7,7 @@ * Author(s): Harald Freudenberger */ -#define KMSG_COMPONENT "pkey" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "pkey: " fmt #include <linux/init.h> #include <linux/miscdevice.h> diff --git a/drivers/s390/crypto/pkey_base.c b/drivers/s390/crypto/pkey_base.c index b15741461a63..d60cd987c16d 100644 --- a/drivers/s390/crypto/pkey_base.c +++ b/drivers/s390/crypto/pkey_base.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2024 */ -#define KMSG_COMPONENT "pkey" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "pkey: " fmt #include <linux/cpufeature.h> #include <linux/export.h> diff --git a/drivers/s390/crypto/pkey_cca.c b/drivers/s390/crypto/pkey_cca.c index 6c7897a93f27..d4550d8d8eea 100644 --- a/drivers/s390/crypto/pkey_cca.c +++ b/drivers/s390/crypto/pkey_cca.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2024 */ -#define KMSG_COMPONENT "pkey" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "pkey: " fmt #include <linux/init.h> #include <linux/module.h> diff --git a/drivers/s390/crypto/pkey_ep11.c b/drivers/s390/crypto/pkey_ep11.c index 6b23adc560c8..654eed20d0d9 100644 --- a/drivers/s390/crypto/pkey_ep11.c +++ b/drivers/s390/crypto/pkey_ep11.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2024 */ -#define KMSG_COMPONENT "pkey" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "pkey: " fmt #include <linux/init.h> #include <linux/module.h> diff --git a/drivers/s390/crypto/pkey_pckmo.c b/drivers/s390/crypto/pkey_pckmo.c index 7eca9f1340bd..793326c4c59a 100644 --- a/drivers/s390/crypto/pkey_pckmo.c +++ b/drivers/s390/crypto/pkey_pckmo.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2024 */ -#define KMSG_COMPONENT "pkey" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "pkey: " fmt #include <linux/init.h> #include <linux/module.h> diff --git a/drivers/s390/crypto/pkey_sysfs.c b/drivers/s390/crypto/pkey_sysfs.c index 792c0fce88fa..b6b0a46cb8a8 100644 --- a/drivers/s390/crypto/pkey_sysfs.c +++ b/drivers/s390/crypto/pkey_sysfs.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2024 */ -#define KMSG_COMPONENT "pkey" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "pkey: " fmt #include <linux/sysfs.h> diff --git a/drivers/s390/crypto/pkey_uv.c b/drivers/s390/crypto/pkey_uv.c index e5c6e01acaf3..6cd3c49384b5 100644 --- a/drivers/s390/crypto/pkey_uv.c +++ b/drivers/s390/crypto/pkey_uv.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 
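driver_override_store() keeps ap_driver_override_ctr in step with the transitions driver_set_override() performs, and since this store and the mask stores all run under ap_attr_mutex, a nonzero counter can veto mask writes just as a non-default mask vetoes overrides. The bookkeeping in isolation (taken from the hunk above, error paths elided):

    bool old_value = ap_dev->driver_override != NULL;

    rc = driver_set_override(dev, &ap_dev->driver_override, buf, count);
    if (rc)
            goto out;
    if (old_value && !ap_dev->driver_override)
            --ap_driver_override_ctr;       /* override cleared */
    else if (!old_value && ap_dev->driver_override)
            ++ap_driver_override_ctr;       /* override set */

From user space, writing a driver name to a queue's driver_override attribute pins that queue to the named driver on the next (re)probe; writing an empty string clears the override and drops the counter again.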
2024 */ -#define KMSG_COMPONENT "pkey" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "pkey: " fmt #include <linux/cpufeature.h> #include <linux/init.h> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index eb5ff49f6fe7..48da32ad0493 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -968,7 +968,7 @@ static int vfio_ap_mdev_verify_no_sharing(struct ap_matrix_mdev *assignee, * * Return: One of the following values: * o the error returned from the ap_apqn_in_matrix_owned_by_def_drv() function, - * most likely -EBUSY indicating the ap_perms_mutex lock is already held. + * most likely -EBUSY indicating the ap_attr_mutex lock is already held. * o EADDRNOTAVAIL if an APQN assigned to @matrix_mdev is reserved for the * zcrypt default driver. * o EADDRINUSE if an APQN assigned to @matrix_mdev is assigned to another mdev @@ -1079,7 +1079,7 @@ static ssize_t assign_adapter_store(struct device *dev, DECLARE_BITMAP(apm_filtered, AP_DEVICES); struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); - mutex_lock(&ap_perms_mutex); + mutex_lock(&ap_attr_mutex); get_update_locks_for_mdev(matrix_mdev); ret = kstrtoul(buf, 0, &apid); @@ -1114,7 +1114,7 @@ static ssize_t assign_adapter_store(struct device *dev, ret = count; done: release_update_locks_for_mdev(matrix_mdev); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return ret; } @@ -1303,7 +1303,7 @@ static ssize_t assign_domain_store(struct device *dev, DECLARE_BITMAP(apm_filtered, AP_DEVICES); struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); - mutex_lock(&ap_perms_mutex); + mutex_lock(&ap_attr_mutex); get_update_locks_for_mdev(matrix_mdev); ret = kstrtoul(buf, 0, &apqi); @@ -1338,7 +1338,7 @@ static ssize_t assign_domain_store(struct device *dev, ret = count; done: release_update_locks_for_mdev(matrix_mdev); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return ret; } @@ -1718,7 +1718,7 @@ static ssize_t ap_config_store(struct device *dev, struct device_attribute *attr return -ENOMEM; rest = newbuf; - mutex_lock(&ap_perms_mutex); + mutex_lock(&ap_attr_mutex); get_update_locks_for_mdev(matrix_mdev); /* Save old state */ @@ -1779,7 +1779,7 @@ static ssize_t ap_config_store(struct device *dev, struct device_attribute *attr } out: release_update_locks_for_mdev(matrix_mdev); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); kfree(newbuf); return rc; } diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 4e6bf1cb3475..7a3b99f065f2 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -12,8 +12,7 @@ * Multiple device nodes: Harald Freudenberger <freude@linux.ibm.com> */ -#define KMSG_COMPONENT "zcrypt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zcrypt: " fmt #include <linux/export.h> #include <linux/module.h> @@ -21,7 +20,6 @@ #include <linux/interrupt.h> #include <linux/miscdevice.h> #include <linux/fs.h> -#include <linux/compat.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/uaccess.h> @@ -163,7 +161,7 @@ static ssize_t ioctlmask_show(struct device *dev, struct zcdn_device *zcdndev = to_zcdn_dev(dev); int i, n; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; n = sysfs_emit(buf, "0x"); @@ -171,7 +169,7 @@ static ssize_t ioctlmask_show(struct device *dev, n += sysfs_emit_at(buf, n, "%016lx", zcdndev->perms.ioctlm[i]); n += 
sysfs_emit_at(buf, n, "\n"); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return n; } @@ -184,7 +182,7 @@ static ssize_t ioctlmask_store(struct device *dev, struct zcdn_device *zcdndev = to_zcdn_dev(dev); rc = ap_parse_mask_str(buf, zcdndev->perms.ioctlm, - AP_IOCTLS, &ap_perms_mutex); + AP_IOCTLS, &ap_attr_mutex); if (rc) return rc; @@ -200,7 +198,7 @@ static ssize_t apmask_show(struct device *dev, struct zcdn_device *zcdndev = to_zcdn_dev(dev); int i, n; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; n = sysfs_emit(buf, "0x"); @@ -208,7 +206,7 @@ static ssize_t apmask_show(struct device *dev, n += sysfs_emit_at(buf, n, "%016lx", zcdndev->perms.apm[i]); n += sysfs_emit_at(buf, n, "\n"); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return n; } @@ -221,7 +219,7 @@ static ssize_t apmask_store(struct device *dev, struct zcdn_device *zcdndev = to_zcdn_dev(dev); rc = ap_parse_mask_str(buf, zcdndev->perms.apm, - AP_DEVICES, &ap_perms_mutex); + AP_DEVICES, &ap_attr_mutex); if (rc) return rc; @@ -237,7 +235,7 @@ static ssize_t aqmask_show(struct device *dev, struct zcdn_device *zcdndev = to_zcdn_dev(dev); int i, n; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; n = sysfs_emit(buf, "0x"); @@ -245,7 +243,7 @@ static ssize_t aqmask_show(struct device *dev, n += sysfs_emit_at(buf, n, "%016lx", zcdndev->perms.aqm[i]); n += sysfs_emit_at(buf, n, "\n"); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return n; } @@ -258,7 +256,7 @@ static ssize_t aqmask_store(struct device *dev, struct zcdn_device *zcdndev = to_zcdn_dev(dev); rc = ap_parse_mask_str(buf, zcdndev->perms.aqm, - AP_DOMAINS, &ap_perms_mutex); + AP_DOMAINS, &ap_attr_mutex); if (rc) return rc; @@ -274,7 +272,7 @@ static ssize_t admask_show(struct device *dev, struct zcdn_device *zcdndev = to_zcdn_dev(dev); int i, n; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; n = sysfs_emit(buf, "0x"); @@ -282,7 +280,7 @@ static ssize_t admask_show(struct device *dev, n += sysfs_emit_at(buf, n, "%016lx", zcdndev->perms.adm[i]); n += sysfs_emit_at(buf, n, "\n"); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return n; } @@ -295,7 +293,7 @@ static ssize_t admask_store(struct device *dev, struct zcdn_device *zcdndev = to_zcdn_dev(dev); rc = ap_parse_mask_str(buf, zcdndev->perms.adm, - AP_DOMAINS, &ap_perms_mutex); + AP_DOMAINS, &ap_attr_mutex); if (rc) return rc; @@ -371,7 +369,7 @@ static int zcdn_create(const char *name) int i, rc = 0; struct zcdn_device *zcdndev; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; /* check if device node with this name already exists */ @@ -426,7 +424,7 @@ static int zcdn_create(const char *name) __func__, MAJOR(devt), MINOR(devt)); unlockout: - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return rc; } @@ -435,7 +433,7 @@ static int zcdn_destroy(const char *name) int rc = 0; struct zcdn_device *zcdndev; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; /* try to find this zcdn device */ @@ -453,7 +451,7 @@ static int zcdn_destroy(const char *name) device_unregister(&zcdndev->device); unlockout: - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); return rc; } @@ -463,7 +461,7 @@ 
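The zcdn mask attributes build their output with sysfs_emit()/sysfs_emit_at(), which bound every write to PAGE_SIZE and return the number of bytes emitted so the offset can be accumulated. The show-side pattern in isolation (the static array stands in for zcdndev->perms.apm and friends):

    #include <linux/device.h>
    #include <linux/sysfs.h>

    static unsigned long demo_mask[4];      /* stand-in for perms.apm */

    static ssize_t demo_mask_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
    {
            int i, n;

            n = sysfs_emit(buf, "0x");
            for (i = 0; i < 4; i++)
                    n += sysfs_emit_at(buf, n, "%016lx", demo_mask[i]);
            n += sysfs_emit_at(buf, n, "\n");
            return n;
    }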
static void zcdn_destroy_all(void) dev_t devt; struct zcdn_device *zcdndev; - mutex_lock(&ap_perms_mutex); + mutex_lock(&ap_attr_mutex); for (i = 0; i < ZCRYPT_MAX_MINOR_NODES; i++) { devt = MKDEV(MAJOR(zcrypt_devt), MINOR(zcrypt_devt) + i); zcdndev = find_zcdndev_by_devt(devt); @@ -472,7 +470,7 @@ static void zcdn_destroy_all(void) device_unregister(&zcdndev->device); } } - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); } /* @@ -509,11 +507,11 @@ static int zcrypt_open(struct inode *inode, struct file *filp) if (filp->f_inode->i_cdev == &zcrypt_cdev) { struct zcdn_device *zcdndev; - if (mutex_lock_interruptible(&ap_perms_mutex)) + if (mutex_lock_interruptible(&ap_attr_mutex)) return -ERESTARTSYS; zcdndev = find_zcdndev_by_devt(filp->f_inode->i_rdev); /* find returns a reference, no get_device() needed */ - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); if (zcdndev) perms = &zcdndev->perms; } @@ -533,9 +531,9 @@ static int zcrypt_release(struct inode *inode, struct file *filp) if (filp->f_inode->i_cdev == &zcrypt_cdev) { struct zcdn_device *zcdndev; - mutex_lock(&ap_perms_mutex); + mutex_lock(&ap_attr_mutex); zcdndev = find_zcdndev_by_devt(filp->f_inode->i_rdev); - mutex_unlock(&ap_perms_mutex); + mutex_unlock(&ap_attr_mutex); if (zcdndev) { /* 2 puts here: one for find, one for open */ put_device(&zcdndev->device); @@ -740,7 +738,8 @@ out: tr->last_qid = qid; } trace_s390_zcrypt_rep(mex, func_code, rc, - AP_QID_CARD(qid), AP_QID_QUEUE(qid)); + AP_QID_CARD(qid), AP_QID_QUEUE(qid), + ap_msg.psmid); return rc; } @@ -845,7 +844,8 @@ out: tr->last_qid = qid; } trace_s390_zcrypt_rep(crt, func_code, rc, - AP_QID_CARD(qid), AP_QID_QUEUE(qid)); + AP_QID_CARD(qid), AP_QID_QUEUE(qid), + ap_msg.psmid); return rc; } @@ -980,7 +980,8 @@ out: tr->last_qid = qid; } trace_s390_zcrypt_rep(xcrb, func_code, rc, - AP_QID_CARD(qid), AP_QID_QUEUE(qid)); + AP_QID_CARD(qid), AP_QID_QUEUE(qid), + ap_msg.psmid); return rc; } @@ -1182,7 +1183,8 @@ out: tr->last_qid = qid; } trace_s390_zcrypt_rep(xcrb, func_code, rc, - AP_QID_CARD(qid), AP_QID_QUEUE(qid)); + AP_QID_CARD(qid), AP_QID_QUEUE(qid), + ap_msg.psmid); return rc; } @@ -1274,7 +1276,8 @@ static long zcrypt_rng(char *buffer) out: ap_release_apmsg(&ap_msg); trace_s390_zcrypt_rep(buffer, func_code, rc, - AP_QID_CARD(qid), AP_QID_QUEUE(qid)); + AP_QID_CARD(qid), AP_QID_QUEUE(qid), + ap_msg.psmid); return rc; } @@ -1729,197 +1732,6 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, } } -#ifdef CONFIG_COMPAT -/* - * ioctl32 conversion routines - */ -struct compat_ica_rsa_modexpo { - compat_uptr_t inputdata; - unsigned int inputdatalength; - compat_uptr_t outputdata; - unsigned int outputdatalength; - compat_uptr_t b_key; - compat_uptr_t n_modulus; -}; - -static long trans_modexpo32(struct ap_perms *perms, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct compat_ica_rsa_modexpo __user *umex32 = compat_ptr(arg); - struct compat_ica_rsa_modexpo mex32; - struct ica_rsa_modexpo mex64; - struct zcrypt_track tr; - long rc; - - memset(&tr, 0, sizeof(tr)); - if (copy_from_user(&mex32, umex32, sizeof(mex32))) - return -EFAULT; - mex64.inputdata = compat_ptr(mex32.inputdata); - mex64.inputdatalength = mex32.inputdatalength; - mex64.outputdata = compat_ptr(mex32.outputdata); - mex64.outputdatalength = mex32.outputdatalength; - mex64.b_key = compat_ptr(mex32.b_key); - mex64.n_modulus = compat_ptr(mex32.n_modulus); - do { - rc = zcrypt_rsa_modexpo(perms, &tr, &mex64); - } while (rc == -EAGAIN && 
++tr.again_counter < TRACK_AGAIN_MAX); - - /* on ENODEV failure: retry once again after a requested rescan */ - if (rc == -ENODEV && zcrypt_process_rescan()) - do { - rc = zcrypt_rsa_modexpo(perms, &tr, &mex64); - } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX); - if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) - rc = -EIO; - if (rc) - return rc; - return put_user(mex64.outputdatalength, - &umex32->outputdatalength); -} - -struct compat_ica_rsa_modexpo_crt { - compat_uptr_t inputdata; - unsigned int inputdatalength; - compat_uptr_t outputdata; - unsigned int outputdatalength; - compat_uptr_t bp_key; - compat_uptr_t bq_key; - compat_uptr_t np_prime; - compat_uptr_t nq_prime; - compat_uptr_t u_mult_inv; -}; - -static long trans_modexpo_crt32(struct ap_perms *perms, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct compat_ica_rsa_modexpo_crt __user *ucrt32 = compat_ptr(arg); - struct compat_ica_rsa_modexpo_crt crt32; - struct ica_rsa_modexpo_crt crt64; - struct zcrypt_track tr; - long rc; - - memset(&tr, 0, sizeof(tr)); - if (copy_from_user(&crt32, ucrt32, sizeof(crt32))) - return -EFAULT; - crt64.inputdata = compat_ptr(crt32.inputdata); - crt64.inputdatalength = crt32.inputdatalength; - crt64.outputdata = compat_ptr(crt32.outputdata); - crt64.outputdatalength = crt32.outputdatalength; - crt64.bp_key = compat_ptr(crt32.bp_key); - crt64.bq_key = compat_ptr(crt32.bq_key); - crt64.np_prime = compat_ptr(crt32.np_prime); - crt64.nq_prime = compat_ptr(crt32.nq_prime); - crt64.u_mult_inv = compat_ptr(crt32.u_mult_inv); - do { - rc = zcrypt_rsa_crt(perms, &tr, &crt64); - } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX); - - /* on ENODEV failure: retry once again after a requested rescan */ - if (rc == -ENODEV && zcrypt_process_rescan()) - do { - rc = zcrypt_rsa_crt(perms, &tr, &crt64); - } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX); - if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) - rc = -EIO; - if (rc) - return rc; - return put_user(crt64.outputdatalength, - &ucrt32->outputdatalength); -} - -struct compat_ica_xcrb { - unsigned short agent_ID; - unsigned int user_defined; - unsigned short request_ID; - unsigned int request_control_blk_length; - unsigned char padding1[16 - sizeof(compat_uptr_t)]; - compat_uptr_t request_control_blk_addr; - unsigned int request_data_length; - char padding2[16 - sizeof(compat_uptr_t)]; - compat_uptr_t request_data_address; - unsigned int reply_control_blk_length; - char padding3[16 - sizeof(compat_uptr_t)]; - compat_uptr_t reply_control_blk_addr; - unsigned int reply_data_length; - char padding4[16 - sizeof(compat_uptr_t)]; - compat_uptr_t reply_data_addr; - unsigned short priority_window; - unsigned int status; -} __packed; - -static long trans_xcrb32(struct ap_perms *perms, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct compat_ica_xcrb __user *uxcrb32 = compat_ptr(arg); - u32 xflags = ZCRYPT_XFLAG_USERSPACE; - struct compat_ica_xcrb xcrb32; - struct zcrypt_track tr; - struct ica_xcRB xcrb64; - long rc; - - memset(&tr, 0, sizeof(tr)); - if (copy_from_user(&xcrb32, uxcrb32, sizeof(xcrb32))) - return -EFAULT; - xcrb64.agent_ID = xcrb32.agent_ID; - xcrb64.user_defined = xcrb32.user_defined; - xcrb64.request_ID = xcrb32.request_ID; - xcrb64.request_control_blk_length = - xcrb32.request_control_blk_length; - xcrb64.request_control_blk_addr = - compat_ptr(xcrb32.request_control_blk_addr); - xcrb64.request_data_length = - xcrb32.request_data_length; - 
xcrb64.request_data_address = - compat_ptr(xcrb32.request_data_address); - xcrb64.reply_control_blk_length = - xcrb32.reply_control_blk_length; - xcrb64.reply_control_blk_addr = - compat_ptr(xcrb32.reply_control_blk_addr); - xcrb64.reply_data_length = xcrb32.reply_data_length; - xcrb64.reply_data_addr = - compat_ptr(xcrb32.reply_data_addr); - xcrb64.priority_window = xcrb32.priority_window; - xcrb64.status = xcrb32.status; - do { - rc = _zcrypt_send_cprb(xflags, perms, &tr, &xcrb64); - } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX); - - /* on ENODEV failure: retry once again after a requested rescan */ - if (rc == -ENODEV && zcrypt_process_rescan()) - do { - rc = _zcrypt_send_cprb(xflags, perms, &tr, &xcrb64); - } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX); - if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) - rc = -EIO; - xcrb32.reply_control_blk_length = xcrb64.reply_control_blk_length; - xcrb32.reply_data_length = xcrb64.reply_data_length; - xcrb32.status = xcrb64.status; - if (copy_to_user(uxcrb32, &xcrb32, sizeof(xcrb32))) - return -EFAULT; - return rc; -} - -static long zcrypt_compat_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - int rc; - struct ap_perms *perms = - (struct ap_perms *)filp->private_data; - - rc = zcrypt_check_ioctl(perms, cmd); - if (rc) - return rc; - - if (cmd == ICARSAMODEXPO) - return trans_modexpo32(perms, filp, cmd, arg); - if (cmd == ICARSACRT) - return trans_modexpo_crt32(perms, filp, cmd, arg); - if (cmd == ZSECSENDCPRB) - return trans_xcrb32(perms, filp, cmd, arg); - return zcrypt_unlocked_ioctl(filp, cmd, arg); -} -#endif - /* * Misc device file operations. */ @@ -1928,9 +1740,6 @@ static const struct file_operations zcrypt_fops = { .read = zcrypt_read, .write = zcrypt_write, .unlocked_ioctl = zcrypt_unlocked_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = zcrypt_compat_ioctl, -#endif .open = zcrypt_open, .release = zcrypt_release, }; diff --git a/drivers/s390/crypto/zcrypt_card.c b/drivers/s390/crypto/zcrypt_card.c index aa2c8ff2740e..6dea702a5cac 100644 --- a/drivers/s390/crypto/zcrypt_card.c +++ b/drivers/s390/crypto/zcrypt_card.c @@ -19,7 +19,6 @@ #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/compat.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/uaccess.h> diff --git a/drivers/s390/crypto/zcrypt_ccamisc.c b/drivers/s390/crypto/zcrypt_ccamisc.c index a96e25614303..573bad1d6d86 100644 --- a/drivers/s390/crypto/zcrypt_ccamisc.c +++ b/drivers/s390/crypto/zcrypt_ccamisc.c @@ -7,8 +7,7 @@ * Collection of CCA misc functions used by zcrypt and pkey */ -#define KMSG_COMPONENT "zcrypt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zcrypt: " fmt #include <linux/export.h> #include <linux/init.h> diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index e92e2fd8ce5d..3dda9589f2b9 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -6,8 +6,7 @@ * Collection of EP11 misc functions used by zcrypt and pkey */ -#define KMSG_COMPONENT "zcrypt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zcrypt: " fmt #include <linux/export.h> #include <linux/init.h> diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c index fc0a2a053dc2..d6fc2d8e7fad 100644 --- a/drivers/s390/crypto/zcrypt_msgtype50.c +++ b/drivers/s390/crypto/zcrypt_msgtype50.c @@ -10,8 +10,7 @@ * MSGTYPE restruct: Holger 
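The entire ioctl32 layer deleted above, trans_modexpo32(), trans_modexpo_crt32() and trans_xcrb32(), existed only to repack 31-bit pointer layouts into the 64-bit structures. With compat tasks gone there is nothing left to translate, so the misc device registers a single handler; per the hunk, the fops reduce to:

    static const struct file_operations zcrypt_fops = {
            .read           = zcrypt_read,
            .write          = zcrypt_write,
            .unlocked_ioctl = zcrypt_unlocked_ioctl,
            .open           = zcrypt_open,
            .release        = zcrypt_release,
    };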
Dengler <hd@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zcrypt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zcrypt: " fmt #include <linux/module.h> #include <linux/slab.h> diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index 9cefbb30960f..a0dcab5dc4f2 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -10,8 +10,7 @@ * MSGTYPE restruct: Holger Dengler <hd@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "zcrypt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zcrypt: " fmt #include <linux/module.h> #include <linux/init.h> diff --git a/drivers/s390/crypto/zcrypt_queue.c b/drivers/s390/crypto/zcrypt_queue.c index 76a8678bdad6..a173d32eb6e8 100644 --- a/drivers/s390/crypto/zcrypt_queue.c +++ b/drivers/s390/crypto/zcrypt_queue.c @@ -19,7 +19,6 @@ #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/compat.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/uaccess.h> diff --git a/drivers/s390/net/ctcm_fsms.c b/drivers/s390/net/ctcm_fsms.c index 9678c6a2cda7..e221687a9858 100644 --- a/drivers/s390/net/ctcm_fsms.c +++ b/drivers/s390/net/ctcm_fsms.c @@ -12,8 +12,7 @@ #undef DEBUGDATA #undef DEBUGCCW -#define KMSG_COMPONENT "ctcm" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "ctcm: " fmt #include <linux/module.h> #include <linux/init.h> diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c index b93c2eb45916..3d7ccf2366a0 100644 --- a/drivers/s390/net/ctcm_main.c +++ b/drivers/s390/net/ctcm_main.c @@ -20,8 +20,7 @@ #undef DEBUGDATA #undef DEBUGCCW -#define KMSG_COMPONENT "ctcm" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "ctcm: " fmt #include <linux/module.h> #include <linux/init.h> diff --git a/drivers/s390/net/ctcm_mpc.c b/drivers/s390/net/ctcm_mpc.c index 407b7c516658..0f329fb514ee 100644 --- a/drivers/s390/net/ctcm_mpc.c +++ b/drivers/s390/net/ctcm_mpc.c @@ -18,8 +18,7 @@ #undef DEBUGDATA #undef DEBUGCCW -#define KMSG_COMPONENT "ctcm" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "ctcm: " fmt #include <linux/export.h> #include <linux/module.h> diff --git a/drivers/s390/net/ctcm_sysfs.c b/drivers/s390/net/ctcm_sysfs.c index 0c5d8a3eaa2e..529a1c40ae63 100644 --- a/drivers/s390/net/ctcm_sysfs.c +++ b/drivers/s390/net/ctcm_sysfs.c @@ -9,8 +9,7 @@ #undef DEBUGDATA #undef DEBUGCCW -#define KMSG_COMPONENT "ctcm" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "ctcm: " fmt #include <linux/device.h> #include <linux/sysfs.h> diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index f84aa2e676e9..8b8e4f06be0f 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -4,8 +4,7 @@ * * Copyright IBM Corp. 
2018 */ -#define KMSG_COMPONENT "ism" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "ism: " fmt #include <linux/export.h> #include <linux/module.h> diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index edc0bcd46923..64d45285651d 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -7,10 +7,8 @@ * Frank Blaschka <frank.blaschka@de.ibm.com> */ -#define KMSG_COMPONENT "qeth" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "qeth: " fmt -#include <linux/compat.h> #include <linux/export.h> #include <linux/module.h> #include <linux/moduleparam.h> @@ -4805,8 +4803,7 @@ static int qeth_query_oat_command(struct qeth_card *card, char __user *udata) rc = qeth_send_ipa_cmd(card, iob, qeth_setadpparms_query_oat_cb, &priv); if (!rc) { - tmp = is_compat_task() ? compat_ptr(oat_data.ptr) : - u64_to_user_ptr(oat_data.ptr); + tmp = u64_to_user_ptr(oat_data.ptr); oat_data.response_len = priv.response_len; if (copy_to_user(tmp, priv.buffer, priv.response_len) || diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c index c0e4883be6d0..a3b16d4d16fb 100644 --- a/drivers/s390/net/qeth_core_sys.c +++ b/drivers/s390/net/qeth_core_sys.c @@ -7,8 +7,7 @@ * Frank Blaschka <frank.blaschka@de.ibm.com> */ -#define KMSG_COMPONENT "qeth" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "qeth: " fmt #include <linux/list.h> #include <linux/rwsem.h> diff --git a/drivers/s390/net/qeth_ethtool.c b/drivers/s390/net/qeth_ethtool.c index f184c58ecf24..d214a889cf4e 100644 --- a/drivers/s390/net/qeth_ethtool.c +++ b/drivers/s390/net/qeth_ethtool.c @@ -3,8 +3,7 @@ * Copyright IBM Corp. 2018 */ -#define KMSG_COMPONENT "qeth" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "qeth: " fmt #include <linux/ethtool.h> #include "qeth_core.h" diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 2a3888283a94..7498a83b1f06 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -7,8 +7,7 @@ * Frank Blaschka <frank.blaschka@de.ibm.com> */ -#define KMSG_COMPONENT "qeth" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "qeth: " fmt #include <linux/export.h> #include <linux/module.h> diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 3525be819362..027bc346232f 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -7,8 +7,7 @@ * Frank Blaschka <frank.blaschka@de.ibm.com> */ -#define KMSG_COMPONENT "qeth" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "qeth: " fmt #include <linux/export.h> #include <linux/module.h> diff --git a/drivers/s390/net/smsgiucv_app.c b/drivers/s390/net/smsgiucv_app.c index 4bd4d6bfc126..7041c1dca1e8 100644 --- a/drivers/s390/net/smsgiucv_app.c +++ b/drivers/s390/net/smsgiucv_app.c @@ -10,8 +10,7 @@ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> * */ -#define KMSG_COMPONENT "smsgiucv_app" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "smsgiucv_app: " fmt #include <linux/ctype.h> #include <linux/err.h> @@ -161,7 +160,7 @@ static int __init smsgiucv_app_init(void) if (!smsgiucv_drv) return -ENODEV; - smsg_app_dev = iucv_alloc_device(NULL, smsgiucv_drv, NULL, KMSG_COMPONENT); + smsg_app_dev = iucv_alloc_device(NULL, smsgiucv_drv, NULL, "smsgiucv_app"); if (!smsg_app_dev) return -ENOMEM; diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c index 
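qeth_query_oat_command() carried the same compat pattern in a different guise: the destination address arrives as a u64 inside the ioctl payload, and only the compat branch ever differed. What remains is the plain u64_to_user_ptr() conversion, sketched here with a hypothetical helper:

    #include <linux/kernel.h>
    #include <linux/uaccess.h>

    /* uaddr arrives as a u64 field of the ioctl payload. */
    static int copy_reply(u64 uaddr, const void *src, size_t len)
    {
            void __user *dst = u64_to_user_ptr(uaddr);

            return copy_to_user(dst, src, len) ? -EFAULT : 0;
    }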
dc2265ebb11b..01f927ae61b5 100644 --- a/drivers/s390/scsi/zfcp_aux.c +++ b/drivers/s390/scsi/zfcp_aux.c @@ -28,8 +28,7 @@ * Benjamin Block */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/seq_file.h> #include <linux/slab.h> diff --git a/drivers/s390/scsi/zfcp_ccw.c b/drivers/s390/scsi/zfcp_ccw.c index bdf2cc1ea713..67cb947048c4 100644 --- a/drivers/s390/scsi/zfcp_ccw.c +++ b/drivers/s390/scsi/zfcp_ccw.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2010 */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/module.h> #include "zfcp_ext.h" diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c index d904625afd40..6b5561c54e2f 100644 --- a/drivers/s390/scsi/zfcp_dbf.c +++ b/drivers/s390/scsi/zfcp_dbf.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2023 */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/module.h> #include <linux/ctype.h> diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index ffd994416995..ec6c0e102119 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2020 */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/kthread.h> #include <linux/bug.h> diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c index 1d50f463afe7..78ca394e1195 100644 --- a/drivers/s390/scsi/zfcp_fc.c +++ b/drivers/s390/scsi/zfcp_fc.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2008, 2017 */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/types.h> #include <linux/slab.h> diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index c5bba1be88f4..9418086368c3 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2023 */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/blktrace_api.h> #include <linux/jiffies.h> diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c index f2410bc44ad3..e15a1eabe42d 100644 --- a/drivers/s390/scsi/zfcp_qdio.c +++ b/drivers/s390/scsi/zfcp_qdio.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2020 */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/lockdep.h> #include <linux/slab.h> diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index b31f860af47b..141476ea21bb 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2020 */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/module.h> #include <linux/types.h> diff --git a/drivers/s390/scsi/zfcp_sysfs.c b/drivers/s390/scsi/zfcp_sysfs.c index 90a84ae98b97..10a3840b2b6b 100644 --- a/drivers/s390/scsi/zfcp_sysfs.c +++ b/drivers/s390/scsi/zfcp_sysfs.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 
2008, 2020 */ -#define KMSG_COMPONENT "zfcp" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zfcp: " fmt #include <linux/slab.h> #include "zfcp_diag.h" diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index 1c15cac41d80..768b85eecc8f 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c @@ -1762,6 +1762,7 @@ static int mesh_suspend(struct macio_dev *mdev, pm_message_t mesg) case PM_EVENT_SUSPEND: case PM_EVENT_HIBERNATE: case PM_EVENT_FREEZE: + case PM_EVENT_POWEROFF: break; default: return 0; diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index d8ad02c29320..e6357bc301cb 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c @@ -1965,6 +1965,7 @@ static int stex_choice_sleep_mic(struct st_hba *hba, pm_message_t state) case PM_EVENT_SUSPEND: return ST_S3; case PM_EVENT_HIBERNATE: + case PM_EVENT_POWEROFF: hba->msi_lock = 0; return ST_S4; default: diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index a09c188b9ad1..b10080d61860 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -296,6 +296,16 @@ config IMX8MM_THERMAL cpufreq is used as the cooling device to throttle CPUs when the passive trip is crossed. +config IMX91_THERMAL + tristate "Temperature sensor driver for NXP i.MX91 SoC" + depends on ARCH_MXC || COMPILE_TEST + depends on OF + help + Include one sensor and six comparators. Each of them compares the + temperature value (from the sensor) against the programmable + threshold values. The direction of the comparison is configurable + (greater / lesser than). + config K3_THERMAL tristate "Texas Instruments K3 thermal support" depends on ARCH_K3 || COMPILE_TEST diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index d7718978db24..bb21e7ea7fc6 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_ARMADA_THERMAL) += armada_thermal.o obj-$(CONFIG_IMX_THERMAL) += imx_thermal.o obj-$(CONFIG_IMX_SC_THERMAL) += imx_sc_thermal.o obj-$(CONFIG_IMX8MM_THERMAL) += imx8mm_thermal.o +obj-$(CONFIG_IMX91_THERMAL) += imx91_thermal.o obj-$(CONFIG_MAX77620_THERMAL) += max77620_thermal.o obj-$(CONFIG_QORIQ_THERMAL) += qoriq_thermal.o obj-$(CONFIG_DA9062_THERMAL) += da9062-thermal.o diff --git a/drivers/thermal/imx91_thermal.c b/drivers/thermal/imx91_thermal.c new file mode 100644 index 000000000000..9b20be03d6de --- /dev/null +++ b/drivers/thermal/imx91_thermal.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 NXP. 
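The mesh and stex hunks add PM_EVENT_POWEROFF cases so a poweroff transition is treated like hibernate instead of falling through to each driver's default branch. The shape of such a dispatch, as a driver-agnostic sketch (the callback name is illustrative):

    #include <linux/pm.h>

    static int demo_suspend(struct device *dev, pm_message_t mesg)
    {
            switch (mesg.event) {
            case PM_EVENT_SUSPEND:
            case PM_EVENT_HIBERNATE:
            case PM_EVENT_FREEZE:
            case PM_EVENT_POWEROFF: /* newly handled like hibernate */
                    break;
            default:
                    return 0;       /* nothing to do for other events */
            }
            /* ... quiesce the controller ... */
            return 0;
    }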
+ */ + +#include <linux/bitfield.h> +#include <linux/clk.h> +#include <linux/err.h> +#include <linux/interrupt.h> +#include <linux/iopoll.h> +#include <linux/nvmem-consumer.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> +#include <linux/thermal.h> +#include <linux/units.h> + +#define REG_SET 0x4 +#define REG_CLR 0x8 +#define REG_TOG 0xc + +#define IMX91_TMU_CTRL0 0x0 +#define IMX91_TMU_CTRL0_THR1_IE BIT(9) +#define IMX91_TMU_CTRL0_THR1_MASK GENMASK(3, 2) +#define IMX91_TMU_CTRL0_CLR_FLT1 BIT(21) + +#define IMX91_TMU_THR_MODE_LE 0 +#define IMX91_TMU_THR_MODE_GE 1 + +#define IMX91_TMU_STAT0 0x10 +#define IMX91_TMU_STAT0_THR1_IF BIT(9) +#define IMX91_TMU_STAT0_THR1_STAT BIT(13) +#define IMX91_TMU_STAT0_DRDY0_IF_MASK BIT(16) + +#define IMX91_TMU_DATA0 0x20 + +#define IMX91_TMU_CTRL1 0x200 +#define IMX91_TMU_CTRL1_EN BIT(31) +#define IMX91_TMU_CTRL1_START BIT(30) +#define IMX91_TMU_CTRL1_STOP BIT(29) +#define IMX91_TMU_CTRL1_RES_MASK GENMASK(19, 18) +#define IMX91_TMU_CTRL1_MEAS_MODE_MASK GENMASK(25, 24) +#define IMX91_TMU_CTRL1_MEAS_MODE_SINGLE 0 +#define IMX91_TMU_CTRL1_MEAS_MODE_CONTINUES 1 +#define IMX91_TMU_CTRL1_MEAS_MODE_PERIODIC 2 + +#define IMX91_TMU_THR_CTRL01 0x30 +#define IMX91_TMU_THR_CTRL01_THR1_MASK GENMASK(31, 16) + +#define IMX91_TMU_REF_DIV 0x280 +#define IMX91_TMU_DIV_EN BIT(31) +#define IMX91_TMU_DIV_MASK GENMASK(23, 16) +#define IMX91_TMU_DIV_MAX 255 + +#define IMX91_TMU_PUD_ST_CTRL 0x2b0 +#define IMX91_TMU_PUDL_MASK GENMASK(23, 16) + +#define IMX91_TMU_TRIM1 0x2e0 +#define IMX91_TMU_TRIM2 0x2f0 + +#define IMX91_TMU_TEMP_LOW_LIMIT -40000 +#define IMX91_TMU_TEMP_HIGH_LIMIT 125000 + +#define IMX91_TMU_DEFAULT_TRIM1_CONFIG 0xb561bc2d +#define IMX91_TMU_DEFAULT_TRIM2_CONFIG 0x65d4 + +#define IMX91_TMU_PERIOD_CTRL 0x270 +#define IMX91_TMU_PERIOD_CTRL_MEAS_MASK GENMASK(23, 0) + +#define IMX91_TMP_FRAC 64 + +struct imx91_tmu { + void __iomem *base; + struct clk *clk; + struct device *dev; + struct thermal_zone_device *tzd; +}; + +static void imx91_tmu_start(struct imx91_tmu *tmu, bool start) +{ + u32 val = start ? IMX91_TMU_CTRL1_START : IMX91_TMU_CTRL1_STOP; + + writel_relaxed(val, tmu->base + IMX91_TMU_CTRL1 + REG_SET); +} + +static void imx91_tmu_enable(struct imx91_tmu *tmu, bool enable) +{ + u32 reg = IMX91_TMU_CTRL1; + + reg += enable ? 
REG_SET : REG_CLR; + + writel_relaxed(IMX91_TMU_CTRL1_EN, tmu->base + reg); +} + +static int imx91_tmu_to_mcelsius(int x) +{ + return x * MILLIDEGREE_PER_DEGREE / IMX91_TMP_FRAC; +} + +static int imx91_tmu_from_mcelsius(int x) +{ + return x * IMX91_TMP_FRAC / MILLIDEGREE_PER_DEGREE; +} + +static int imx91_tmu_get_temp(struct thermal_zone_device *tz, int *temp) +{ + struct imx91_tmu *tmu = thermal_zone_device_priv(tz); + s16 data; + + /* DATA0 is 16bit signed number */ + data = readw_relaxed(tmu->base + IMX91_TMU_DATA0); + *temp = imx91_tmu_to_mcelsius(data); + + return 0; +} + +static int imx91_tmu_set_trips(struct thermal_zone_device *tz, int low, int high) +{ + struct imx91_tmu *tmu = thermal_zone_device_priv(tz); + int val; + + if (high >= IMX91_TMU_TEMP_HIGH_LIMIT) + return -EINVAL; + + writel_relaxed(IMX91_TMU_CTRL0_THR1_IE, tmu->base + IMX91_TMU_CTRL0 + REG_CLR); + + /* Comparator1 for temperature threshold */ + writel_relaxed(IMX91_TMU_THR_CTRL01_THR1_MASK, tmu->base + IMX91_TMU_THR_CTRL01 + REG_CLR); + val = FIELD_PREP(IMX91_TMU_THR_CTRL01_THR1_MASK, imx91_tmu_from_mcelsius(high)); + + writel_relaxed(val, tmu->base + IMX91_TMU_THR_CTRL01 + REG_SET); + + writel_relaxed(IMX91_TMU_STAT0_THR1_IF, tmu->base + IMX91_TMU_STAT0 + REG_CLR); + + writel_relaxed(IMX91_TMU_CTRL0_THR1_IE, tmu->base + IMX91_TMU_CTRL0 + REG_SET); + + return 0; +} + +static int imx91_init_from_nvmem_cells(struct imx91_tmu *tmu) +{ + struct device *dev = tmu->dev; + u32 trim1, trim2; + int ret; + + ret = nvmem_cell_read_u32(dev, "trim1", &trim1); + if (ret) + return ret; + + ret = nvmem_cell_read_u32(dev, "trim2", &trim2); + if (ret) + return ret; + + if (trim1 == 0 || trim2 == 0) + return -EINVAL; + + writel_relaxed(trim1, tmu->base + IMX91_TMU_TRIM1); + writel_relaxed(trim2, tmu->base + IMX91_TMU_TRIM2); + + return 0; +} + +static void imx91_tmu_action_remove(void *data) +{ + struct imx91_tmu *tmu = data; + + /* disable tmu */ + imx91_tmu_enable(tmu, false); +} + +static irqreturn_t imx91_tmu_alarm_irq(int irq, void *data) +{ + struct imx91_tmu *tmu = data; + u32 val; + + val = readl_relaxed(tmu->base + IMX91_TMU_STAT0); + + /* Check if comparison interrupt occurred */ + if (val & IMX91_TMU_STAT0_THR1_IF) { + /* Clear irq flag and disable interrupt until reconfigured */ + writel(IMX91_TMU_STAT0_THR1_IF, tmu->base + IMX91_TMU_STAT0 + REG_CLR); + writel_relaxed(IMX91_TMU_CTRL0_THR1_IE, tmu->base + IMX91_TMU_CTRL0 + REG_CLR); + + return IRQ_WAKE_THREAD; + } + + return IRQ_NONE; +} + +static irqreturn_t imx91_tmu_alarm_irq_thread(int irq, void *data) +{ + struct imx91_tmu *tmu = data; + + thermal_zone_device_update(tmu->tzd, THERMAL_EVENT_UNSPECIFIED); + + return IRQ_HANDLED; +} + +static int imx91_tmu_change_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) +{ + struct imx91_tmu *tmu = thermal_zone_device_priv(tz); + int ret; + + if (mode == THERMAL_DEVICE_ENABLED) { + ret = pm_runtime_get(tmu->dev); + if (ret < 0) + return ret; + + writel_relaxed(IMX91_TMU_CTRL0_THR1_IE | IMX91_TMU_CTRL0_THR1_MASK, + tmu->base + IMX91_TMU_CTRL0 + REG_CLR); + + writel_relaxed(FIELD_PREP(IMX91_TMU_CTRL0_THR1_MASK, IMX91_TMU_THR_MODE_GE), + tmu->base + IMX91_TMU_CTRL0 + REG_SET); + imx91_tmu_start(tmu, true); + } else { + writel_relaxed(IMX91_TMU_CTRL0_THR1_IE, tmu->base + IMX91_TMU_CTRL0 + REG_CLR); + imx91_tmu_start(tmu, false); + pm_runtime_put(tmu->dev); + } + + return 0; +} + +static struct thermal_zone_device_ops tmu_tz_ops = { + .get_temp = imx91_tmu_get_temp, + .change_mode = imx91_tmu_change_mode, + 
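The REG_SET/REG_CLR/REG_TOG offsets follow the set/clear/toggle write aliases common on i.MX register blocks, which the driver's own helpers rely on: writing a mask to base+0x4 sets those bits, base+0x8 clears them, base+0xc toggles them, so no read-modify-write cycle is needed. A sketch of the idiom used by imx91_tmu_enable() and imx91_tmu_set_trips() (helper name illustrative):

    #include <linux/io.h>
    #include <linux/types.h>

    #define REG_SET 0x4     /* write-1-to-set alias */
    #define REG_CLR 0x8     /* write-1-to-clear alias */

    /* Flip one control bit with a single posted write, no RMW. */
    static void tmu_set_bit(void __iomem *base, u32 reg, u32 bit, bool set)
    {
            writel_relaxed(bit, base + reg + (set ? REG_SET : REG_CLR));
    }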
.set_trips = imx91_tmu_set_trips, +}; + +static int imx91_tmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct imx91_tmu *tmu; + unsigned long rate; + int irq, ret; + u32 div; + + tmu = devm_kzalloc(dev, sizeof(struct imx91_tmu), GFP_KERNEL); + if (!tmu) + return -ENOMEM; + + tmu->dev = dev; + + tmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(tmu->base)) + return dev_err_probe(dev, PTR_ERR(tmu->base), "failed to get io resource\n"); + + tmu->clk = devm_clk_get_enabled(dev, NULL); + if (IS_ERR(tmu->clk)) + return dev_err_probe(dev, PTR_ERR(tmu->clk), "failed to get tmu clock\n"); + + platform_set_drvdata(pdev, tmu); + + /* disable the monitor during initialization */ + imx91_tmu_enable(tmu, false); + imx91_tmu_start(tmu, false); + + ret = imx91_init_from_nvmem_cells(tmu); + if (ret) { + dev_warn(dev, "can't get trim value, using default settings\n"); + + writel_relaxed(IMX91_TMU_DEFAULT_TRIM1_CONFIG, tmu->base + IMX91_TMU_TRIM1); + writel_relaxed(IMX91_TMU_DEFAULT_TRIM2_CONFIG, tmu->base + IMX91_TMU_TRIM2); + } + + /* The typical conv clk is 4MHz, the output freq is 'rate / (div + 1)' */ + rate = clk_get_rate(tmu->clk); + div = (rate / (4 * HZ_PER_MHZ)) - 1; + if (div > IMX91_TMU_DIV_MAX) + return dev_err_probe(dev, -EINVAL, "clock divider exceeds hardware limitation\n"); + + /* Set divider value and enable divider */ + writel_relaxed(IMX91_TMU_DIV_EN | FIELD_PREP(IMX91_TMU_DIV_MASK, div), + tmu->base + IMX91_TMU_REF_DIV); + + /* Set max power up delay: 'Tpud(ms) = 0xFF * 1000 / 4000000' */ + writel_relaxed(FIELD_PREP(IMX91_TMU_PUDL_MASK, 100U), tmu->base + IMX91_TMU_PUD_ST_CTRL); + + /* + * Set resolution mode + * 00b - Conversion time = 0.59325 ms + * 01b - Conversion time = 1.10525 ms + * 10b - Conversion time = 2.12925 ms + * 11b - Conversion time = 4.17725 ms + */ + writel_relaxed(FIELD_PREP(IMX91_TMU_CTRL1_RES_MASK, 0x3), + tmu->base + IMX91_TMU_CTRL1 + REG_CLR); + writel_relaxed(FIELD_PREP(IMX91_TMU_CTRL1_RES_MASK, 0x1), + tmu->base + IMX91_TMU_CTRL1 + REG_SET); + + writel_relaxed(IMX91_TMU_CTRL1_MEAS_MODE_MASK, tmu->base + IMX91_TMU_CTRL1 + REG_CLR); + writel_relaxed(FIELD_PREP(IMX91_TMU_CTRL1_MEAS_MODE_MASK, + IMX91_TMU_CTRL1_MEAS_MODE_PERIODIC), + tmu->base + IMX91_TMU_CTRL1 + REG_SET); + + /* + * Set Periodic Measurement Frequency to 25Hz: + * tMEAS_FREQ = tCONV_CLK * PERIOD_CTRL[MEAS_FREQ] + */ + writel_relaxed(FIELD_PREP(IMX91_TMU_PERIOD_CTRL_MEAS_MASK, 4 * HZ_PER_MHZ / 25), + tmu->base + IMX91_TMU_PERIOD_CTRL); + + imx91_tmu_enable(tmu, true); + ret = devm_add_action(dev, imx91_tmu_action_remove, tmu); + if (ret) + return dev_err_probe(dev, ret, "failed to add action imx91_tmu_action_remove()\n"); + + pm_runtime_set_active(dev); + pm_runtime_get_noresume(dev); + ret = devm_pm_runtime_enable(dev); + if (ret) + return ret; + + tmu->tzd = devm_thermal_of_zone_register(dev, 0, tmu, &tmu_tz_ops); + if (IS_ERR(tmu->tzd)) + return dev_err_probe(dev, PTR_ERR(tmu->tzd), + "failed to register thermal zone sensor\n"); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + ret = devm_request_threaded_irq(dev, irq, imx91_tmu_alarm_irq, + imx91_tmu_alarm_irq_thread, + IRQF_ONESHOT, "imx91_thermal", tmu); + + if (ret < 0) + return dev_err_probe(dev, ret, "failed to request alarm irq\n"); + + pm_runtime_put(dev); + + return 0; +} + +static int imx91_tmu_runtime_suspend(struct device *dev) +{ + struct imx91_tmu *tmu = dev_get_drvdata(dev); + + /* disable tmu */ + imx91_tmu_enable(tmu, false); + + clk_disable_unprepare(tmu->clk); + +
return 0; +} + +static int imx91_tmu_runtime_resume(struct device *dev) +{ + struct imx91_tmu *tmu = dev_get_drvdata(dev); + int ret; + + ret = clk_prepare_enable(tmu->clk); + if (ret) + return ret; + + imx91_tmu_enable(tmu, true); + + return 0; +} + +static DEFINE_RUNTIME_DEV_PM_OPS(imx91_tmu_pm_ops, imx91_tmu_runtime_suspend, + imx91_tmu_runtime_resume, NULL); + +static const struct of_device_id imx91_tmu_table[] = { + { .compatible = "fsl,imx91-tmu", }, + { }, +}; +MODULE_DEVICE_TABLE(of, imx91_tmu_table); + +static struct platform_driver imx91_tmu = { + .driver = { + .name = "imx91_thermal", + .pm = pm_ptr(&imx91_tmu_pm_ops), + .of_match_table = imx91_tmu_table, + }, + .probe = imx91_tmu_probe, +}; +module_platform_driver(imx91_tmu); + +MODULE_AUTHOR("Peng Fan <peng.fan@nxp.com>"); +MODULE_DESCRIPTION("i.MX91 Thermal Monitor Unit driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index e0268fac7093..347c59bc87d6 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -44,7 +44,8 @@ config INTEL_SOC_DTS_IOSF_CORE config INTEL_SOC_DTS_THERMAL tristate "Intel SoCs DTS thermal driver" - depends on X86 && PCI && ACPI + depends on X86_64 && PCI && ACPI && NET + select INT340X_THERMAL select INTEL_SOC_DTS_IOSF_CORE help Enable this to register Intel SoCs (e.g. Bay Trail) platform digital diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 908cc1bf57f1..41d3bc3ed8a2 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -16,6 +16,8 @@ #define INT3400_ODVP_CHANGED 0x88 #define INT3400_KEEP_ALIVE 0xA0 #define INT3400_FAKE_TEMP (20 * 1000) /* faked temp sensor with 20C */ +/* UUID prefix length for comparison - sufficient for all UUIDs */ +#define INT3400_UUID_PREFIX_LEN 7 enum int3400_thermal_uuid { INT3400_THERMAL_ACTIVE = 0, @@ -112,7 +114,7 @@ static ssize_t available_uuids_show(struct device *dev, int length = 0; if (!priv->uuid_bitmap) - return sprintf(buf, "UNKNOWN\n"); + return sysfs_emit(buf, "UNKNOWN\n"); for (i = 0; i < INT3400_THERMAL_MAXIMUM_UUID; i++) { if (priv->uuid_bitmap & (1 << i)) @@ -129,7 +131,7 @@ static ssize_t current_uuid_show(struct device *dev, int i, length = 0; if (priv->current_uuid_index >= 0) - return sprintf(buf, "%s\n", + return sysfs_emit(buf, "%s\n", int3400_thermal_uuids[priv->current_uuid_index]); for (i = 0; i <= INT3400_THERMAL_CRITICAL; i++) { @@ -140,7 +142,7 @@ static ssize_t current_uuid_show(struct device *dev, if (length) return length; - return sprintf(buf, "INVALID\n"); + return sysfs_emit(buf, "INVALID\n"); } static int int3400_thermal_run_osc(acpi_handle handle, char *uuid_str, int *enable) @@ -199,7 +201,7 @@ static ssize_t current_uuid_store(struct device *dev, for (i = 0; i < INT3400_THERMAL_MAXIMUM_UUID; ++i) { if (!strncmp(buf, int3400_thermal_uuids[i], - sizeof(int3400_thermal_uuids[i]) - 1)) { + INT3400_UUID_PREFIX_LEN)) { /* * If we have a list of supported UUIDs, make sure * this one is supported. 
@@ -340,7 +342,7 @@ static ssize_t odvp_show(struct device *dev, struct device_attribute *attr, odvp_attr = container_of(attr, struct odvp_attr, attr); - return sprintf(buf, "%d\n", odvp_attr->priv->odvp[odvp_attr->odvp]); + return sysfs_emit(buf, "%d\n", odvp_attr->priv->odvp[odvp_attr->odvp]); } static void cleanup_odvp(struct int3400_thermal_priv *priv) @@ -691,6 +693,7 @@ static const struct acpi_device_id int3400_thermal_match[] = { {"INTC10A0", 0}, {"INTC10D4", 0}, {"INTC10FC", 0}, + {"INTC10F3", 0}, {} }; diff --git a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c index ba63796761eb..264c9bc8e645 100644 --- a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c @@ -277,6 +277,7 @@ static const struct acpi_device_id int3403_device_ids[] = { {"INTC10A1", 0}, {"INTC10D5", 0}, {"INTC10FD", 0}, + {"INTC10F4", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3403_device_ids); diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h index 30760475102f..b79937a386ec 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h @@ -27,6 +27,8 @@ #define PCI_DEVICE_ID_INTEL_JSL_THERMAL 0x4E03 #define PCI_DEVICE_ID_INTEL_LNLM_THERMAL 0x641D #define PCI_DEVICE_ID_INTEL_MTLP_THERMAL 0x7D03 +#define PCI_DEVICE_ID_INTEL_NVL_H_THERMAL 0xD703 +#define PCI_DEVICE_ID_INTEL_NVL_S_THERMAL 0xAD03 #define PCI_DEVICE_ID_INTEL_RPL_THERMAL 0xA71D #define PCI_DEVICE_ID_INTEL_SKL_THERMAL 0x1903 #define PCI_DEVICE_ID_INTEL_TGL_THERMAL 0x9A03 diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index e2471768d355..0d4dcc66e097 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -504,6 +504,16 @@ static const struct pci_device_id proc_thermal_pci_ids[] = { PROC_THERMAL_FEATURE_RAPL | PROC_THERMAL_FEATURE_DLVR | PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_WT_HINT | PROC_THERMAL_FEATURE_POWER_FLOOR | PROC_THERMAL_FEATURE_PTC) }, + { PCI_DEVICE_DATA(INTEL, NVL_H_THERMAL, PROC_THERMAL_FEATURE_RAPL | + PROC_THERMAL_FEATURE_DLVR | PROC_THERMAL_FEATURE_DVFS | + PROC_THERMAL_FEATURE_MSI_SUPPORT | PROC_THERMAL_FEATURE_WT_HINT | + PROC_THERMAL_FEATURE_POWER_FLOOR | PROC_THERMAL_FEATURE_PTC | + PROC_THERMAL_FEATURE_SOC_POWER_SLIDER) }, + { PCI_DEVICE_DATA(INTEL, NVL_S_THERMAL, PROC_THERMAL_FEATURE_RAPL | + PROC_THERMAL_FEATURE_DLVR | PROC_THERMAL_FEATURE_DVFS | + PROC_THERMAL_FEATURE_MSI_SUPPORT | PROC_THERMAL_FEATURE_WT_HINT | + PROC_THERMAL_FEATURE_POWER_FLOOR | PROC_THERMAL_FEATURE_PTC | + PROC_THERMAL_FEATURE_SOC_POWER_SLIDER) }, { }, }; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index bde2cc386afd..bf51a17c5be6 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -19,7 +19,7 @@ static const struct rapl_mmio_regs rapl_mmio_default = { .limits[RAPL_DOMAIN_DRAM] = BIT(POWER_LIMIT2), }; -static int rapl_mmio_read_raw(int cpu, struct reg_action *ra) +static int rapl_mmio_read_raw(int cpu, struct reg_action *ra, bool atomic) { if 
(!ra->reg.mmio) return -EINVAL; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c index 1f3d22b659db..589a3a71f0c4 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c @@ -87,6 +87,17 @@ static const struct mapping_table lnl_dlvr_mapping[] = { {NULL, 0, NULL}, }; +static const struct mmio_reg nvl_dlvr_mmio_regs[] = { + { 0, 0x19208, 5, 0x1F, 0}, /* dlvr_spread_spectrum_pct */ + { 0, 0x19208, 1, 0x1, 5}, /* dlvr_control_mode */ + { 0, 0x19208, 1, 0x1, 6}, /* dlvr_control_lock */ + { 0, 0x19208, 1, 0x1, 7}, /* dlvr_rfim_enable */ + { 0, 0x19208, 12, 0xFFF, 8}, /* dlvr_freq_select */ + { 1, 0x19210, 2, 0x3, 30}, /* dlvr_hardware_rev */ + { 1, 0x19210, 16, 0xFFFF, 0}, /* dlvr_freq_mhz */ + { 1, 0x19210, 1, 0x1, 16}, /* dlvr_pll_busy */ +}; + static int match_mapping_table(const struct mapping_table *table, const char *attr_name, bool match_int_value, const u32 value, const char *value_str, char **result_str, u32 *result_int) @@ -446,6 +457,10 @@ int proc_thermal_rfim_add(struct pci_dev *pdev, struct proc_thermal_device *proc dlvr_mmio_regs_table = lnl_dlvr_mmio_regs; dlvr_mapping = lnl_dlvr_mapping; break; + case PCI_DEVICE_ID_INTEL_NVL_H_THERMAL: + case PCI_DEVICE_ID_INTEL_NVL_S_THERMAL: + dlvr_mmio_regs_table = nvl_dlvr_mmio_regs; + break; default: dlvr_mmio_regs_table = dlvr_mmio_regs; break; diff --git a/drivers/thermal/renesas/rcar_gen3_thermal.c b/drivers/thermal/renesas/rcar_gen3_thermal.c index 3223de238d01..94804816e9e1 100644 --- a/drivers/thermal/renesas/rcar_gen3_thermal.c +++ b/drivers/thermal/renesas/rcar_gen3_thermal.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * R-Car Gen3 THS thermal sensor driver + * R-Car Gen3, Gen4 and RZ/G2 THS thermal sensor driver * Based on rcar_thermal.c and work from Hien Dang and Khiem Nguyen. * * Copyright (C) 2016 Renesas Electronics Corporation. 
@@ -601,7 +601,7 @@ error_unregister: return ret; } -static int __maybe_unused rcar_gen3_thermal_resume(struct device *dev) +static int rcar_gen3_thermal_resume(struct device *dev) { struct rcar_gen3_thermal_priv *priv = dev_get_drvdata(dev); unsigned int i; @@ -615,13 +615,13 @@ static int __maybe_unused rcar_gen3_thermal_resume(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(rcar_gen3_thermal_pm_ops, NULL, - rcar_gen3_thermal_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(rcar_gen3_thermal_pm_ops, NULL, + rcar_gen3_thermal_resume); static struct platform_driver rcar_gen3_thermal_driver = { .driver = { .name = "rcar_gen3_thermal", - .pm = &rcar_gen3_thermal_pm_ops, + .pm = pm_sleep_ptr(&rcar_gen3_thermal_pm_ops), .of_match_table = rcar_gen3_thermal_dt_ids, }, .probe = rcar_gen3_thermal_probe, diff --git a/drivers/thermal/renesas/rcar_thermal.c b/drivers/thermal/renesas/rcar_thermal.c index fdd7afdc4ff6..6e5dcac5d47a 100644 --- a/drivers/thermal/renesas/rcar_thermal.c +++ b/drivers/thermal/renesas/rcar_thermal.c @@ -534,7 +534,6 @@ error_unregister: return ret; } -#ifdef CONFIG_PM_SLEEP static int rcar_thermal_suspend(struct device *dev) { struct rcar_thermal_common *common = dev_get_drvdata(dev); @@ -567,15 +566,14 @@ static int rcar_thermal_resume(struct device *dev) return 0; } -#endif -static SIMPLE_DEV_PM_OPS(rcar_thermal_pm_ops, rcar_thermal_suspend, - rcar_thermal_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(rcar_thermal_pm_ops, rcar_thermal_suspend, + rcar_thermal_resume); static struct platform_driver rcar_thermal_driver = { .driver = { .name = "rcar_thermal", - .pm = &rcar_thermal_pm_ops, + .pm = pm_sleep_ptr(&rcar_thermal_pm_ops), .of_match_table = rcar_thermal_dt_ids, }, .probe = rcar_thermal_probe, diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index ea3cab99c5d4..5d6dba681e50 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -1748,6 +1748,7 @@ sl811h_suspend(struct platform_device *dev, pm_message_t state) break; case PM_EVENT_SUSPEND: case PM_EVENT_HIBERNATE: + case PM_EVENT_POWEROFF: case PM_EVENT_PRETHAW: /* explicitly discard hw state */ port_power(sl811, 0); break; diff --git a/drivers/watchdog/diag288_wdt.c b/drivers/watchdog/diag288_wdt.c index 887d5a6c155b..9daed2758ae5 100644 --- a/drivers/watchdog/diag288_wdt.c +++ b/drivers/watchdog/diag288_wdt.c @@ -18,8 +18,7 @@ * */ -#define KMSG_COMPONENT "diag288_wdt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "diag288_wdt: " fmt #include <linux/init.h> #include <linux/kernel.h> diff --git a/include/crypto/blake2b.h b/include/crypto/blake2b.h index dd7694477e50..3bc37fd103a7 100644 --- a/include/crypto/blake2b.h +++ b/include/crypto/blake2b.h @@ -7,20 +7,10 @@ #include <linux/types.h> #include <linux/string.h> -struct blake2b_state { - /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ - u64 h[8]; - u64 t[2]; - /* The true state ends here. The rest is temporary storage. 
*/ - u64 f[2]; -}; - enum blake2b_lengths { BLAKE2B_BLOCK_SIZE = 128, BLAKE2B_HASH_SIZE = 64, BLAKE2B_KEY_SIZE = 64, - BLAKE2B_STATE_SIZE = offsetof(struct blake2b_state, f), - BLAKE2B_DESC_SIZE = sizeof(struct blake2b_state), BLAKE2B_160_HASH_SIZE = 20, BLAKE2B_256_HASH_SIZE = 32, @@ -28,6 +18,25 @@ enum blake2b_lengths { BLAKE2B_512_HASH_SIZE = 64, }; +/** + * struct blake2b_ctx - Context for hashing a message with BLAKE2b + * @h: compression function state + * @t: block counter + * @f: finalization indicator + * @buf: partial block buffer; 'buflen' bytes are valid + * @buflen: number of bytes buffered in @buf + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + */ +struct blake2b_ctx { + /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ + u64 h[8]; + u64 t[2]; + u64 f[2]; + u8 buf[BLAKE2B_BLOCK_SIZE]; + unsigned int buflen; + unsigned int outlen; +}; + enum blake2b_iv { BLAKE2B_IV0 = 0x6A09E667F3BCC908ULL, BLAKE2B_IV1 = 0xBB67AE8584CAA73BULL, @@ -39,19 +48,109 @@ enum blake2b_iv { BLAKE2B_IV7 = 0x5BE0CD19137E2179ULL, }; -static inline void __blake2b_init(struct blake2b_state *state, size_t outlen, - size_t keylen) +static inline void __blake2b_init(struct blake2b_ctx *ctx, size_t outlen, + const void *key, size_t keylen) +{ + ctx->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); + ctx->h[1] = BLAKE2B_IV1; + ctx->h[2] = BLAKE2B_IV2; + ctx->h[3] = BLAKE2B_IV3; + ctx->h[4] = BLAKE2B_IV4; + ctx->h[5] = BLAKE2B_IV5; + ctx->h[6] = BLAKE2B_IV6; + ctx->h[7] = BLAKE2B_IV7; + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->f[0] = 0; + ctx->f[1] = 0; + ctx->buflen = 0; + ctx->outlen = outlen; + if (keylen) { + memcpy(ctx->buf, key, keylen); + memset(&ctx->buf[keylen], 0, BLAKE2B_BLOCK_SIZE - keylen); + ctx->buflen = BLAKE2B_BLOCK_SIZE; + } +} + +/** + * blake2b_init() - Initialize a BLAKE2b context for a new message (unkeyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * + * Context: Any context. + */ +static inline void blake2b_init(struct blake2b_ctx *ctx, size_t outlen) { - state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); - state->h[1] = BLAKE2B_IV1; - state->h[2] = BLAKE2B_IV2; - state->h[3] = BLAKE2B_IV3; - state->h[4] = BLAKE2B_IV4; - state->h[5] = BLAKE2B_IV5; - state->h[6] = BLAKE2B_IV6; - state->h[7] = BLAKE2B_IV7; - state->t[0] = 0; - state->t[1] = 0; + __blake2b_init(ctx, outlen, NULL, 0); +} + +/** + * blake2b_init_key() - Initialize a BLAKE2b context for a new message (keyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * @key: the key + * @keylen: the key length in bytes, at most BLAKE2B_KEY_SIZE + * + * Context: Any context. + */ +static inline void blake2b_init_key(struct blake2b_ctx *ctx, size_t outlen, + const void *key, size_t keylen) +{ + WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2B_HASH_SIZE || + !key || !keylen || keylen > BLAKE2B_KEY_SIZE)); + + __blake2b_init(ctx, outlen, key, keylen); +} + +/** + * blake2b_update() - Update a BLAKE2b context with message data + * @ctx: the context to update; must have been initialized + * @in: the message data + * @inlen: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. 
+ */ +void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen); + +/** + * blake2b_final() - Finish computing a BLAKE2b hash + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting BLAKE2b hash. Its length will be equal to the + * @outlen that was passed to blake2b_init() or blake2b_init_key(). + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void blake2b_final(struct blake2b_ctx *ctx, u8 *out); + +/** + * blake2b() - Compute BLAKE2b hash in one shot + * @key: the key, or NULL for an unkeyed hash + * @keylen: the key length in bytes (at most BLAKE2B_KEY_SIZE), or 0 for an + * unkeyed hash + * @in: the message data + * @inlen: the data length in bytes + * @out: (output) the resulting BLAKE2b hash, with length @outlen + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * + * Context: Any context. + */ +static inline void blake2b(const u8 *key, size_t keylen, + const u8 *in, size_t inlen, + u8 *out, size_t outlen) +{ + struct blake2b_ctx ctx; + + WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || + outlen > BLAKE2B_HASH_SIZE || keylen > BLAKE2B_KEY_SIZE || + (!key && keylen))); + + __blake2b_init(&ctx, outlen, key, keylen); + blake2b_update(&ctx, in, inlen); + blake2b_final(&ctx, out); } #endif /* _CRYPTO_BLAKE2B_H */ diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index f9ffd39194eb..648cb7824358 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -22,7 +22,16 @@ enum blake2s_lengths { BLAKE2S_256_HASH_SIZE = 32, }; -struct blake2s_state { +/** + * struct blake2s_ctx - Context for hashing a message with BLAKE2s + * @h: compression function state + * @t: block counter + * @f: finalization indicator + * @buf: partial block buffer; 'buflen' bytes are valid + * @buflen: number of bytes buffered in @buf + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + */ +struct blake2s_ctx { /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. 
*/ u32 h[8]; u32 t[2]; @@ -43,62 +52,109 @@ enum blake2s_iv { BLAKE2S_IV7 = 0x5BE0CD19UL, }; -static inline void __blake2s_init(struct blake2s_state *state, size_t outlen, +static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen, const void *key, size_t keylen) { - state->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen); - state->h[1] = BLAKE2S_IV1; - state->h[2] = BLAKE2S_IV2; - state->h[3] = BLAKE2S_IV3; - state->h[4] = BLAKE2S_IV4; - state->h[5] = BLAKE2S_IV5; - state->h[6] = BLAKE2S_IV6; - state->h[7] = BLAKE2S_IV7; - state->t[0] = 0; - state->t[1] = 0; - state->f[0] = 0; - state->f[1] = 0; - state->buflen = 0; - state->outlen = outlen; + ctx->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen); + ctx->h[1] = BLAKE2S_IV1; + ctx->h[2] = BLAKE2S_IV2; + ctx->h[3] = BLAKE2S_IV3; + ctx->h[4] = BLAKE2S_IV4; + ctx->h[5] = BLAKE2S_IV5; + ctx->h[6] = BLAKE2S_IV6; + ctx->h[7] = BLAKE2S_IV7; + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->f[0] = 0; + ctx->f[1] = 0; + ctx->buflen = 0; + ctx->outlen = outlen; if (keylen) { - memcpy(state->buf, key, keylen); - memset(&state->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen); - state->buflen = BLAKE2S_BLOCK_SIZE; + memcpy(ctx->buf, key, keylen); + memset(&ctx->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen); + ctx->buflen = BLAKE2S_BLOCK_SIZE; } } -static inline void blake2s_init(struct blake2s_state *state, - const size_t outlen) +/** + * blake2s_init() - Initialize a BLAKE2s context for a new message (unkeyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * + * Context: Any context. + */ +static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen) { - __blake2s_init(state, outlen, NULL, 0); + __blake2s_init(ctx, outlen, NULL, 0); } -static inline void blake2s_init_key(struct blake2s_state *state, - const size_t outlen, const void *key, - const size_t keylen) +/** + * blake2s_init_key() - Initialize a BLAKE2s context for a new message (keyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * @key: the key + * @keylen: the key length in bytes, at most BLAKE2S_KEY_SIZE + * + * Context: Any context. + */ +static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen, + const void *key, size_t keylen) { WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE || !key || !keylen || keylen > BLAKE2S_KEY_SIZE)); - __blake2s_init(state, outlen, key, keylen); + __blake2s_init(ctx, outlen, key, keylen); } -void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen); -void blake2s_final(struct blake2s_state *state, u8 *out); +/** + * blake2s_update() - Update a BLAKE2s context with message data + * @ctx: the context to update; must have been initialized + * @in: the message data + * @inlen: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen); -static inline void blake2s(u8 *out, const u8 *in, const u8 *key, - const size_t outlen, const size_t inlen, - const size_t keylen) +/** + * blake2s_final() - Finish computing a BLAKE2s hash + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting BLAKE2s hash. Its length will be equal to the + * @outlen that was passed to blake2s_init() or blake2s_init_key(). + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. 
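+ *
+ * For a single contiguous buffer, the init/update/final sequence collapses
+ * into the one-shot blake2s() helper below (editor's illustrative sketch;
+ * note the reworked argument order in this series: key first, output last):
+ *
+ *    u8 hash[BLAKE2S_HASH_SIZE];
+ *
+ *    blake2s(key, keylen, in, inlen, hash, sizeof(hash));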
+ * + * Context: Any context. + */ +void blake2s_final(struct blake2s_ctx *ctx, u8 *out); + +/** + * blake2s() - Compute BLAKE2s hash in one shot + * @key: the key, or NULL for an unkeyed hash + * @keylen: the key length in bytes (at most BLAKE2S_KEY_SIZE), or 0 for an + * unkeyed hash + * @in: the message data + * @inlen: the data length in bytes + * @out: (output) the resulting BLAKE2s hash, with length @outlen + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * + * Context: Any context. + */ +static inline void blake2s(const u8 *key, size_t keylen, + const u8 *in, size_t inlen, + u8 *out, size_t outlen) { - struct blake2s_state state; + struct blake2s_ctx ctx; WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE || (!key && keylen))); - __blake2s_init(&state, outlen, key, keylen); - blake2s_update(&state, in, inlen); - blake2s_final(&state, out); + __blake2s_init(&ctx, outlen, key, keylen); + blake2s_update(&ctx, in, inlen); + blake2s_final(&ctx, out); } #endif /* _CRYPTO_BLAKE2S_H */ diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index 38e26dff27b0..1cc301a48469 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -38,18 +38,18 @@ struct chacha_state { }; void chacha_block_generic(struct chacha_state *state, - u8 out[CHACHA_BLOCK_SIZE], int nrounds); + u8 out[at_least CHACHA_BLOCK_SIZE], int nrounds); static inline void chacha20_block(struct chacha_state *state, - u8 out[CHACHA_BLOCK_SIZE]) + u8 out[at_least CHACHA_BLOCK_SIZE]) { chacha_block_generic(state, out, 20); } void hchacha_block_generic(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); + u32 out[at_least HCHACHA_OUT_WORDS], int nrounds); void hchacha_block(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); + u32 out[at_least HCHACHA_OUT_WORDS], int nrounds); enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, @@ -67,8 +67,8 @@ static inline void chacha_init_consts(struct chacha_state *state) } static inline void chacha_init(struct chacha_state *state, - const u32 key[CHACHA_KEY_WORDS], - const u8 iv[CHACHA_IV_SIZE]) + const u32 key[at_least CHACHA_KEY_WORDS], + const u8 iv[at_least CHACHA_IV_SIZE]) { chacha_init_consts(state); state->x[4] = key[0]; diff --git a/include/crypto/chacha20poly1305.h b/include/crypto/chacha20poly1305.h index d2ac3ff7dc1e..0f71b037702d 100644 --- a/include/crypto/chacha20poly1305.h +++ b/include/crypto/chacha20poly1305.h @@ -18,32 +18,33 @@ enum chacha20poly1305_lengths { void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool __must_check chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, - const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE], + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool __must_check xchacha20poly1305_decrypt( - u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, - const size_t ad_len, const u8 
nonce[XCHACHA20POLY1305_NONCE_SIZE], - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE], + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool chacha20poly1305_selftest(void); diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h index db63a5577c00..2362b48f8741 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -13,24 +13,28 @@ enum curve25519_lengths { CURVE25519_KEY_SIZE = 32 }; -void curve25519_generic(u8 out[CURVE25519_KEY_SIZE], - const u8 scalar[CURVE25519_KEY_SIZE], - const u8 point[CURVE25519_KEY_SIZE]); +void curve25519_generic(u8 out[at_least CURVE25519_KEY_SIZE], + const u8 scalar[at_least CURVE25519_KEY_SIZE], + const u8 point[at_least CURVE25519_KEY_SIZE]); -bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]); +bool __must_check +curve25519(u8 mypublic[at_least CURVE25519_KEY_SIZE], + const u8 secret[at_least CURVE25519_KEY_SIZE], + const u8 basepoint[at_least CURVE25519_KEY_SIZE]); -bool __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]); +bool __must_check +curve25519_generate_public(u8 pub[at_least CURVE25519_KEY_SIZE], + const u8 secret[at_least CURVE25519_KEY_SIZE]); -static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE]) +static inline void +curve25519_clamp_secret(u8 secret[at_least CURVE25519_KEY_SIZE]) { secret[0] &= 248; secret[31] = (secret[31] & 127) | 64; } -static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE]) +static inline void +curve25519_generate_secret(u8 secret[at_least CURVE25519_KEY_SIZE]) { get_random_bytes_wait(secret, CURVE25519_KEY_SIZE); curve25519_clamp_secret(secret); diff --git a/include/crypto/internal/blake2b.h b/include/crypto/internal/blake2b.h deleted file mode 100644 index 3e09e2485306..000000000000 --- a/include/crypto/internal/blake2b.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Helper functions for BLAKE2b implementations. - * Keep this in sync with the corresponding BLAKE2s header. 
- */ - -#ifndef _CRYPTO_INTERNAL_BLAKE2B_H -#define _CRYPTO_INTERNAL_BLAKE2B_H - -#include <asm/byteorder.h> -#include <crypto/blake2b.h> -#include <crypto/internal/hash.h> -#include <linux/array_size.h> -#include <linux/compiler.h> -#include <linux/build_bug.h> -#include <linux/errno.h> -#include <linux/math.h> -#include <linux/string.h> -#include <linux/types.h> - -static inline void blake2b_set_lastblock(struct blake2b_state *state) -{ - state->f[0] = -1; - state->f[1] = 0; -} - -static inline void blake2b_set_nonlast(struct blake2b_state *state) -{ - state->f[0] = 0; - state->f[1] = 0; -} - -typedef void (*blake2b_compress_t)(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc); - -/* Helper functions for shash implementations of BLAKE2b */ - -struct blake2b_tfm_ctx { - u8 key[BLAKE2B_BLOCK_SIZE]; - unsigned int keylen; -}; - -static inline int crypto_blake2b_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm); - - if (keylen > BLAKE2B_KEY_SIZE) - return -EINVAL; - - BUILD_BUG_ON(BLAKE2B_KEY_SIZE > BLAKE2B_BLOCK_SIZE); - - memcpy(tctx->key, key, keylen); - memset(tctx->key + keylen, 0, BLAKE2B_BLOCK_SIZE - keylen); - tctx->keylen = keylen; - - return 0; -} - -static inline int crypto_blake2b_init(struct shash_desc *desc) -{ - const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct blake2b_state *state = shash_desc_ctx(desc); - unsigned int outlen = crypto_shash_digestsize(desc->tfm); - - __blake2b_init(state, outlen, tctx->keylen); - return tctx->keylen ? - crypto_shash_update(desc, tctx->key, BLAKE2B_BLOCK_SIZE) : 0; -} - -static inline int crypto_blake2b_update_bo(struct shash_desc *desc, - const u8 *in, unsigned int inlen, - blake2b_compress_t compress) -{ - struct blake2b_state *state = shash_desc_ctx(desc); - - blake2b_set_nonlast(state); - compress(state, in, inlen / BLAKE2B_BLOCK_SIZE, BLAKE2B_BLOCK_SIZE); - return inlen - round_down(inlen, BLAKE2B_BLOCK_SIZE); -} - -static inline int crypto_blake2b_finup(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out, - blake2b_compress_t compress) -{ - struct blake2b_state *state = shash_desc_ctx(desc); - u8 buf[BLAKE2B_BLOCK_SIZE]; - int i; - - memcpy(buf, in, inlen); - memset(buf + inlen, 0, BLAKE2B_BLOCK_SIZE - inlen); - blake2b_set_lastblock(state); - compress(state, buf, 1, inlen); - for (i = 0; i < ARRAY_SIZE(state->h); i++) - __cpu_to_le64s(&state->h[i]); - memcpy(out, state->h, crypto_shash_digestsize(desc->tfm)); - memzero_explicit(buf, sizeof(buf)); - return 0; -} - -#endif /* _CRYPTO_INTERNAL_BLAKE2B_H */ diff --git a/include/crypto/md5.h b/include/crypto/md5.h index c9aa5c3abc53..c47aedfe67ec 100644 --- a/include/crypto/md5.h +++ b/include/crypto/md5.h @@ -76,7 +76,7 @@ void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len); * * Context: Any context. */ -void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); +void md5_final(struct md5_ctx *ctx, u8 out[at_least MD5_DIGEST_SIZE]); /** * md5() - Compute MD5 message digest in one shot @@ -86,7 +86,7 @@ void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); * * Context: Any context. */ -void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]); +void md5(const u8 *data, size_t len, u8 out[at_least MD5_DIGEST_SIZE]); /** * struct hmac_md5_key - Prepared key for HMAC-MD5 @@ -173,7 +173,7 @@ static inline void hmac_md5_update(struct hmac_md5_ctx *ctx, * * Context: Any context. 
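+ *
+ * Editor's note: the at_least annotation used throughout these header hunks
+ * documents the minimum number of array elements a parameter must point to.
+ * Its definition is not shown in this diff; it presumably wraps C's 'static'
+ * array-bound qualifier, so passing a too-small buffer can be diagnosed at
+ * compile time, e.g. (hypothetical warning):
+ *
+ *    u8 buf[8];
+ *
+ *    md5(data, len, buf);  (buf has 8 elements, MD5_DIGEST_SIZE is 16)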
*/ -void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); +void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[at_least MD5_DIGEST_SIZE]); /** * hmac_md5() - Compute HMAC-MD5 in one shot, using a prepared key @@ -187,7 +187,8 @@ void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); * Context: Any context. */ void hmac_md5(const struct hmac_md5_key *key, - const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least MD5_DIGEST_SIZE]); /** * hmac_md5_usingrawkey() - Compute HMAC-MD5 in one shot, using a raw key @@ -204,6 +205,6 @@ void hmac_md5(const struct hmac_md5_key *key, */ void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[MD5_DIGEST_SIZE]); + u8 out[at_least MD5_DIGEST_SIZE]); #endif /* _CRYPTO_MD5_H */ diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index d4daeec8da19..190beb427c6d 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -59,7 +59,7 @@ struct poly1305_desc_ctx { }; void poly1305_init(struct poly1305_desc_ctx *desc, - const u8 key[POLY1305_KEY_SIZE]); + const u8 key[at_least POLY1305_KEY_SIZE]); void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes); void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest); diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h index d2e63743e592..b28b8ef11353 100644 --- a/include/crypto/polyval.h +++ b/include/crypto/polyval.h @@ -1,14 +1,190 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Common values for the Polyval hash algorithm + * POLYVAL library API * - * Copyright 2021 Google LLC + * Copyright 2025 Google LLC */ #ifndef _CRYPTO_POLYVAL_H #define _CRYPTO_POLYVAL_H +#include <linux/string.h> +#include <linux/types.h> + #define POLYVAL_BLOCK_SIZE 16 #define POLYVAL_DIGEST_SIZE 16 +/** + * struct polyval_elem - An element of the POLYVAL finite field + * @bytes: View of the element as a byte array (unioned with @lo and @hi) + * @lo: The low 64 terms of the element's polynomial + * @hi: The high 64 terms of the element's polynomial + * + * This represents an element of the finite field GF(2^128), using the POLYVAL + * convention: little-endian byte order and natural bit order. + */ +struct polyval_elem { + union { + u8 bytes[POLYVAL_BLOCK_SIZE]; + struct { + __le64 lo; + __le64 hi; + }; + }; +}; + +/** + * struct polyval_key - Prepared key for POLYVAL + * + * This may contain just the raw key H, or it may contain precomputed key + * powers, depending on the platform's POLYVAL implementation. Use + * polyval_preparekey() to initialize this. + * + * By H^i we mean H^(i-1) * H * x^-128, with base case H^1 = H. I.e. the + * exponentiation repeats the POLYVAL dot operation, with its "extra" x^-128. + */ +struct polyval_key { +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +#ifdef CONFIG_ARM64 + /** @h_powers: Powers of the hash key H^8 through H^1 */ + struct polyval_elem h_powers[8]; +#elif defined(CONFIG_X86) + /** @h_powers: Powers of the hash key H^8 through H^1 */ + struct polyval_elem h_powers[8]; +#else +#error "Unhandled arch" #endif +#else /* CONFIG_CRYPTO_LIB_POLYVAL_ARCH */ + /** @h: The hash key H */ + struct polyval_elem h; +#endif /* !CONFIG_CRYPTO_LIB_POLYVAL_ARCH */ +}; + +/** + * struct polyval_ctx - Context for computing a POLYVAL value + * @key: Pointer to the prepared POLYVAL key. 
The user of the API is + * responsible for ensuring that the key lives as long as the context. + * @acc: The accumulator + * @partial: Number of data bytes processed so far modulo POLYVAL_BLOCK_SIZE + */ +struct polyval_ctx { + const struct polyval_key *key; + struct polyval_elem acc; + size_t partial; +}; + +/** + * polyval_preparekey() - Prepare a POLYVAL key + * @key: (output) The key structure to initialize + * @raw_key: The raw hash key + * + * Initialize a POLYVAL key structure from a raw key. This may be a simple + * copy, or it may involve precomputing powers of the key, depending on the + * platform's POLYVAL implementation. + * + * Context: Any context. + */ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +void polyval_preparekey(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]); + +#else +static inline void polyval_preparekey(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + /* Just a simple copy, so inline it. */ + memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE); +} +#endif + +/** + * polyval_init() - Initialize a POLYVAL context for a new message + * @ctx: The context to initialize + * @key: The key to use. Note that a pointer to the key is saved in the + * context, so the key must live at least as long as the context. + */ +static inline void polyval_init(struct polyval_ctx *ctx, + const struct polyval_key *key) +{ + *ctx = (struct polyval_ctx){ .key = key }; +} + +/** + * polyval_import_blkaligned() - Import a POLYVAL accumulator value + * @ctx: The context to initialize + * @key: The key to import. Note that a pointer to the key is saved in the + * context, so the key must live at least as long as the context. + * @acc: The accumulator value to import. + * + * This imports an accumulator that was saved by polyval_export_blkaligned(). + * The same key must be used. + */ +static inline void +polyval_import_blkaligned(struct polyval_ctx *ctx, + const struct polyval_key *key, + const struct polyval_elem *acc) +{ + *ctx = (struct polyval_ctx){ .key = key, .acc = *acc }; +} + +/** + * polyval_export_blkaligned() - Export a POLYVAL accumulator value + * @ctx: The context to export the accumulator value from + * @acc: (output) The exported accumulator value + * + * This exports the accumulator from a POLYVAL context. The number of data + * bytes processed so far must be a multiple of POLYVAL_BLOCK_SIZE. + */ +static inline void polyval_export_blkaligned(const struct polyval_ctx *ctx, + struct polyval_elem *acc) +{ + *acc = ctx->acc; +} + +/** + * polyval_update() - Update a POLYVAL context with message data + * @ctx: The context to update; must have been initialized + * @data: The message data + * @len: The data length in bytes. Doesn't need to be block-aligned. + * + * This can be called any number of times. + * + * Context: Any context. + */ +void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len); + +/** + * polyval_final() - Finish computing a POLYVAL value + * @ctx: The context to finalize + * @out: The output value + * + * If the total data length isn't a multiple of POLYVAL_BLOCK_SIZE, then the + * final block is automatically zero-padded. + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE]); + +/** + * polyval() - Compute a POLYVAL value + * @key: The prepared key + * @data: The message data + * @len: The data length in bytes. Doesn't need to be block-aligned. 
+ * @out: The output value + * + * Context: Any context. + */ +static inline void polyval(const struct polyval_key *key, + const u8 *data, size_t len, + u8 out[POLYVAL_BLOCK_SIZE]) +{ + struct polyval_ctx ctx; + + polyval_init(&ctx, key); + polyval_update(&ctx, data, len); + polyval_final(&ctx, out); +} + +#endif /* _CRYPTO_POLYVAL_H */ diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h index 162a529ec841..27f08b972931 100644 --- a/include/crypto/sha1.h +++ b/include/crypto/sha1.h @@ -84,7 +84,7 @@ void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len); * * Context: Any context. */ -void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); +void sha1_final(struct sha1_ctx *ctx, u8 out[at_least SHA1_DIGEST_SIZE]); /** * sha1() - Compute SHA-1 message digest in one shot @@ -94,7 +94,7 @@ void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); * * Context: Any context. */ -void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]); +void sha1(const u8 *data, size_t len, u8 out[at_least SHA1_DIGEST_SIZE]); /** * struct hmac_sha1_key - Prepared key for HMAC-SHA1 @@ -181,7 +181,8 @@ static inline void hmac_sha1_update(struct hmac_sha1_ctx *ctx, * * Context: Any context. */ -void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); +void hmac_sha1_final(struct hmac_sha1_ctx *ctx, + u8 out[at_least SHA1_DIGEST_SIZE]); /** * hmac_sha1() - Compute HMAC-SHA1 in one shot, using a prepared key @@ -195,7 +196,8 @@ void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); * Context: Any context. */ void hmac_sha1(const struct hmac_sha1_key *key, - const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA1_DIGEST_SIZE]); /** * hmac_sha1_usingrawkey() - Compute HMAC-SHA1 in one shot, using a raw key @@ -212,6 +214,6 @@ void hmac_sha1(const struct hmac_sha1_key *key, */ void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA1_DIGEST_SIZE]); + u8 out[at_least SHA1_DIGEST_SIZE]); #endif /* _CRYPTO_SHA1_H */ diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index e5dafb935cc8..7bb8fe169daf 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -190,7 +190,7 @@ static inline void sha224_update(struct sha224_ctx *ctx, * * Context: Any context. */ -void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); +void sha224_final(struct sha224_ctx *ctx, u8 out[at_least SHA224_DIGEST_SIZE]); /** * sha224() - Compute SHA-224 message digest in one shot @@ -200,7 +200,7 @@ void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); * * Context: Any context. */ -void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]); +void sha224(const u8 *data, size_t len, u8 out[at_least SHA224_DIGEST_SIZE]); /** * struct hmac_sha224_key - Prepared key for HMAC-SHA224 @@ -287,7 +287,8 @@ static inline void hmac_sha224_update(struct hmac_sha224_ctx *ctx, * * Context: Any context. */ -void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); +void hmac_sha224_final(struct hmac_sha224_ctx *ctx, + u8 out[at_least SHA224_DIGEST_SIZE]); /** * hmac_sha224() - Compute HMAC-SHA224 in one shot, using a prepared key @@ -301,7 +302,8 @@ void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); * Context: Any context. 
*/ void hmac_sha224(const struct hmac_sha224_key *key, - const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA224_DIGEST_SIZE]); /** * hmac_sha224_usingrawkey() - Compute HMAC-SHA224 in one shot, using a raw key @@ -318,7 +320,7 @@ void hmac_sha224(const struct hmac_sha224_key *key, */ void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA224_DIGEST_SIZE]); + u8 out[at_least SHA224_DIGEST_SIZE]); /** * struct sha256_ctx - Context for hashing a message with SHA-256 @@ -363,7 +365,7 @@ static inline void sha256_update(struct sha256_ctx *ctx, * * Context: Any context. */ -void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); +void sha256_final(struct sha256_ctx *ctx, u8 out[at_least SHA256_DIGEST_SIZE]); /** * sha256() - Compute SHA-256 message digest in one shot @@ -373,7 +375,7 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); * * Context: Any context. */ -void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); +void sha256(const u8 *data, size_t len, u8 out[at_least SHA256_DIGEST_SIZE]); /** * sha256_finup_2x() - Compute two SHA-256 digests from a common initial @@ -390,8 +392,9 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); * Context: Any context. */ void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, - const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], - u8 out2[SHA256_DIGEST_SIZE]); + const u8 *data2, size_t len, + u8 out1[at_least SHA256_DIGEST_SIZE], + u8 out2[at_least SHA256_DIGEST_SIZE]); /** * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real @@ -488,7 +491,8 @@ static inline void hmac_sha256_update(struct hmac_sha256_ctx *ctx, * * Context: Any context. */ -void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); +void hmac_sha256_final(struct hmac_sha256_ctx *ctx, + u8 out[at_least SHA256_DIGEST_SIZE]); /** * hmac_sha256() - Compute HMAC-SHA256 in one shot, using a prepared key @@ -502,7 +506,8 @@ void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); * Context: Any context. */ void hmac_sha256(const struct hmac_sha256_key *key, - const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA256_DIGEST_SIZE]); /** * hmac_sha256_usingrawkey() - Compute HMAC-SHA256 in one shot, using a raw key @@ -519,7 +524,7 @@ void hmac_sha256(const struct hmac_sha256_key *key, */ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA256_DIGEST_SIZE]); + u8 out[at_least SHA256_DIGEST_SIZE]); /* State for the SHA-512 (and SHA-384) compression function */ struct sha512_block_state { @@ -598,7 +603,7 @@ static inline void sha384_update(struct sha384_ctx *ctx, * * Context: Any context. */ -void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); +void sha384_final(struct sha384_ctx *ctx, u8 out[at_least SHA384_DIGEST_SIZE]); /** * sha384() - Compute SHA-384 message digest in one shot @@ -608,7 +613,7 @@ void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); * * Context: Any context. 
*/ -void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]); +void sha384(const u8 *data, size_t len, u8 out[at_least SHA384_DIGEST_SIZE]); /** * struct hmac_sha384_key - Prepared key for HMAC-SHA384 @@ -695,7 +700,8 @@ static inline void hmac_sha384_update(struct hmac_sha384_ctx *ctx, * * Context: Any context. */ -void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); +void hmac_sha384_final(struct hmac_sha384_ctx *ctx, + u8 out[at_least SHA384_DIGEST_SIZE]); /** * hmac_sha384() - Compute HMAC-SHA384 in one shot, using a prepared key @@ -709,7 +715,8 @@ void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); * Context: Any context. */ void hmac_sha384(const struct hmac_sha384_key *key, - const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA384_DIGEST_SIZE]); /** * hmac_sha384_usingrawkey() - Compute HMAC-SHA384 in one shot, using a raw key @@ -726,7 +733,7 @@ void hmac_sha384(const struct hmac_sha384_key *key, */ void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA384_DIGEST_SIZE]); + u8 out[at_least SHA384_DIGEST_SIZE]); /** * struct sha512_ctx - Context for hashing a message with SHA-512 @@ -771,7 +778,7 @@ static inline void sha512_update(struct sha512_ctx *ctx, * * Context: Any context. */ -void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); +void sha512_final(struct sha512_ctx *ctx, u8 out[at_least SHA512_DIGEST_SIZE]); /** * sha512() - Compute SHA-512 message digest in one shot @@ -781,7 +788,7 @@ void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); * * Context: Any context. */ -void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]); +void sha512(const u8 *data, size_t len, u8 out[at_least SHA512_DIGEST_SIZE]); /** * struct hmac_sha512_key - Prepared key for HMAC-SHA512 @@ -868,7 +875,8 @@ static inline void hmac_sha512_update(struct hmac_sha512_ctx *ctx, * * Context: Any context. */ -void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); +void hmac_sha512_final(struct hmac_sha512_ctx *ctx, + u8 out[at_least SHA512_DIGEST_SIZE]); /** * hmac_sha512() - Compute HMAC-SHA512 in one shot, using a prepared key @@ -882,7 +890,8 @@ void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); * Context: Any context. 
*/ void hmac_sha512(const struct hmac_sha512_key *key, - const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA512_DIGEST_SIZE]); /** * hmac_sha512_usingrawkey() - Compute HMAC-SHA512 in one shot, using a raw key @@ -899,6 +908,6 @@ void hmac_sha512(const struct hmac_sha512_key *key, */ void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA512_DIGEST_SIZE]); + u8 out[at_least SHA512_DIGEST_SIZE]); #endif /* _CRYPTO_SHA2_H */ diff --git a/include/crypto/sha3.h b/include/crypto/sha3.h index 41e1b83a6d91..c9e4182ff74f 100644 --- a/include/crypto/sha3.h +++ b/include/crypto/sha3.h @@ -1,11 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Common values for SHA-3 algorithms + * + * See also Documentation/crypto/sha3.rst */ #ifndef __CRYPTO_SHA3_H__ #define __CRYPTO_SHA3_H__ #include <linux/types.h> +#include <linux/string.h> #define SHA3_224_DIGEST_SIZE (224 / 8) #define SHA3_224_BLOCK_SIZE (200 - 2 * SHA3_224_DIGEST_SIZE) @@ -23,14 +26,321 @@ #define SHA3_512_BLOCK_SIZE (200 - 2 * SHA3_512_DIGEST_SIZE) #define SHA3_512_EXPORT_SIZE SHA3_STATE_SIZE + SHA3_512_BLOCK_SIZE + 1 -#define SHA3_STATE_SIZE 200 +/* + * SHAKE128 and SHAKE256 actually have variable output size, but this is used to + * calculate the block size (rate) analogously to the above. + */ +#define SHAKE128_DEFAULT_SIZE (128 / 8) +#define SHAKE128_BLOCK_SIZE (200 - 2 * SHAKE128_DEFAULT_SIZE) +#define SHAKE256_DEFAULT_SIZE (256 / 8) +#define SHAKE256_BLOCK_SIZE (200 - 2 * SHAKE256_DEFAULT_SIZE) -struct shash_desc; +#define SHA3_STATE_SIZE 200 +/* + * State for the Keccak-f[1600] permutation: 25 64-bit words. + * + * We usually keep the state words as little-endian, to make absorbing and + * squeezing easier. (It means that absorbing and squeezing can just treat the + * state as a byte array.) The state words are converted to native-endian only + * temporarily by implementations of the permutation that need native-endian + * words. Of course, that conversion is a no-op on little-endian machines. + */ struct sha3_state { - u64 st[SHA3_STATE_SIZE / 8]; + union { + __le64 words[SHA3_STATE_SIZE / 8]; + u8 bytes[SHA3_STATE_SIZE]; + + u64 native_words[SHA3_STATE_SIZE / 8]; /* see comment above */ + }; +}; + +/* Internal context, shared by the digests (SHA3-*) and the XOFs (SHAKE*) */ +struct __sha3_ctx { + struct sha3_state state; + u8 digest_size; /* Digests only: the digest size in bytes */ + u8 block_size; /* Block size in bytes */ + u8 absorb_offset; /* Index of next state byte to absorb into */ + u8 squeeze_offset; /* XOFs only: index of next state byte to extract */ +}; + +void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len); + +/** + * struct sha3_ctx - Context for SHA3-224, SHA3-256, SHA3-384, or SHA3-512 + * @ctx: private + */ +struct sha3_ctx { + struct __sha3_ctx ctx; }; -int crypto_sha3_init(struct shash_desc *desc); +/** + * sha3_zeroize_ctx() - Zeroize a SHA-3 context + * @ctx: The context to zeroize + * + * This is already called by sha3_final(). Call this explicitly when abandoning + * a context without calling sha3_final(). 
+ */ +static inline void sha3_zeroize_ctx(struct sha3_ctx *ctx) +{ + memzero_explicit(ctx, sizeof(*ctx)); +} + +/** + * struct shake_ctx - Context for SHAKE128 or SHAKE256 + * @ctx: private + */ +struct shake_ctx { + struct __sha3_ctx ctx; +}; + +/** + * shake_zeroize_ctx() - Zeroize a SHAKE context + * @ctx: The context to zeroize + * + * Call this after the last squeeze. + */ +static inline void shake_zeroize_ctx(struct shake_ctx *ctx) +{ + memzero_explicit(ctx, sizeof(*ctx)); +} + +/** + * sha3_224_init() - Initialize a context for SHA3-224 + * @ctx: The context to initialize + * + * This begins a new SHA3-224 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_224_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_224_DIGEST_SIZE, + .ctx.block_size = SHA3_224_BLOCK_SIZE, + }; +} + +/** + * sha3_256_init() - Initialize a context for SHA3-256 + * @ctx: The context to initialize + * + * This begins a new SHA3-256 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_256_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_256_DIGEST_SIZE, + .ctx.block_size = SHA3_256_BLOCK_SIZE, + }; +} + +/** + * sha3_384_init() - Initialize a context for SHA3-384 + * @ctx: The context to initialize + * + * This begins a new SHA3-384 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_384_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_384_DIGEST_SIZE, + .ctx.block_size = SHA3_384_BLOCK_SIZE, + }; +} + +/** + * sha3_512_init() - Initialize a context for SHA3-512 + * @ctx: The context to initialize + * + * This begins a new SHA3-512 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_512_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_512_DIGEST_SIZE, + .ctx.block_size = SHA3_512_BLOCK_SIZE, + }; +} + +/** + * sha3_update() - Update a SHA-3 digest context with input data + * @ctx: The context to update; must have been initialized + * @in: The input data + * @in_len: Length of the input data in bytes + * + * This can be called any number of times to add data to a SHA3-224, SHA3-256, + * SHA3-384, or SHA3-512 digest (depending on which init function was called). + * + * Context: Any context. + */ +static inline void sha3_update(struct sha3_ctx *ctx, + const u8 *in, size_t in_len) +{ + __sha3_update(&ctx->ctx, in, in_len); +} + +/** + * sha3_final() - Finish computing a SHA-3 message digest + * @ctx: The context to finalize; must have been initialized + * @out: (output) The resulting SHA3-224, SHA3-256, SHA3-384, or SHA3-512 + * message digest, matching the init function that was called. Note that + * the size differs for each one; see SHA3_*_DIGEST_SIZE. + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void sha3_final(struct sha3_ctx *ctx, u8 *out); + +/** + * shake128_init() - Initialize a context for SHAKE128 + * @ctx: The context to initialize + * + * This begins a new SHAKE128 extendable-output function (XOF) computation. + * + * Context: Any context. 
+ */ +static inline void shake128_init(struct shake_ctx *ctx) +{ + *ctx = (struct shake_ctx){ + .ctx.block_size = SHAKE128_BLOCK_SIZE, + }; +} + +/** + * shake256_init() - Initialize a context for SHAKE256 + * @ctx: The context to initialize + * + * This begins a new SHAKE256 extendable-output function (XOF) computation. + * + * Context: Any context. + */ +static inline void shake256_init(struct shake_ctx *ctx) +{ + *ctx = (struct shake_ctx){ + .ctx.block_size = SHAKE256_BLOCK_SIZE, + }; +} + +/** + * shake_update() - Update a SHAKE context with input data + * @ctx: The context to update; must have been initialized + * @in: The input data + * @in_len: Length of the input data in bytes + * + * This can be called any number of times to add more input data to SHAKE128 or + * SHAKE256. This cannot be called after squeezing has begun. + * + * Context: Any context. + */ +static inline void shake_update(struct shake_ctx *ctx, + const u8 *in, size_t in_len) +{ + __sha3_update(&ctx->ctx, in, in_len); +} + +/** + * shake_squeeze() - Generate output from SHAKE128 or SHAKE256 + * @ctx: The context to squeeze; must have been initialized + * @out: Where to write the resulting output data + * @out_len: The amount of data to extract to @out in bytes + * + * This may be called multiple times. A number of consecutive squeezes laid + * end-to-end will yield the same output as one big squeeze generating the same + * total amount of output. More input cannot be provided after squeezing has + * begun. After the last squeeze, call shake_zeroize_ctx(). + * + * Context: Any context. + */ +void shake_squeeze(struct shake_ctx *ctx, u8 *out, size_t out_len); + +/** + * sha3_224() - Compute SHA3-224 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-224 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]); + +/** + * sha3_256() - Compute SHA3-256 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-256 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]); + +/** + * sha3_384() - Compute SHA3-384 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-384 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]); + +/** + * sha3_512() - Compute SHA3-512 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-512 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. 
+ */ +void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]); + +/** + * shake128() - Compute SHAKE128 in one shot + * @in: The input data to be used + * @in_len: Length of the input data in bytes + * @out: The buffer into which the output will be stored + * @out_len: Length of the output to produce in bytes + * + * Convenience function that computes SHAKE128 in one shot. Use this instead of + * the incremental API if you're able to provide all the input at once as well + * as receive all the output at once. All output lengths are supported. + * + * Context: Any context. + */ +void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len); + +/** + * shake256() - Compute SHAKE256 in one shot + * @in: The input data to be used + * @in_len: Length of the input data in bytes + * @out: The buffer into which the output will be stored + * @out_len: Length of the output to produce in bytes + * + * Convenience function that computes SHAKE256 in one shot. Use this instead of + * the incremental API if you're able to provide all the input at once as well + * as receive all the output at once. All output lengths are supported. + * + * Context: Any context. + */ +void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len); -#endif +#endif /* __CRYPTO_SHA3_H__ */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 607db773b672..fbf0c3a65f59 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -8,6 +8,7 @@ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H +#include <linux/cleanup.h> #include <linux/errno.h> #include <linux/ioport.h> /* for struct resource */ #include <linux/resource_ext.h> @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); +static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance) +{ + struct acpi_table_header *table; + int status = acpi_get_table(signature, instance, &table); + + if (ACPI_FAILURE(status)) + return ERR_PTR(-ENOENT); + return table; +} +DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T)) + int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, @@ -755,7 +767,6 @@ int acpi_reconfig_notifier_unregister(struct notifier_block *nb); int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); int acpi_gtdt_map_ppi(int type); bool acpi_gtdt_c3stop(int type); -int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); #endif #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER @@ -1146,12 +1157,7 @@ struct acpi_s2idle_dev_ops { #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); -int acpi_get_lps0_constraint(struct acpi_device *adev); #else /* CONFIG_SUSPEND && CONFIG_X86 */ -static inline int acpi_get_lps0_constraint(struct device *dev) -{ - return ACPI_STATE_UNKNOWN; -} static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) { return -ENODEV; @@ -1349,9 +1355,6 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); -struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child); - struct acpi_probe_entry; typedef bool 
(*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); @@ -1451,13 +1454,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, } static inline struct fwnode_handle * -acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child) -{ - return NULL; -} - -static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { @@ -1548,6 +1544,9 @@ int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); +int find_acpi_cache_level_from_id(u32 cache_id); +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1569,6 +1568,17 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } +static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, + cpumask_t *cpus) { } +static inline int find_acpi_cache_level_from_id(u32 cache_id) +{ + return -ENOENT; +} +static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, + cpumask_t *cpus) +{ + return -ENOENT; +} #endif void acpi_arch_init(void); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d72d6e5aa200..0c2a8b846c20 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -89,6 +89,21 @@ void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); void freq_inv_set_max_ratio(int cpu, u64 max_rate); -#endif + +/* + * Architectures like ARM64 don't have a reliable architectural way to get SMT + * information and depend on firmware (ACPI/OF) to report it. A non-SMT core + * won't initialize thread_id, so we can use this to detect SMT. + */ +static inline bool topology_core_has_smt(int cpu) +{ + return cpu_topology[cpu].thread_id != -1; +} + +#else + +static inline bool topology_core_has_smt(int cpu) { return false; } + +#endif /* CONFIG_GENERIC_ARCH_TOPOLOGY */ #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h new file mode 100644 index 000000000000..7f00c5285a32 --- /dev/null +++ b/include/linux/arm_mpam.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __LINUX_ARM_MPAM_H +#define __LINUX_ARM_MPAM_H + +#include <linux/acpi.h> +#include <linux/types.h> + +struct mpam_msc; + +enum mpam_msc_iface { + MPAM_IFACE_MMIO, /* a real MPAM MSC */ + MPAM_IFACE_PCC, /* a fake MPAM MSC */ +}; + +enum mpam_class_types { + MPAM_CLASS_CACHE, /* Caches, e.g. L2, L3 */ + MPAM_CLASS_MEMORY, /* Main memory */ + MPAM_CLASS_UNKNOWN, /* Everything else, e.g.
SMMU */ +}; + +#define MPAM_CLASS_ID_DEFAULT 255 + +#ifdef CONFIG_ACPI_MPAM +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc); + +int acpi_mpam_count_msc(void); +#else +static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + return -EINVAL; +} + +static inline int acpi_mpam_count_msc(void) { return -EINVAL; } +#endif + +#ifdef CONFIG_ARM64_MPAM_DRIVER +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id); +#else +static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + return -EINVAL; +} +#endif + +/** + * mpam_register_requestor() - Register a requestor with the MPAM driver + * @partid_max: The maximum PARTID value the requestor can generate. + * @pmg_max: The maximum PMG value the requestor can generate. + * + * Registers a requestor with the MPAM driver to ensure the chosen system-wide + * minimum PARTID and PMG values will allow the requestor's features to be used. + * + * Returns an error if the registration is too late and a larger PARTID/PMG + * value has already been advertised to user-space. In this case the requestor + * should not use its MPAM features. Returns 0 on success. + */ +int mpam_register_requestor(u16 partid_max, u8 pmg_max); + +#endif /* __LINUX_ARM_MPAM_H */ diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index b3705e8bbe2b..55a44199de87 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) } } +static inline void le64_to_cpu_array(u64 *buf, unsigned int words) +{ + while (words--) { + __le64_to_cpus(buf); + buf++; + } +} + +static inline void cpu_to_le64_array(u64 *buf, unsigned int words) +{ + while (words--) { + __cpu_to_le64s(buf); + buf++; + } +} + static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words) { size_t i; diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 0a1b9598940d..3eac51d68426 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -393,6 +393,21 @@ struct ftrace_likely_data { #define __counted_by_be(member) __counted_by(member) #endif +/* + * This designates the minimum number of elements a passed array parameter must + * have. For example: + * + * void some_function(u8 param[at_least 7]); + * + * If a caller passes an array with fewer than 7 elements, the compiler will + * emit a warning. + */ +#ifndef __CHECKER__ +#define at_least static +#else +#define at_least +#endif + +/* Do not trap wrapping arithmetic within an annotated function.
*/ #ifdef CONFIG_UBSAN_INTEGER_WRAP # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a9ee4fe55dcf..4073690504a7 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -248,7 +248,8 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, u64 latency_limit_ns); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + u64 latency_limit_ns); extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, @@ -256,7 +257,8 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, u64 latency_limit_ns) {return -ENODEV; } static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) {return -ENODEV; } static inline void cpuidle_use_deepest_state(u64 latency_limit_ns) { diff --git a/drivers/devfreq/governor.h b/include/linux/devfreq-governor.h index 0adfebc0467a..dfdd0160a29f 100644 --- a/drivers/devfreq/governor.h +++ b/include/linux/devfreq-governor.h @@ -5,11 +5,11 @@ * Copyright (C) 2011 Samsung Electronics * MyungJoo Ham <myungjoo.ham@samsung.com> * - * This header is for devfreq governors in drivers/devfreq/ + * This header is for devfreq governors */ -#ifndef _GOVERNOR_H -#define _GOVERNOR_H +#ifndef __LINUX_DEVFREQ_DEVFREQ_H__ +#define __LINUX_DEVFREQ_DEVFREQ_H__ #include <linux/devfreq.h> @@ -48,31 +48,6 @@ #define DEVFREQ_GOV_ATTR_TIMER BIT(1) /** - * struct devfreq_cpu_data - Hold the per-cpu data - * @node: list node - * @dev: reference to cpu device. - * @first_cpu: the cpumask of the first cpu of a policy. - * @opp_table: reference to cpu opp table. - * @cur_freq: the current frequency of the cpu. - * @min_freq: the min frequency of the cpu. - * @max_freq: the max frequency of the cpu. - * - * This structure stores the required cpu_data of a cpu. - * This is auto-populated by the governor. 
- */ -struct devfreq_cpu_data { - struct list_head node; - - struct device *dev; - unsigned int first_cpu; - - struct opp_table *opp_table; - unsigned int cur_freq; - unsigned int min_freq; - unsigned int max_freq; -}; - -/** * struct devfreq_governor - Devfreq policy governor * @node: list node - contains registered devfreq governors * @name: Governor's name @@ -124,4 +99,4 @@ static inline int devfreq_update_stats(struct devfreq *df) return df->profile->get_dev_status(df->dev.parent, &df->last_status); } -#endif /* _GOVERNOR_H */ +#endif /* __LINUX_DEVFREQ_DEVFREQ_H__ */ diff --git a/include/linux/efi.h b/include/linux/efi.h index a98cc39e7aaa..b23ff8b83219 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1126,6 +1126,8 @@ static inline bool efi_runtime_disabled(void) { return true; } extern void efi_call_virt_check_flags(unsigned long flags, const void *caller); extern unsigned long efi_call_virt_save_flags(void); +void efi_runtime_assert_lock_held(void); + enum efi_secureboot_mode { efi_secureboot_mode_unset, efi_secureboot_mode_unknown, diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 61d50571ad88..43aa6153dc57 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -54,6 +54,8 @@ struct em_perf_table { /** * struct em_perf_domain - Performance domain * @em_table: Pointer to the runtime modifiable em_perf_table + * @node: node in em_pd_list (in energy_model.c) + * @id: A unique ID number for each performance domain * @nr_perf_states: Number of performance states * @min_perf_state: Minimum allowed Performance State index * @max_perf_state: Maximum allowed Performance State index @@ -71,6 +73,8 @@ struct em_perf_table { */ struct em_perf_domain { struct em_perf_table __rcu *em_table; + struct list_head node; + int id; int nr_perf_states; int min_perf_state; int max_perf_state; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 32884c9721e5..0a8c6c4d1a82 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -22,14 +22,18 @@ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ extern unsigned int freeze_timeout_msecs; /* - * Check if a process has been frozen + * Check if a process has been frozen for PM or cgroup1 freezer. Note that + * cgroup2 freezer uses the job control mechanism and does not interact with + * the PM freezer. */ extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); /* - * Check if there is a request to freeze a process + * Check if there is a request to freeze a task from PM or cgroup1 freezer. + * Note that cgroup2 freezer uses the job control mechanism and does not + * interact with the PM freezer. 
*/ static inline bool freezing(struct task_struct *p) { @@ -63,9 +67,9 @@ extern bool freeze_task(struct task_struct *p); extern bool set_freezable(void); #ifdef CONFIG_CGROUP_FREEZER -extern bool cgroup_freezing(struct task_struct *task); +extern bool cgroup1_freezing(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline bool cgroup_freezing(struct task_struct *task) +static inline bool cgroup1_freezing(struct task_struct *task) { return false; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 71ac78b9f834..11cab07f322a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -void huge_pmd_set_accessed(struct vm_fault *vmf); +bool huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index c0397423d3a8..e9ade2ff4af6 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -152,7 +152,7 @@ struct rapl_if_priv { union rapl_reg reg_unit; union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; - int (*read_raw)(int id, struct reg_action *ra); + int (*read_raw)(int id, struct reg_action *ra, bool atomic); int (*write_raw)(int id, struct reg_action *ra); void *defaults; void *rpi; diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..ba1515160894 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,17 +96,8 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { - /* - * The altmap_start_pfn and altmap_nr_pages fields are designated for - * specifying the altmap range and are exclusively intended for use in - * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. - */ - unsigned long altmap_start_pfn; - unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; }; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 23f038a16231..f2f16cdd73ee 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -58,22 +58,6 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) -/* - * The hotplugged memory is completely inaccessible while the memory is - * offline. The memory provider will handle MEM_PREPARE_ONLINE / - * MEM_FINISH_OFFLINE notifications and make the memory accessible. - * - * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, - * because the altmap cannot be written (e.g., poisoned) when adding - * memory -- before it is set online. - * - * This allows for adding memory with an altmap that is not currently - * made available by a hypervisor. When onlining that memory, the - * hypervisor can be instructed to make that memory available, and - * the onlining phase will not require any memory allocations, which is - * helpful in low-memory situations. 
- */ -#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -123,7 +107,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible); + struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5951ba12a28..30c7aecbd245 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,7 +25,6 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; - bool inaccessible; }; /* diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 12d90360f6db..43c854a273c3 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -52,7 +52,7 @@ __section(".discard") __attribute__((unused)) /* - * s390 and alpha modules require percpu variables to be defined as + * alpha modules require percpu variables to be defined as * weak to force the compiler to generate GOT based external * references for them. This is necessary because percpu sections * will be located outside of the usually addressable area. diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index bab26a7d79f4..52b37f7bdbf9 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -119,6 +119,7 @@ struct arm_pmu { /* PMUv3 only */ int pmuver; + bool has_smt; u64 reg_pmmir; u64 reg_brbidr; #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 32e8457ad535..ee3148ef87f6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1232,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif +#ifndef flush_tlb_fix_spurious_fault_pmd +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. 
Although no diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index ad66333ce85c..93c945331f39 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -234,6 +234,7 @@ extern int platform_device_add_data(struct platform_device *pdev, extern int platform_device_add(struct platform_device *pdev); extern void platform_device_del(struct platform_device *pdev); extern void platform_device_put(struct platform_device *pdev); +DEFINE_FREE(platform_device_put, struct platform_device *, if (_T) platform_device_put(_T)) struct platform_driver { int (*probe)(struct platform_device *); diff --git a/include/linux/pm.h b/include/linux/pm.h index cc7b2dc28574..7f69f739f613 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -25,11 +25,12 @@ extern void (*pm_power_off)(void); struct device; /* we have a circular dep with device.h */ #ifdef CONFIG_VT_CONSOLE_SLEEP -extern void pm_vt_switch_required(struct device *dev, bool required); +extern int pm_vt_switch_required(struct device *dev, bool required); extern void pm_vt_switch_unregister(struct device *dev); #else -static inline void pm_vt_switch_required(struct device *dev, bool required) +static inline int pm_vt_switch_required(struct device *dev, bool required) { + return 0; } static inline void pm_vt_switch_unregister(struct device *dev) { @@ -507,6 +508,7 @@ const struct dev_pm_ops name = { \ * RECOVER Creation of a hibernation image or restoration of the main * memory contents from a hibernation image has failed, call * ->thaw() and ->complete() for all devices. + * POWEROFF System will poweroff, call ->poweroff() for all devices. * * The following PM_EVENT_ messages are defined for internal use by * kernel subsystems. They are never issued by the PM core. 
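The DEFINE_FREE(platform_device_put, ...) hook added to platform_device.h further up pairs with the __free() scope-based cleanup attribute from <linux/cleanup.h>. A minimal sketch of the resulting pattern, assuming a caller that only needs a temporary reference (the device name and function are illustrative, not from the patch):

static int example_inspect_pdev(void)
{
	/* platform_device_put() runs automatically when pdev leaves scope */
	struct platform_device *pdev __free(platform_device_put) =
		platform_device_alloc("example-dev", PLATFORM_DEVID_NONE);

	if (!pdev)
		return -ENOMEM;

	/* Inspect or configure pdev here; every return path drops the ref. */
	return 0;
}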
@@ -537,6 +539,7 @@ const struct dev_pm_ops name = { \ #define PM_EVENT_USER 0x0100 #define PM_EVENT_REMOTE 0x0200 #define PM_EVENT_AUTO 0x0400 +#define PM_EVENT_POWEROFF 0x0800 #define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) #define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND) @@ -551,6 +554,7 @@ const struct dev_pm_ops name = { \ #define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) #define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) #define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) +#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, }) #define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) #define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) #define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index f67a2cb7d781..93ba0143ca47 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -153,6 +153,7 @@ enum genpd_sync_state { }; struct dev_power_governor { + bool (*system_power_down_ok)(struct dev_pm_domain *domain); bool (*power_down_ok)(struct dev_pm_domain *domain); bool (*suspend_ok)(struct device *dev); }; diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 4a69d4af3ff8..6cea4455f867 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -162,6 +162,15 @@ static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +s32 cpu_wakeup_latency_qos_limit(void); +#else +static inline s32 cpu_wakeup_latency_qos_limit(void) +{ + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; +} +#endif + #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 0b436e15f4cd..911d7a4d32c1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -637,6 +637,30 @@ DEFINE_GUARD_COND(pm_runtime_active_auto, _try, DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, pm_runtime_resume_and_get(_T), _RET == 0) +/* ACQUIRE() wrapper macros for the guards defined above. */ + +#define PM_RUNTIME_ACQUIRE(_dev, _var) \ + ACQUIRE(pm_runtime_active_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var) \ + ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev) + +/* + * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active. + * + * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the + * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with + * any of them) and if it is nonzero, avoid accessing the given device. + */ +#define PM_RUNTIME_ACQUIRE_ERR(_var_ptr) \ + ACQUIRE_ERR(pm_runtime_active, _var_ptr) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. 
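Per the comment above, PM_RUNTIME_ACQUIRE_ERR() must be checked after any of the PM_RUNTIME_ACQUIRE*() macros before the device is accessed. A sketch of that rule in use (the function itself is a made-up example):

static int example_device_io(struct device *dev)
{
	int ret;

	PM_RUNTIME_ACQUIRE(dev, pm);
	ret = PM_RUNTIME_ACQUIRE_ERR(&pm);
	if (ret)
		return ret;	/* resume failed: do not touch the device */

	/*
	 * The device is runtime-active here; the usage counter is dropped
	 * automatically when 'pm' goes out of scope.
	 */
	return 0;
}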
diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b3d6..ff7dcc3fa105 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -47,10 +47,4 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) state->s4 = __seed(i, 128U); } -/* Pseudo random number generator from numerical recipes. */ -static inline u32 next_pseudo_random32(u32 seed) -{ - return seed * 1664525 + 1013904223; -} - #endif diff --git a/include/linux/random.h b/include/linux/random.h index 333cecfca93f..8a8064dc3970 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -130,21 +130,6 @@ static inline int get_random_bytes_wait(void *buf, size_t nbytes) return ret; } -#define declare_get_random_var_wait(name, ret_type) \ - static inline int get_random_ ## name ## _wait(ret_type *out) { \ - int ret = wait_for_random_bytes(); \ - if (unlikely(ret)) \ - return ret; \ - *out = get_random_ ## name(); \ - return 0; \ - } -declare_get_random_var_wait(u8, u8) -declare_get_random_var_wait(u16, u16) -declare_get_random_var_wait(u32, u32) -declare_get_random_var_wait(u64, u32) -declare_get_random_var_wait(long, unsigned long) -#undef declare_get_random_var - #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 82904291c2b8..370f8df2fdb4 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -179,7 +179,8 @@ TRACE_EVENT(pstate_sample, { PM_EVENT_HIBERNATE, "hibernate" }, \ { PM_EVENT_THAW, "thaw" }, \ { PM_EVENT_RESTORE, "restore" }, \ - { PM_EVENT_RECOVER, "recover" }) + { PM_EVENT_RECOVER, "recover" }, \ + { PM_EVENT_POWEROFF, "poweroff" }) DEFINE_EVENT(cpu, cpu_frequency, diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h new file mode 100644 index 000000000000..4ec4c0eabbbb --- /dev/null +++ b/include/uapi/linux/energy_model.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ENERGY_MODEL_H +#define _UAPI_LINUX_ENERGY_MODEL_H + +#define EM_FAMILY_NAME "em" +#define EM_FAMILY_VERSION 1 + +enum { + EM_A_PDS_PD = 1, + + __EM_A_PDS_MAX, + EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1) +}; + +enum { + EM_A_PD_PAD = 1, + EM_A_PD_PD_ID, + EM_A_PD_FLAGS, + EM_A_PD_CPUS, + + __EM_A_PD_MAX, + EM_A_PD_MAX = (__EM_A_PD_MAX - 1) +}; + +enum { + EM_A_PD_TABLE_PD_ID = 1, + EM_A_PD_TABLE_PS, + + __EM_A_PD_TABLE_MAX, + EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1) +}; + +enum { + EM_A_PS_PAD = 1, + EM_A_PS_PERFORMANCE, + EM_A_PS_FREQUENCY, + EM_A_PS_POWER, + EM_A_PS_COST, + EM_A_PS_FLAGS, + + __EM_A_PS_MAX, + EM_A_PS_MAX = (__EM_A_PS_MAX - 1) +}; + +enum { + EM_CMD_GET_PDS = 1, + EM_CMD_GET_PD_TABLE, + EM_CMD_PD_CREATED, + EM_CMD_PD_UPDATED, + EM_CMD_PD_DELETED, + + __EM_CMD_MAX, + EM_CMD_MAX = (__EM_CMD_MAX - 1) +}; + +#define EM_MCGRP_EVENT "event" + +#endif /* _UAPI_LINUX_ENERGY_MODEL_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d292f96bc06f..c44a8fb3e418 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -382,6 +382,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ +#define PERF_ATTR_SIZE_VER9 
144 /* add: config4 */ /* * 'struct perf_event_attr' contains various attributes that define @@ -545,6 +546,7 @@ struct perf_event_attr { __u64 sig_data; __u64 config3; /* extension of config2 */ + __u64 config4; /* extension of config3 */ }; /* diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index dd9417425d92..915b02f65980 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -63,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer) return css_freezer(freezer->css.parent); } -bool cgroup_freezing(struct task_struct *task) +bool cgroup1_freezing(struct task_struct *task) { bool ret; diff --git a/kernel/freezer.c b/kernel/freezer.c index ddc11a8bd2ea..a76bf957fb32 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,7 +44,7 @@ bool freezing_slow_path(struct task_struct *p) if (tsk_is_oom_victim(p)) return false; - if (pm_nosig_freezing || cgroup_freezing(p)) + if (pm_nosig_freezing || cgroup1_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 54a623680019..05337f437cca 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -202,6 +202,17 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y +config PM_QOS_CPU_SYSTEM_WAKEUP + bool "User space interface for CPU system wakeup QoS" + depends on CPU_IDLE + help + Enable this to allow user space via the cpu_wakeup_latency file to + specify a CPU system wakeup latency limit. + + This may be particularly useful for platforms supporting multiple low + power states for CPUs during system-wide suspend and s2idle in + particular. + config PM bool "Device power management core functionality" help diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 874ad834dc8d..773e2789412b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -21,4 +21,6 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o -obj-$(CONFIG_ENERGY_MODEL) += energy_model.o +obj-$(CONFIG_ENERGY_MODEL) += em.o +em-y := energy_model.o +em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 19c48aa5355d..a906a0ac0f9b 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -44,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list); * no_console_suspend argument has been passed on the command line, VT * switches will occur. */ -void pm_vt_switch_required(struct device *dev, bool required) +int pm_vt_switch_required(struct device *dev, bool required) { struct pm_vt_switch *entry, *tmp; + int ret = 0; mutex_lock(&vt_switch_mutex); list_for_each_entry(tmp, &pm_vt_switch_list, head) { @@ -58,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required) } entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) + if (!entry) { + ret = -ENOMEM; goto out; + } entry->required = required; entry->dev = dev; @@ -67,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required) list_add(&entry->head, &pm_vt_switch_list); out: mutex_unlock(&vt_switch_mutex); + return ret; } EXPORT_SYMBOL(pm_vt_switch_required); diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c new file mode 100644 index 000000000000..4b85da138a06 --- /dev/null +++ b/kernel/power/em_netlink.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. 
+ * Author: Changwoo Min <changwoo@igalia.com> + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/energy_model.h> +#include <net/sock.h> +#include <net/genetlink.h> +#include <uapi/linux/energy_model.h> + +#include "em_netlink.h" +#include "em_netlink_autogen.h" + +#define EM_A_PD_CPUS_LEN 256 + +/*************************** Command encoding ********************************/ +static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + int *tot_msg_sz = data; + int msg_sz, cpus_sz; + + cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + + msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */ + nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */ + nla_total_size(cpus_sz); /* EM_A_PD_CPUS */ + + *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz)); + return 0; +} + +static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + struct sk_buff *msg = data; + struct nlattr *entry; + + entry = nla_nest_start(msg, EM_A_PDS_PD); + if (!entry) + goto out_cancel_nest; + + if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id)) + goto out_cancel_nest; + + if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD)) + goto out_cancel_nest; + + snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf)) + goto out_cancel_nest; + + nla_nest_end(msg, entry); + + return 0; + +out_cancel_nest: + nla_nest_cancel(msg, entry); + + return -EMSGSIZE; +} + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + int cmd = info->genlhdr->cmd; + int ret = -EMSGSIZE, msg_sz = 0; + + for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = for_each_em_perf_domain(__em_nl_get_pd, msg); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) +{ + struct em_perf_domain *pd; + int id; + + if (!attrs[EM_A_PD_TABLE_PD_ID]) + return NULL; + + id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + pd = em_perf_domain_get_by_id(id); + return pd; +} + +static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) +{ + int id_sz, ps_sz; + + id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ + nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + ps_sz *= pd->nr_perf_states; + + return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); +} + +static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +{ + struct em_perf_state *table, *ps; + struct nlattr *entry; + int i; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + goto out_err; + + rcu_read_lock(); + table = em_perf_state_from_pd((struct em_perf_domain *)pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = 
&table[i]; + + entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + if (!entry) + goto out_unlock_ps; + + if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, + ps->performance, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, + ps->frequency, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_POWER, + ps->power, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_COST, + ps->cost, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, + ps->flags, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + + nla_nest_end(msg, entry); + } + rcu_read_unlock(); + return 0; + +out_cancel_ps_nest: + nla_nest_cancel(msg, entry); +out_unlock_ps: + rcu_read_unlock(); +out_err: + return -EMSGSIZE; +} + +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) +{ + int cmd = info->genlhdr->cmd; + int msg_sz, ret = -EMSGSIZE; + struct em_perf_domain *pd; + struct sk_buff *msg; + void *hdr; + + pd = __em_nl_get_pd_table_id(info->attrs); + if (!pd) + return -EINVAL; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +out_free_msg: + nlmsg_free(msg); + return ret; +} + + +/**************************** Event encoding *********************************/ +static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) +{ + struct sk_buff *msg; + int msg_sz, ret = -EMSGSIZE; + void *hdr; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +void em_notify_pd_created(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_CREATED); +} + +void em_notify_pd_updated(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); +} + +static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) +{ + int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + + return nlmsg_total_size(genlmsg_msg_size(id_sz)); +} + +void em_notify_pd_deleted(const struct em_perf_domain *pd) +{ + struct sk_buff *msg; + void *hdr; + int msg_sz; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_notify_pd_deleted_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + if (!hdr) + goto out_free_msg; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + goto out_free_msg; + } + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +/**************************** Initialization *********************************/ +static int __init em_netlink_init(void) +{ + return genl_register_family(&em_nl_family); +} +postcore_initcall(em_netlink_init); diff --git 
a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h new file mode 100644 index 000000000000..583d7f1c3939 --- /dev/null +++ b/kernel/power/em_netlink.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ +#ifndef _EM_NETLINK_H +#define _EM_NETLINK_H + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data); +struct em_perf_domain *em_perf_domain_get_by_id(int id); +void em_notify_pd_created(const struct em_perf_domain *pd); +void em_notify_pd_deleted(const struct em_perf_domain *pd); +void em_notify_pd_updated(const struct em_perf_domain *pd); +#else +static inline +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + return -EINVAL; +} +static inline +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + return NULL; +} + +static inline void em_notify_pd_created(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {} +#endif + +#endif /* _EM_NETLINK_H */ diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c new file mode 100644 index 000000000000..a7a09ab1d1c2 --- /dev/null +++ b/kernel/power/em_netlink_autogen.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "em_netlink_autogen.h" + +#include <uapi/linux/energy_model.h> + +/* EM_CMD_GET_PD_TABLE - do */ +static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { + [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for em */ +static const struct genl_split_ops em_nl_ops[] = { + { + .cmd = EM_CMD_GET_PDS, + .doit = em_nl_get_pds_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = EM_CMD_GET_PD_TABLE, + .doit = em_nl_get_pd_table_doit, + .policy = em_get_pd_table_nl_policy, + .maxattr = EM_A_PD_TABLE_PD_ID, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group em_nl_mcgrps[] = { + [EM_NLGRP_EVENT] = { "event", }, +}; + +struct genl_family em_nl_family __ro_after_init = { + .name = EM_FAMILY_NAME, + .version = EM_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = em_nl_ops, + .n_split_ops = ARRAY_SIZE(em_nl_ops), + .mcgrps = em_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), +}; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h new file mode 100644 index 000000000000..78ce609641f1 --- /dev/null +++ b/kernel/power/em_netlink_autogen.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_EM_GEN_H +#define _LINUX_EM_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/energy_model.h> + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + EM_NLGRP_EVENT, +}; + +extern struct genl_family 
em_nl_family; + +#endif /* _LINUX_EM_GEN_H */ diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 5f17d2e8e954..11af9f64aa82 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -17,12 +17,24 @@ #include <linux/sched/topology.h> #include <linux/slab.h> +#include "em_netlink.h" + /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. + */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + static void em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table); static void em_check_capacity_update(void); @@ -116,6 +128,16 @@ static int em_debug_flags_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_flags); +static int em_debug_id_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + + seq_printf(s, "%d\n", pd->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_id); + static void em_debug_create_pd(struct device *dev) { struct em_dbg_info *em_dbg; @@ -132,6 +154,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("flags", 0444, d, dev->em_pd, &em_debug_flags_fops); + debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops); + em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states, sizeof(*em_dbg), GFP_KERNEL); if (!em_dbg) @@ -328,6 +352,8 @@ int em_dev_update_perf_domain(struct device *dev, em_table_free(old_table); mutex_unlock(&em_pd_mutex); + + em_notify_pd_updated(pd); return 0; } EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); @@ -396,7 +422,7 @@ static int em_create_pd(struct device *dev, int nr_states, struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret, num_cpus; + int cpu, ret, num_cpus, id; if (_is_cpu_device(dev)) { num_cpus = cpumask_weight(cpus); @@ -420,6 +446,13 @@ static int em_create_pd(struct device *dev, int nr_states, pd->nr_perf_states = nr_states; + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + em_table = em_table_alloc(pd); if (!em_table) goto free_pd; @@ -444,6 +477,7 @@ free_pd_table: kfree(em_table); free_pd: kfree(pd); + ida_free(&em_pd_ida, id); return -EINVAL; } @@ -659,8 +693,16 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); + if (ret) + return ret; - return ret; + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_created(dev->em_pd); + + return 0; } EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); @@ -678,6 +720,12 @@ void em_dev_unregister_perf_domain(struct device *dev) if (_is_cpu_device(dev)) return; + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_deleted(dev->em_pd); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. 
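The list and ID bookkeeping added above is what backs the helpers declared in em_netlink.h. As a sketch of how a consumer in kernel/power might walk the registered domains through the callback-based iterator (the callback and counter are illustrative only):

static int __example_count_pd(struct em_perf_domain *pd, void *data)
{
	(*(int *)data)++;
	return 0;	/* a nonzero return stops the iteration */
}

static int example_count_perf_domains(void)
{
	int count = 0;

	/* em_pd_list_mutex is held around the whole walk */
	for_each_em_perf_domain(__example_count_pd, &count);
	return count;
}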
@@ -689,6 +737,8 @@ void em_dev_unregister_perf_domain(struct device *dev) em_table_free(rcu_dereference_protected(dev->em_pd->em_table, lockdep_is_held(&em_pd_mutex))); + ida_free(&em_pd_ida, dev->em_pd->id); + kfree(dev->em_pd); dev->em_pd = NULL; mutex_unlock(&em_pd_mutex); @@ -958,3 +1008,39 @@ void em_rebuild_sched_domains(void) */ schedule_work(&rebuild_sd_work); } + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + int ret; + + ret = cb(pd, data); + if (ret) + return ret; + } + + return 0; +} + +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + if (pd->id == id) + return pd; + } + + return NULL; +} +#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 26e45f86b955..af8d07bafe02 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -820,7 +820,10 @@ int hibernate(void) if (error) goto Restore; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + goto Notify; + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); @@ -891,6 +894,7 @@ int hibernate(void) freezer_test_done = false; Exit: filesystems_thaw(); + Notify: pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 549f51ca3a1e..03b2c5495c77 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -18,6 +18,8 @@ #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> +#include <linux/atomic.h> +#include <linux/wait.h> #include "power.h" @@ -92,6 +94,61 @@ void ksys_sync_helper(void) } EXPORT_SYMBOL_GPL(ksys_sync_helper); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +/* Wakeup events handling resolution while syncing file systems in jiffies */ +#define PM_FS_SYNC_WAKEUP_RESOLUTION 5 + +static atomic_t pm_fs_sync_count = ATOMIC_INIT(0); +static struct workqueue_struct *pm_fs_sync_wq; +static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait); + +static bool pm_fs_sync_completed(void) +{ + return atomic_read(&pm_fs_sync_count) == 0; +} + +static void pm_fs_sync_work_fn(struct work_struct *work) +{ + ksys_sync_helper(); + + if (atomic_dec_and_test(&pm_fs_sync_count)) + wake_up(&pm_fs_sync_wait); +} +static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn); + +/** + * pm_sleep_fs_sync() - Sync file systems in an interruptible way + * + * Return: 0 on successful file system sync, or -EBUSY if the file system sync + * was aborted. + */ +int pm_sleep_fs_sync(void) +{ + pm_wakeup_clear(0); + + /* + * Take back-to-back sleeps into account by queuing a subsequent fs sync + * only if the previous fs sync is running or is not queued. Multiple fs + * syncs increase the likelihood of saving the latest files immediately + * before sleep. 
+ */ + if (!work_pending(&pm_fs_sync_work)) { + atomic_inc(&pm_fs_sync_count); + queue_work(pm_fs_sync_wq, &pm_fs_sync_work); + } + + while (!pm_fs_sync_completed()) { + if (pm_wakeup_pending()) + return -EBUSY; + + wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(), + PM_FS_SYNC_WAKEUP_RESOLUTION); + } + + return 0; +} +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); @@ -231,10 +288,10 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr power_attr(mem_sleep); /* - * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * sync_on_suspend: Sync file systems before suspend. * - * show() returns whether ksys_sync_helper() is invoked before suspend. - * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + * show() returns whether file systems sync before suspend is enabled. + * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); @@ -1066,16 +1123,26 @@ static const struct attribute_group *attr_groups[] = { struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); -static int __init pm_start_workqueue(void) +static int __init pm_start_workqueues(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0); + if (!pm_wq) + return -ENOMEM; - return pm_wq ? 0 : -ENOMEM; +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0); + if (!pm_fs_sync_wq) { + destroy_workqueue(pm_wq); + return -ENOMEM; + } +#endif + + return 0; } static int __init pm_init(void) { - int error = pm_start_workqueue(); + int error = pm_start_workqueues(); if (error) return error; hibernate_image_size_init(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7ccd709af93f..75b63843886e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -19,6 +19,7 @@ struct swsusp_info { } __aligned(PAGE_SIZE); #if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern int pm_sleep_fs_sync(void); extern bool filesystem_freeze_enabled; #endif diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 4244b069442e..f7d8064e9adc 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -415,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = { .fops = &cpu_latency_qos_fops, }; +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +/* The CPU system wakeup latency QoS. */ +static struct pm_qos_constraints cpu_wakeup_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .type = PM_QOS_MIN, +}; + +/** + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. + * + * Returns the current CPU system wakeup latency QoS limit that may have been + * requested by user space. 
+ */ +s32 cpu_wakeup_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_wakeup_latency_constraints); +} + +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->qos = &cpu_wakeup_latency_constraints; + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + filp->private_data = req; + + return 0; +} + +static int cpu_wakeup_latency_qos_release(struct inode *inode, + struct file *filp) +{ + struct pm_qos_request *req = filp->private_data; + + filp->private_data = NULL; + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + kfree(req); + + return 0; +} + +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, + const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct pm_qos_request *req = filp->private_data; + s32 value; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + if (value < 0) + return -EINVAL; + + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); + + return count; +} + +static const struct file_operations cpu_wakeup_latency_qos_fops = { + .open = cpu_wakeup_latency_qos_open, + .release = cpu_wakeup_latency_qos_release, + .read = cpu_wakeup_latency_qos_read, + .write = cpu_wakeup_latency_qos_write, + .llseek = noop_llseek, +}; + +static struct miscdevice cpu_wakeup_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_wakeup_latency", + .fops = &cpu_wakeup_latency_qos_fops, +}; +#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ + static int __init cpu_latency_qos_init(void) { int ret; @@ -424,6 +523,13 @@ static int __init cpu_latency_qos_init(void) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP + ret = misc_register(&cpu_wakeup_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_wakeup_latency_qos_miscdev.name); +#endif + return ret; } late_initcall(cpu_latency_qos_init); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 645f42e40478..0a946932d5c1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2110,22 +2110,20 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - pr_info("Creating image:\n"); + pm_deferred_pr_dbg("Creating image\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); - pr_info("Need to copy %u pages\n", nr_pages + nr_highmem); + pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem); if (!enough_free_mem(nr_pages, nr_highmem)) { - pr_err("Not enough free memory\n"); + pm_deferred_pr_dbg("Not enough free memory for image creation\n"); return -ENOMEM; } - if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { - pr_err("Memory allocation failed\n"); + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) return -ENOMEM; - } /* * During allocating of suspend pagedir, new cold pages may appear. 
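Going by the fops above, the new misc device behaves like /dev/cpu_dma_latency: the request lives for as long as the file stays open, a write of exactly sizeof(s32) is taken as a binary value, and any other length is parsed as hex text. A hypothetical user-space sketch (the 100 us figure is just an example):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	int32_t lat_us = 100;	/* tolerate at most 100 us of wakeup latency */
	int fd = open("/dev/cpu_wakeup_latency", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, &lat_us, sizeof(lat_us)) != sizeof(lat_us))
		return 1;
	pause();	/* the constraint is dropped when the fd is closed */
	return 0;
}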
@@ -2144,7 +2142,8 @@ asmlinkage __visible int swsusp_save(void) nr_zero_pages = nr_pages - nr_copy_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages); + pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n", + nr_copy_pages, nr_zero_pages); return 0; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3d4ebedad69f..2da4482bb6eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -344,10 +344,14 @@ MODULE_PARM_DESC(pm_test_delay, static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG + int i; + if (pm_test_level == level) { pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); - mdelay(pm_test_delay * 1000); + for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++) + msleep(1000); + return 1; } #endif /* !CONFIG_PM_DEBUG */ @@ -589,7 +593,11 @@ static int enter_state(suspend_state_t state) if (sync_on_suspend_enabled) { trace_suspend_resume(TPS("sync_filesystems"), 0, true); - ksys_sync_helper(); + + error = pm_sleep_fs_sync(); + if (error) + goto Unlock; + trace_suspend_resume(TPS("sync_filesystems"), 0, false); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 70ae21f7370d..33a186373bef 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -46,19 +46,18 @@ static bool clean_pages_on_read; static bool clean_pages_on_decompress; /* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_ENTRIES swap entries. These + * structures are stored on the swap and linked together with the help of the + * .next_swap member. * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. + * The swap map is created during suspend. The swap map pages are allocated and + * populated one at a time, so we only need one memory page to set up the entire + * structure. * - * During resume we pick up all swap_map_page structures into a list. + * During resume we pick up all swap_map_page structures into a list. */ - #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* @@ -89,10 +88,8 @@ struct swap_map_page_list { }; /* - * The swap_map_handle structure is used for handling swap in - * a file-alike way + * The swap_map_handle structure is used for handling swap in a file-alike way. */ - struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; @@ -117,10 +114,9 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; /* - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. + * The following functions are used for tracing the allocated swap pages, so + * that they can be freed in case of an error. 
*/ - struct swsusp_extent { struct rb_node node; unsigned long start; @@ -170,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/* - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - sector_t alloc_swapdev_block(int swap) { unsigned long offset; + /* + * Allocate a swap page and register that it has been allocated, so that + * it can be freed in case of an error. + */ offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) @@ -189,16 +184,14 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/* - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entries had been - * allocated. - */ - void free_all_swap_pages(int swap) { struct rb_node *node; + /* + * Free swap pages allocated for saving image data. It also frees the + * extents used to register which swap entries had been allocated. + */ while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; @@ -303,6 +296,7 @@ static int hib_wait_io(struct hib_bio_batch *hb) /* * Saving part */ + static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; @@ -336,16 +330,14 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) */ unsigned int swsusp_header_flags; -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - * - * This is called before saving image - */ static int swsusp_swap_check(void) { int res; + /* + * Check if the resume device is a swap device and get its index (if so). + * This is called before saving the image. + */ if (swsusp_resume_device) res = swap_type_of(swsusp_resume_device, swsusp_resume_block); else @@ -362,13 +354,6 @@ static int swsusp_swap_check(void) return 0; } -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. - * @offset: Offset of the swap page we're writing to. - * @hb: bio completion batch - */ - static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; @@ -519,17 +504,14 @@ static int swap_writer_finish(struct swap_map_handle *handle, CMP_HEADER, PAGE_SIZE) #define CMP_SIZE (CMP_PAGES * PAGE_SIZE) -/* Maximum number of threads for compression/decompression. */ -#define CMP_THREADS 3 +/* Default number of threads for compression/decompression. */ +#define CMP_THREADS 3 +static unsigned int hibernate_compression_threads = CMP_THREADS; /* Minimum/maximum number of pages for read buffering. */ #define CMP_MIN_RD_PAGES 1024 #define CMP_MAX_RD_PAGES 8192 -/** - * save_image - save the suspend image data - */ - static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -585,13 +567,48 @@ struct crc_data { wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */ - unsigned char *unc[CMP_THREADS]; /* uncompressed data */ + size_t **unc_len; /* uncompressed lengths */ + unsigned char **unc; /* uncompressed data */ }; -/* - * CRC32 update function that runs in its own thread. 
- */ +static struct crc_data *alloc_crc_data(int nr_threads) +{ + struct crc_data *crc; + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) + return NULL; + + crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL); + if (!crc->unc) + goto err_free_crc; + + crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL); + if (!crc->unc_len) + goto err_free_unc; + + return crc; + +err_free_unc: + kfree(crc->unc); +err_free_crc: + kfree(crc); + return NULL; +} + +static void free_crc_data(struct crc_data *crc) +{ + if (!crc) + return; + + if (crc->thr) + kthread_stop(crc->thr); + + kfree(crc->unc_len); + kfree(crc->unc); + kfree(crc); +} + static int crc32_threadfn(void *data) { struct crc_data *d = data; @@ -616,6 +633,7 @@ static int crc32_threadfn(void *data) } return 0; } + /* * Structure used for data compression. */ @@ -637,9 +655,6 @@ struct cmp_data { /* Indicates the image size after compression */ static atomic64_t compressed_size = ATOMIC_INIT(0); -/* - * Compression function that runs in its own thread. - */ static int compress_threadfn(void *data) { struct cmp_data *d = data; @@ -671,12 +686,6 @@ static int compress_threadfn(void *data) return 0; } -/** - * save_compressed_image - Save the suspend image data after compression. - * @handle: Swap map handle to use for saving the image. - * @snapshot: Image to read data from. - * @nr_to_write: Number of pages to save. - */ static int save_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -703,7 +712,7 @@ static int save_compressed_image(struct swap_map_handle *handle, * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { @@ -719,7 +728,7 @@ static int save_compressed_image(struct swap_map_handle *handle, goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -888,11 +897,7 @@ out_finish: out_clean: hib_finish_batch(&hb); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -908,13 +913,6 @@ out_clean: return ret; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space available from the resume partition. - */ - static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); @@ -927,15 +925,16 @@ static int enough_swap(unsigned int nr_pages) } /** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header + * + * It is important _NOT_ to umount filesystems at this point. We want them + * synced (in case something goes wrong) but we DO not want to mark filesystem + * clean: it is not. (And it does not matter, if we resume correctly, we'll mark + * system clean, anyway.) * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) 
+ * Return: 0 on success, negative error code on failure. */ - int swsusp_write(unsigned int flags) { struct swap_map_handle handle; @@ -978,8 +977,8 @@ out_finish: } /* - * The following functions allow us to read data using a swap map - * in a file-like way. + * The following functions allow us to read data using a swap map in a file-like + * way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -1081,12 +1080,6 @@ static int swap_reader_finish(struct swap_map_handle *handle) return 0; } -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1157,9 +1150,6 @@ struct dec_data { unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/* - * Decompression function that runs in its own thread. - */ static int decompress_threadfn(void *data) { struct dec_data *d = data; @@ -1194,12 +1184,6 @@ static int decompress_threadfn(void *data) return 0; } -/** - * load_compressed_image - Load compressed image data and decompress it. - * @handle: Swap map handle to use for loading data. - * @snapshot: Image to copy uncompressed data into. - * @nr_to_read: Number of pages to load. - */ static int load_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1227,7 +1211,7 @@ static int load_compressed_image(struct swap_map_handle *handle, * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page)); if (!page) { @@ -1243,7 +1227,7 @@ static int load_compressed_image(struct swap_map_handle *handle, goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -1510,11 +1494,7 @@ out_clean: hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -1533,8 +1513,9 @@ out_clean: * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memory location + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_read(unsigned int *flags_p) { int error; @@ -1571,8 +1552,9 @@ static void *swsusp_holder; /** * swsusp_check - Open the resume device and check for the swsusp signature. * @exclusive: Open the resume device exclusively. + * + * Return: 0 if a valid image is found, negative error code otherwise. */ - int swsusp_check(bool exclusive) { void *holder = exclusive ? &swsusp_holder : NULL; @@ -1622,7 +1604,6 @@ put: /** * swsusp_close - close resume device. */ - void swsusp_close(void) { if (IS_ERR(hib_resume_bdev_file)) { @@ -1634,9 +1615,10 @@ void swsusp_close(void) } /** - * swsusp_unmark - Unmark swsusp signature in the resume device + * swsusp_unmark - Unmark swsusp signature in the resume device + * + * Return: 0 on success, negative error code on failure. 
*/ - #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { @@ -1662,8 +1644,46 @@ int swsusp_unmark(void) } #endif +static ssize_t hibernate_compression_threads_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", hibernate_compression_threads); +} + +static ssize_t hibernate_compression_threads_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val)) + return -EINVAL; + + if (val < 1) + return -EINVAL; + + hibernate_compression_threads = val; + return n; +} +power_attr(hibernate_compression_threads); + +static struct attribute *g[] = { + &hibernate_compression_threads_attr.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = g, +}; + static int __init swsusp_header_init(void) { + int error; + + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return -ENOMEM; + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); @@ -1671,3 +1691,19 @@ static int __init swsusp_header_init(void) } core_initcall(swsusp_header_init); + +static int __init hibernate_compression_threads_setup(char *str) +{ + int rc = kstrtouint(str, 0, &hibernate_compression_threads); + + if (rc) + return rc; + + if (hibernate_compression_threads < 1) + hibernate_compression_threads = CMP_THREADS; + + return 1; + +} + +__setup("hibernate_compression_threads=", hibernate_compression_threads_setup); diff --git a/kernel/power/user.c b/kernel/power/user.c index 3f9e3efb9f6e..4401cfe26e5c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -278,7 +278,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + break; error = freeze_processes(); if (error) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1cb7a3d70e65..c174afe1dd17 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -131,12 +131,13 @@ void __cpuidle default_idle_call(void) } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 max_latency_ns) { if (current_clr_polling_and_test()) return -EBUSY; - return cpuidle_enter_s2idle(drv, dev); + return cpuidle_enter_s2idle(drv, dev, max_latency_ns); } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -205,12 +206,13 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { + max_latency_ns = cpu_wakeup_latency_qos_limit() * + NSEC_PER_USEC; - entered_state = call_cpuidle_s2idle(drv, dev); + entered_state = call_cpuidle_s2idle(drv, dev, + max_latency_ns); if (entered_state > 0) goto exit_idle; - - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h index 63441de5e3f1..afc0ebf97f19 100644 --- a/lib/crc/arm/crc-t10dif.h +++ b/lib/crc/arm/crc-t10dif.h @@ -5,7 +5,6 @@ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> */ -#include <asm/neon.h> #include <asm/simd.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); @@ -19,22 +18,16 @@ asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) { - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && 
likely(may_use_simd())) { if (static_branch_likely(&have_pmull)) { - if (likely(may_use_simd())) { - kernel_neon_begin(); - crc = crc_t10dif_pmull64(crc, data, length); - kernel_neon_end(); - return crc; - } + scoped_ksimd() + return crc_t10dif_pmull64(crc, data, length); } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && - static_branch_likely(&have_neon) && - likely(may_use_simd())) { + static_branch_likely(&have_neon)) { u8 buf[16] __aligned(16); - kernel_neon_begin(); - crc_t10dif_pmull8(crc, data, length, buf); - kernel_neon_end(); + scoped_ksimd() + crc_t10dif_pmull8(crc, data, length, buf); return crc_t10dif_generic(0, buf, sizeof(buf)); } diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h index 7b76f52f6907..f33de6b22cd4 100644 --- a/lib/crc/arm/crc32.h +++ b/lib/crc/arm/crc32.h @@ -8,7 +8,6 @@ #include <linux/cpufeature.h> #include <asm/hwcap.h> -#include <asm/neon.h> #include <asm/simd.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); @@ -42,9 +41,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) len -= n; } n = round_down(len, 16); - kernel_neon_begin(); - crc = crc32_pmull_le(p, n, crc); - kernel_neon_end(); + scoped_ksimd() + crc = crc32_pmull_le(p, n, crc); p += n; len -= n; } @@ -71,9 +69,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) len -= n; } n = round_down(len, 16); - kernel_neon_begin(); - crc = crc32c_pmull_le(p, n, crc); - kernel_neon_end(); + scoped_ksimd() + crc = crc32c_pmull_le(p, n, crc); p += n; len -= n; } diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h index f88db2971805..b8338139ed77 100644 --- a/lib/crc/arm64/crc-t10dif.h +++ b/lib/crc/arm64/crc-t10dif.h @@ -7,7 +7,6 @@ #include <linux/cpufeature.h> -#include <asm/neon.h> #include <asm/simd.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd); @@ -21,22 +20,16 @@ asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) { - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && likely(may_use_simd())) { if (static_branch_likely(&have_pmull)) { - if (likely(may_use_simd())) { - kernel_neon_begin(); - crc = crc_t10dif_pmull_p64(crc, data, length); - kernel_neon_end(); - return crc; - } + scoped_ksimd() + return crc_t10dif_pmull_p64(crc, data, length); } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && - static_branch_likely(&have_asimd) && - likely(may_use_simd())) { + static_branch_likely(&have_asimd)) { u8 buf[16]; - kernel_neon_begin(); - crc_t10dif_pmull_p8(crc, data, length, buf); - kernel_neon_end(); + scoped_ksimd() + crc_t10dif_pmull_p8(crc, data, length, buf); return crc_t10dif_generic(0, buf, sizeof(buf)); } diff --git a/lib/crc/arm64/crc32.h b/lib/crc/arm64/crc32.h index 31e649cd40a2..1939a5dee477 100644 --- a/lib/crc/arm64/crc32.h +++ b/lib/crc/arm64/crc32.h @@ -2,7 +2,6 @@ #include <asm/alternative.h> #include <asm/cpufeature.h> -#include <asm/neon.h> #include <asm/simd.h> // The minimum input length to consider the 4-way interleaved code path @@ -23,9 +22,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) if (len >= min_len && cpu_have_named_feature(PMULL) && likely(may_use_simd())) { - kernel_neon_begin(); - crc = crc32_le_arm64_4way(crc, p, len); - kernel_neon_end(); + scoped_ksimd() + crc = crc32_le_arm64_4way(crc, p, len); p += round_down(len, 64); len %= 64; @@ -44,9 +42,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) if (len >= 
min_len && cpu_have_named_feature(PMULL) && likely(may_use_simd())) { - kernel_neon_begin(); - crc = crc32c_le_arm64_4way(crc, p, len); - kernel_neon_end(); + scoped_ksimd() + crc = crc32c_le_arm64_4way(crc, p, len); p += round_down(len, 64); len %= 64; @@ -65,9 +62,8 @@ static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) if (len >= min_len && cpu_have_named_feature(PMULL) && likely(may_use_simd())) { - kernel_neon_begin(); - crc = crc32_be_arm64_4way(crc, p, len); - kernel_neon_end(); + scoped_ksimd() + crc = crc32_be_arm64_4way(crc, p, len); p += round_down(len, 64); len %= 64; diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 16859c6226dd..a3647352bff6 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -28,6 +28,17 @@ config CRYPTO_LIB_ARC4 config CRYPTO_LIB_GF128MUL tristate +config CRYPTO_LIB_BLAKE2B + tristate + help + The BLAKE2b library functions. Select this if your module uses any of + the functions from <crypto/blake2b.h>. + +config CRYPTO_LIB_BLAKE2B_ARCH + bool + depends on CRYPTO_LIB_BLAKE2B && !UML + default y if ARM && KERNEL_MODE_NEON + # BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option. config CRYPTO_LIB_BLAKE2S_ARCH @@ -124,6 +135,18 @@ config CRYPTO_LIB_POLY1305_RSIZE default 9 if ARM || ARM64 default 1 +config CRYPTO_LIB_POLYVAL + tristate + help + The POLYVAL library functions. Select this if your module uses any of + the functions from <crypto/polyval.h>. + +config CRYPTO_LIB_POLYVAL_ARCH + bool + depends on CRYPTO_LIB_POLYVAL && !UML + default y if ARM64 && KERNEL_MODE_NEON + default y if X86_64 + config CRYPTO_LIB_CHACHA20POLY1305 tristate select CRYPTO_LIB_CHACHA @@ -184,6 +207,19 @@ config CRYPTO_LIB_SHA512_ARCH default y if SPARC64 default y if X86_64 +config CRYPTO_LIB_SHA3 + tristate + select CRYPTO_LIB_UTILS + help + The SHA3 library functions. Select this if your module uses any of + the functions from <crypto/sha3.h>. 
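To illustrate the help text above (a hypothetical sketch; CRYPTO_EXAMPLE_SHA3_USER is not a real symbol): a module that calls functions from <crypto/sha3.h> only needs to select the tristate library option, while the corresponding _ARCH bool entries are derived automatically per architecture.

config CRYPTO_EXAMPLE_SHA3_USER
	tristate "Example consumer of the SHA3 library"
	select CRYPTO_LIB_SHA3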
+ +config CRYPTO_LIB_SHA3_ARCH + bool + depends on CRYPTO_LIB_SHA3 && !UML + default y if ARM64 && KERNEL_MODE_NEON + default y if S390 + config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index d2845b214585..b5346cebbb55 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -31,6 +31,16 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o ################################################################################ +obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o +libblake2b-y := blake2b.o +CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 +ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y) +CFLAGS_blake2b.o += -I$(src)/$(SRCARCH) +libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o +endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH + +################################################################################ + # blake2s is used by the /dev/random driver which is always builtin obj-y += blake2s.o ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y) @@ -188,6 +198,16 @@ clean-files += arm/poly1305-core.S \ ################################################################################ +obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o +libpolyval-y := polyval.o +ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y) +CFLAGS_polyval.o += -I$(src)/$(SRCARCH) +libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o +libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o +endif + +################################################################################ + obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o libsha1-y := sha1.o ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y) @@ -268,6 +288,16 @@ endif # CONFIG_CRYPTO_LIB_SHA512_ARCH ################################################################################ +obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o +libsha3-y := sha3.o + +ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y) +CFLAGS_sha3.o += -I$(src)/$(SRCARCH) +libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o +endif # CONFIG_CRYPTO_LIB_SHA3_ARCH + +################################################################################ + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o diff --git a/arch/arm/crypto/blake2b-neon-core.S b/lib/crypto/arm/blake2b-neon-core.S index 0406a186377f..b55c37f0b88f 100644 --- a/arch/arm/crypto/blake2b-neon-core.S +++ b/lib/crypto/arm/blake2b-neon-core.S @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * BLAKE2b digest algorithm, NEON accelerated + * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM + * processors that have NEON support but not the ARMv8 Crypto Extensions, + * typically this BLAKE2b implementation is much faster than the SHA-2 family + * and slightly faster than SHA-1. * * Copyright 2020 Google LLC * @@ -13,8 +16,8 @@ .fpu neon // The arguments to blake2b_compress_neon() - STATE .req r0 - BLOCK .req r1 + CTX .req r0 + DATA .req r1 NBLOCKS .req r2 INC .req r3 @@ -234,10 +237,10 @@ .endm // -// void blake2b_compress_neon(struct blake2b_state *state, -// const u8 *block, size_t nblocks, u32 inc); +// void blake2b_compress_neon(struct blake2b_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); // -// Only the first three fields of struct blake2b_state are used: +// Only the first three fields of struct blake2b_ctx are used: // u64 h[8]; (inout) // u64 t[2]; (inout) // u64 f[2]; (in) @@ -255,7 +258,7 @@ ENTRY(blake2b_compress_neon) adr ROR24_TABLE, .Lror24_table adr ROR16_TABLE, .Lror16_table - mov ip, STATE + mov ip, CTX vld1.64 {q0-q1}, [ip]! 
// Load h[0..3] vld1.64 {q2-q3}, [ip]! // Load h[4..7] .Lnext_block: @@ -281,14 +284,14 @@ ENTRY(blake2b_compress_neon) // (q8-q9) in an aligned buffer on the stack so that they can be // reloaded when needed. (We could just reload directly from the // message buffer, but it's faster to use aligned loads.) - vld1.8 {q8-q9}, [BLOCK]! + vld1.8 {q8-q9}, [DATA]! veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1] - vld1.8 {q10-q11}, [BLOCK]! + vld1.8 {q10-q11}, [DATA]! veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1] - vld1.8 {q12-q13}, [BLOCK]! + vld1.8 {q12-q13}, [DATA]! vst1.8 {q8-q9}, [sp, :256] - mov ip, STATE - vld1.8 {q14-q15}, [BLOCK]! + mov ip, CTX + vld1.8 {q14-q15}, [DATA]! // Execute the rounds. Each round is provided the order in which it // needs to use the message words. @@ -319,7 +322,7 @@ ENTRY(blake2b_compress_neon) veor q3, q3, q7 // v[6..7] ^= v[14..15] veor q0, q0, q8 // v[0..1] ^= h[0..1] veor q1, q1, q9 // v[2..3] ^= h[2..3] - mov ip, STATE + mov ip, CTX subs NBLOCKS, NBLOCKS, #1 // nblocks-- vst1.64 {q0-q1}, [ip]! // Store new h[0..3] veor q2, q2, q10 // v[4..5] ^= h[4..5] diff --git a/lib/crypto/arm/blake2b.h b/lib/crypto/arm/blake2b.h new file mode 100644 index 000000000000..5c76498521e6 --- /dev/null +++ b/lib/crypto/arm/blake2b.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2b digest algorithm, NEON accelerated + * + * Copyright 2020 Google LLC + */ + +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc); + +static void blake2b_compress(struct blake2b_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) +{ + if (!static_branch_likely(&have_neon) || !may_use_simd()) { + blake2b_compress_generic(ctx, data, nblocks, inc); + return; + } + do { + const size_t blocks = min_t(size_t, nblocks, + SZ_4K / BLAKE2B_BLOCK_SIZE); + + scoped_ksimd() + blake2b_compress_neon(ctx, data, blocks, inc); + + data += blocks * BLAKE2B_BLOCK_SIZE; + nblocks -= blocks; + } while (nblocks); +} + +#define blake2b_mod_init_arch blake2b_mod_init_arch +static void blake2b_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) + static_branch_enable(&have_neon); +} diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S index 293f44fa8f31..933f0558b7cd 100644 --- a/lib/crypto/arm/blake2s-core.S +++ b/lib/crypto/arm/blake2s-core.S @@ -115,7 +115,7 @@ // Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] // are in r0..r9. The stack pointer points to 8 bytes of scratch space for -// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and +// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and // r14 are free to use. The macro arguments s0-s15 give the order in which the // message words are used in this round. 
// @@ -170,10 +170,10 @@ .endm // -// void blake2s_compress(struct blake2s_state *state, -// const u8 *block, size_t nblocks, u32 inc); +// void blake2s_compress(struct blake2s_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); // -// Only the first three fields of struct blake2s_state are used: +// Only the first three fields of struct blake2s_ctx are used: // u32 h[8]; (inout) // u32 t[2]; (inout) // u32 f[2]; (in) @@ -183,8 +183,8 @@ ENTRY(blake2s_compress) push {r0-r2,r4-r11,lr} // keep this an even number .Lnext_block: - // r0 is 'state' - // r1 is 'block' + // r0 is 'ctx' + // r1 is 'data' // r3 is 'inc' // Load and increment the counter t[0..1]. @@ -209,18 +209,18 @@ ENTRY(blake2s_compress) .Lcopy_block_done: str r1, [sp, #68] // Update message pointer - // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space + // Calculate v[8..15]. Push v[10..15] onto the stack, and leave space // for spilling v[8..9]. Leave v[8..9] in r8-r9. - mov r14, r0 // r14 = state + mov r14, r0 // r14 = ctx adr r12, .Lblake2s_IV ldmia r12!, {r8-r9} // load IV[0..1] __ldrd r0, r1, r14, 40 // load f[0..1] - ldm r12, {r2-r7} // load IV[3..7] + ldm r12, {r2-r7} // load IV[2..7] eor r4, r4, r10 // v[12] = IV[4] ^ t[0] eor r5, r5, r11 // v[13] = IV[5] ^ t[1] eor r6, r6, r0 // v[14] = IV[6] ^ f[0] eor r7, r7, r1 // v[15] = IV[7] ^ f[1] - push {r2-r7} // push v[9..15] + push {r2-r7} // push v[10..15] sub sp, sp, #8 // leave space for v[8..9] // Load h[0..7] == v[0..7]. @@ -275,7 +275,7 @@ ENTRY(blake2s_compress) // Advance to the next block, if there is one. Note that if there are // multiple blocks, then 'inc' (the counter increment amount) must be // 64. So we can simply set it to 64 without re-loading it. - ldm sp, {r0, r1, r2} // load (state, block, nblocks) + ldm sp, {r0, r1, r2} // load (ctx, data, nblocks) mov r3, #64 // set 'inc' subs r2, r2, #1 // nblocks-- str r2, [sp, #8] diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h index aa7a97139ea7..42c04440c191 100644 --- a/lib/crypto/arm/blake2s.h +++ b/lib/crypto/arm/blake2s.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* defined in blake2s-core.S */ -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, u32 inc); +void blake2s_compress(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc); diff --git a/lib/crypto/arm/chacha.h b/lib/crypto/arm/chacha.h index 0cae30f8ee5d..836e49088e98 100644 --- a/lib/crypto/arm/chacha.h +++ b/lib/crypto/arm/chacha.h @@ -12,7 +12,6 @@ #include <asm/cputype.h> #include <asm/hwcap.h> -#include <asm/neon.h> #include <asm/simd.h> asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, @@ -68,9 +67,8 @@ static void hchacha_block_arch(const struct chacha_state *state, if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { hchacha_block_arm(state, out, nrounds); } else { - kernel_neon_begin(); - hchacha_block_neon(state, out, nrounds); - kernel_neon_end(); + scoped_ksimd() + hchacha_block_neon(state, out, nrounds); } } @@ -87,9 +85,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, do { unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); + scoped_ksimd() + chacha_doneon(state, dst, src, todo, nrounds); bytes -= todo; src += todo; diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h index f6d66494eb8f..b1a566885e95 100644 --- a/lib/crypto/arm/curve25519.h +++ 
b/lib/crypto/arm/curve25519.h @@ -25,9 +25,8 @@ static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], const u8 point[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&have_neon) && crypto_simd_usable()) { - kernel_neon_begin(); - curve25519_neon(out, scalar, point); - kernel_neon_end(); + scoped_ksimd() + curve25519_neon(out, scalar, point); } else { curve25519_generic(out, scalar, point); } diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h index 0021cf368307..0fe903d8de55 100644 --- a/lib/crypto/arm/poly1305.h +++ b/lib/crypto/arm/poly1305.h @@ -6,7 +6,6 @@ */ #include <asm/hwcap.h> -#include <asm/neon.h> #include <asm/simd.h> #include <linux/cpufeature.h> #include <linux/jump_label.h> @@ -32,9 +31,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, do { unsigned int todo = min_t(unsigned int, len, SZ_4K); - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); + scoped_ksimd() + poly1305_blocks_neon(state, src, todo, padbit); len -= todo; src += todo; diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S index 6edba3ab62e8..a0323fa5c58a 100644 --- a/lib/crypto/arm/sha1-armv7-neon.S +++ b/lib/crypto/arm/sha1-armv7-neon.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function +/* ARM/NEON accelerated SHA-1 transform function * * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> */ diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S index 2de40dd25e47..7d6b2631ca8d 100644 --- a/lib/crypto/arm/sha1-ce-core.S +++ b/lib/crypto/arm/sha1-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * SHA-1 secure hash using ARMv8 Crypto Extensions * * Copyright (C) 2015 Linaro Ltd. * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h index 29f8bcad0447..3e2d8c7cab9f 100644 --- a/lib/crypto/arm/sha1.h +++ b/lib/crypto/arm/sha1.h @@ -4,7 +4,6 @@ * * Copyright 2025 Google LLC */ -#include <asm/neon.h> #include <asm/simd.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); @@ -22,12 +21,12 @@ static void sha1_blocks(struct sha1_block_state *state, { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_neon) && likely(may_use_simd())) { - kernel_neon_begin(); - if (static_branch_likely(&have_ce)) - sha1_ce_transform(state, data, nblocks); - else - sha1_transform_neon(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() { + if (static_branch_likely(&have_ce)) + sha1_ce_transform(state, data, nblocks); + else + sha1_transform_neon(state, data, nblocks); + } } else { sha1_block_data_order(state, data, nblocks); } diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S index 7481ac8e6c0d..144ee805f64a 100644 --- a/lib/crypto/arm/sha256-ce.S +++ b/lib/crypto/arm/sha256-ce.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions + * SHA-224/256 secure hash using ARMv8 Crypto Extensions * * Copyright (C) 2015 Linaro Ltd. 
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h index 7556457b3094..ae7e52dd6e3b 100644 --- a/lib/crypto/arm/sha256.h +++ b/lib/crypto/arm/sha256.h @@ -22,12 +22,12 @@ static void sha256_blocks(struct sha256_block_state *state, { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_neon) && likely(may_use_simd())) { - kernel_neon_begin(); - if (static_branch_likely(&have_ce)) - sha256_ce_transform(state, data, nblocks); - else - sha256_block_data_order_neon(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() { + if (static_branch_likely(&have_ce)) + sha256_ce_transform(state, data, nblocks); + else + sha256_block_data_order_neon(state, data, nblocks); + } } else { sha256_block_data_order(state, data, nblocks); } diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h index d1b485dd275d..ed9bd81d6d78 100644 --- a/lib/crypto/arm/sha512.h +++ b/lib/crypto/arm/sha512.h @@ -19,9 +19,8 @@ static void sha512_blocks(struct sha512_block_state *state, { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_neon) && likely(may_use_simd())) { - kernel_neon_begin(); - sha512_block_data_order_neon(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() + sha512_block_data_order_neon(state, data, nblocks); } else { sha512_block_data_order(state, data, nblocks); } diff --git a/lib/crypto/arm64/chacha.h b/lib/crypto/arm64/chacha.h index ba6c22d46086..ca8c6a8b0578 100644 --- a/lib/crypto/arm64/chacha.h +++ b/lib/crypto/arm64/chacha.h @@ -23,7 +23,6 @@ #include <linux/kernel.h> #include <asm/hwcap.h> -#include <asm/neon.h> #include <asm/simd.h> asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, @@ -65,9 +64,8 @@ static void hchacha_block_arch(const struct chacha_state *state, if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { hchacha_block_generic(state, out, nrounds); } else { - kernel_neon_begin(); - hchacha_block_neon(state, out, nrounds); - kernel_neon_end(); + scoped_ksimd() + hchacha_block_neon(state, out, nrounds); } } @@ -81,9 +79,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, do { unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); + scoped_ksimd() + chacha_doneon(state, dst, src, todo, nrounds); bytes -= todo; src += todo; diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h index aed5921ccd9a..b77669767cd6 100644 --- a/lib/crypto/arm64/poly1305.h +++ b/lib/crypto/arm64/poly1305.h @@ -6,7 +6,6 @@ */ #include <asm/hwcap.h> -#include <asm/neon.h> #include <asm/simd.h> #include <linux/cpufeature.h> #include <linux/jump_label.h> @@ -31,9 +30,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, do { unsigned int todo = min_t(unsigned int, len, SZ_4K); - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); + scoped_ksimd() + poly1305_blocks_neon(state, src, todo, padbit); len -= todo; src += todo; diff --git a/arch/arm64/crypto/polyval-ce-core.S b/lib/crypto/arm64/polyval-ce-core.S index b5326540d2e3..7c731a044d02 100644 --- a/arch/arm64/crypto/polyval-ce-core.S +++ b/lib/crypto/arm64/polyval-ce-core.S @@ -27,10 +27,10 @@ #include <linux/linkage.h> #define STRIDE_BLOCKS 8 -KEY_POWERS .req x0 -MSG .req x1 -BLOCKS_LEFT .req x2 -ACCUMULATOR .req x3 +ACCUMULATOR .req x0 +KEY_POWERS .req x1 +MSG .req x2 +BLOCKS_LEFT .req x3 KEY_START .req x10 
EXTRA_BYTES .req x11 TMP .req x13 @@ -300,15 +300,12 @@ GSTAR .req v24 .endm /* - * Perform montgomery multiplication in GF(2^128) and store result in op1. + * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1. * - * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 - * If op1, op2 are in montgomery form, this computes the montgomery - * form of op1*op2. - * - * void pmull_polyval_mul(u8 *op1, const u8 *op2); + * void polyval_mul_pmull(struct polyval_elem *a, + * const struct polyval_elem *b); */ -SYM_FUNC_START(pmull_polyval_mul) +SYM_FUNC_START(polyval_mul_pmull) adr TMP, .Lgstar ld1 {GSTAR.2d}, [TMP] ld1 {v0.16b}, [x0] @@ -318,22 +315,23 @@ SYM_FUNC_START(pmull_polyval_mul) montgomery_reduction SUM st1 {SUM.16b}, [x0] ret -SYM_FUNC_END(pmull_polyval_mul) +SYM_FUNC_END(polyval_mul_pmull) /* * Perform polynomial evaluation as specified by POLYVAL. This computes: * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} * where n=nblocks, h is the hash key, and m_i are the message blocks. * - * x0 - pointer to precomputed key powers h^8 ... h^1 - * x1 - pointer to message blocks - * x2 - number of blocks to hash - * x3 - pointer to accumulator + * x0 - pointer to accumulator + * x1 - pointer to precomputed key powers h^8 ... h^1 + * x2 - pointer to message blocks + * x3 - number of blocks to hash * - * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, - * size_t nblocks, u8 *accumulator); + * void polyval_blocks_pmull(struct polyval_elem *acc, + * const struct polyval_key *key, + * const u8 *data, size_t nblocks); */ -SYM_FUNC_START(pmull_polyval_update) +SYM_FUNC_START(polyval_blocks_pmull) adr TMP, .Lgstar mov KEY_START, KEY_POWERS ld1 {GSTAR.2d}, [TMP] @@ -358,4 +356,4 @@ SYM_FUNC_START(pmull_polyval_update) .LskipPartial: st1 {SUM.16b}, [ACCUMULATOR] ret -SYM_FUNC_END(pmull_polyval_update) +SYM_FUNC_END(polyval_blocks_pmull) diff --git a/lib/crypto/arm64/polyval.h b/lib/crypto/arm64/polyval.h new file mode 100644 index 000000000000..a39763395e9b --- /dev/null +++ b/lib/crypto/arm64/polyval.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * POLYVAL library functions, arm64 optimized + * + * Copyright 2025 Google LLC + */ +#include <asm/simd.h> +#include <linux/cpufeature.h> + +#define NUM_H_POWERS 8 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +asmlinkage void polyval_mul_pmull(struct polyval_elem *a, + const struct polyval_elem *b); +asmlinkage void polyval_blocks_pmull(struct polyval_elem *acc, + const struct polyval_key *key, + const u8 *data, size_t nblocks); + +static void polyval_preparekey_arch(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS); + memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE); + if (static_branch_likely(&have_pmull) && may_use_simd()) { + scoped_ksimd() { + for (int i = NUM_H_POWERS - 2; i >= 0; i--) { + key->h_powers[i] = key->h_powers[i + 1]; + polyval_mul_pmull( + &key->h_powers[i], + &key->h_powers[NUM_H_POWERS - 1]); + } + } + } else { + for (int i = NUM_H_POWERS - 2; i >= 0; i--) { + key->h_powers[i] = key->h_powers[i + 1]; + polyval_mul_generic(&key->h_powers[i], + &key->h_powers[NUM_H_POWERS - 1]); + } + } +} + +static void polyval_mul_arch(struct polyval_elem *acc, + const struct polyval_key *key) +{ + if (static_branch_likely(&have_pmull) && may_use_simd()) { + scoped_ksimd() + polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]); + } else { + 
polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]); + } +} + +static void polyval_blocks_arch(struct polyval_elem *acc, + const struct polyval_key *key, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_pmull) && may_use_simd()) { + do { + /* Allow rescheduling every 4 KiB. */ + size_t n = min_t(size_t, nblocks, + 4096 / POLYVAL_BLOCK_SIZE); + + scoped_ksimd() + polyval_blocks_pmull(acc, key, data, n); + data += n * POLYVAL_BLOCK_SIZE; + nblocks -= n; + } while (nblocks); + } else { + polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1], + data, nblocks); + } +} + +#define polyval_mod_init_arch polyval_mod_init_arch +static void polyval_mod_init_arch(void) +{ + if (cpu_have_named_feature(PMULL)) + static_branch_enable(&have_pmull); +} diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S index 21efbbafd7d6..8fbd4767f0f0 100644 --- a/lib/crypto/arm64/sha1-ce-core.S +++ b/lib/crypto/arm64/sha1-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * SHA-1 secure hash using ARMv8 Crypto Extensions * * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> */ diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h index aaef4ebfc5e3..bc7071f1be09 100644 --- a/lib/crypto/arm64/sha1.h +++ b/lib/crypto/arm64/sha1.h @@ -4,7 +4,6 @@ * * Copyright 2025 Google LLC */ -#include <asm/neon.h> #include <asm/simd.h> #include <linux/cpufeature.h> @@ -20,9 +19,9 @@ static void sha1_blocks(struct sha1_block_state *state, do { size_t rem; - kernel_neon_begin(); - rem = __sha1_ce_transform(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() + rem = __sha1_ce_transform(state, data, nblocks); + data += (nblocks - rem) * SHA1_BLOCK_SIZE; nblocks = rem; } while (nblocks); diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S index 410174ba5237..e4bfe42a61a9 100644 --- a/lib/crypto/arm64/sha256-ce.S +++ b/lib/crypto/arm64/sha256-ce.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * Core SHA-224/SHA-256 transform using v8 Crypto Extensions * * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> */ diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h index 80d06df27d3a..568dff0f276a 100644 --- a/lib/crypto/arm64/sha256.h +++ b/lib/crypto/arm64/sha256.h @@ -4,7 +4,6 @@ * * Copyright 2025 Google LLC */ -#include <asm/neon.h> #include <asm/simd.h> #include <linux/cpufeature.h> @@ -27,17 +26,16 @@ static void sha256_blocks(struct sha256_block_state *state, do { size_t rem; - kernel_neon_begin(); - rem = __sha256_ce_transform(state, - data, nblocks); - kernel_neon_end(); + scoped_ksimd() + rem = __sha256_ce_transform(state, data, + nblocks); + data += (nblocks - rem) * SHA256_BLOCK_SIZE; nblocks = rem; } while (nblocks); } else { - kernel_neon_begin(); - sha256_block_neon(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() + sha256_block_neon(state, data, nblocks); } } else { sha256_block_data_order(state, data, nblocks); @@ -66,9 +64,8 @@ static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE && len <= 65536 && likely(may_use_simd())) { - kernel_neon_begin(); - sha256_ce_finup2x(ctx, data1, data2, len, out1, out2); - kernel_neon_end(); + scoped_ksimd() + sha256_ce_finup2x(ctx, data1, data2, len, out1, 
out2); kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); return true; diff --git a/arch/arm64/crypto/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S index 9c77313f5a60..ace90b506490 100644 --- a/arch/arm64/crypto/sha3-ce-core.S +++ b/lib/crypto/arm64/sha3-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions + * Core SHA-3 transform using v8.2 Crypto Extensions * * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> * @@ -37,7 +37,11 @@ .endm /* - * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) + * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data, + * size_t nblocks, size_t block_size) + * + * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136 + * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128). */ .text SYM_FUNC_START(sha3_ce_transform) @@ -51,58 +55,55 @@ SYM_FUNC_START(sha3_ce_transform) ld1 {v20.1d-v23.1d}, [x8], #32 ld1 {v24.1d}, [x8] -0: sub w2, w2, #1 +0: sub x2, x2, #1 mov w8, #24 adr_l x9, .Lsha3_rcon /* load input */ ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b-v31.8b}, [x1], #24 + ld1 {v29.8b}, [x1], #8 eor v0.8b, v0.8b, v25.8b eor v1.8b, v1.8b, v26.8b eor v2.8b, v2.8b, v27.8b eor v3.8b, v3.8b, v28.8b eor v4.8b, v4.8b, v29.8b - eor v5.8b, v5.8b, v30.8b - eor v6.8b, v6.8b, v31.8b - - tbnz x3, #6, 2f // SHA3-512 ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b-v30.8b}, [x1], #16 - eor v7.8b, v7.8b, v25.8b - eor v8.8b, v8.8b, v26.8b - eor v9.8b, v9.8b, v27.8b - eor v10.8b, v10.8b, v28.8b - eor v11.8b, v11.8b, v29.8b - eor v12.8b, v12.8b, v30.8b + eor v5.8b, v5.8b, v25.8b + eor v6.8b, v6.8b, v26.8b + eor v7.8b, v7.8b, v27.8b + eor v8.8b, v8.8b, v28.8b + cmp x3, #72 + b.eq 3f /* SHA3-512 (block_size=72)? */ - tbnz x3, #4, 1f // SHA3-384 or SHA3-224 + ld1 {v25.8b-v28.8b}, [x1], #32 + eor v9.8b, v9.8b, v25.8b + eor v10.8b, v10.8b, v26.8b + eor v11.8b, v11.8b, v27.8b + eor v12.8b, v12.8b, v28.8b + cmp x3, #104 + b.eq 3f /* SHA3-384 (block_size=104)? */ - // SHA3-256 ld1 {v25.8b-v28.8b}, [x1], #32 eor v13.8b, v13.8b, v25.8b eor v14.8b, v14.8b, v26.8b eor v15.8b, v15.8b, v27.8b eor v16.8b, v16.8b, v28.8b - b 3f - -1: tbz x3, #2, 3f // bit 2 cleared? SHA-384 + cmp x3, #144 + b.lt 3f /* SHA3-256 or SHAKE256 (block_size=136)? */ + b.eq 2f /* SHA3-224 (block_size=144)? 
*/ - // SHA3-224 + /* SHAKE128 (block_size=168) */ ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b}, [x1], #8 - eor v13.8b, v13.8b, v25.8b - eor v14.8b, v14.8b, v26.8b - eor v15.8b, v15.8b, v27.8b - eor v16.8b, v16.8b, v28.8b - eor v17.8b, v17.8b, v29.8b + eor v17.8b, v17.8b, v25.8b + eor v18.8b, v18.8b, v26.8b + eor v19.8b, v19.8b, v27.8b + eor v20.8b, v20.8b, v28.8b b 3f - - // SHA3-512 -2: ld1 {v25.8b-v26.8b}, [x1], #16 - eor v7.8b, v7.8b, v25.8b - eor v8.8b, v8.8b, v26.8b +2: + /* SHA3-224 (block_size=144) */ + ld1 {v25.8b}, [x1], #8 + eor v17.8b, v17.8b, v25.8b 3: sub w8, w8, #1 @@ -185,7 +186,7 @@ SYM_FUNC_START(sha3_ce_transform) cbnz w8, 3b cond_yield 4f, x8, x9 - cbnz w2, 0b + cbnz x2, 0b /* save state */ 4: st1 { v0.1d- v3.1d}, [x0], #32 @@ -195,7 +196,7 @@ SYM_FUNC_START(sha3_ce_transform) st1 {v16.1d-v19.1d}, [x0], #32 st1 {v20.1d-v23.1d}, [x0], #32 st1 {v24.1d}, [x0] - mov w0, w2 + mov x0, x2 ret SYM_FUNC_END(sha3_ce_transform) diff --git a/lib/crypto/arm64/sha3.h b/lib/crypto/arm64/sha3.h new file mode 100644 index 000000000000..b602f1b3b282 --- /dev/null +++ b/lib/crypto/arm64/sha3.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/simd.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3); + +asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size); + +static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size) +{ + if (static_branch_likely(&have_sha3) && likely(may_use_simd())) { + do { + size_t rem; + + scoped_ksimd() + rem = sha3_ce_transform(state, data, nblocks, + block_size); + data += (nblocks - rem) * block_size; + nblocks = rem; + } while (nblocks); + } else { + sha3_absorb_blocks_generic(state, data, nblocks, block_size); + } +} + +static void sha3_keccakf(struct sha3_state *state) +{ + if (static_branch_likely(&have_sha3) && likely(may_use_simd())) { + /* + * Passing zeroes into sha3_ce_transform() gives the plain + * Keccak-f permutation, which is what we want here. Any + * supported block size may be used. Use SHA3_512_BLOCK_SIZE + * since it's the shortest. 
+ */ + static const u8 zeroes[SHA3_512_BLOCK_SIZE]; + + scoped_ksimd() + sha3_ce_transform(state, zeroes, 1, sizeof(zeroes)); + } else { + sha3_keccakf_generic(state); + } +} + +#define sha3_mod_init_arch sha3_mod_init_arch +static void sha3_mod_init_arch(void) +{ + if (cpu_have_named_feature(SHA3)) + static_branch_enable(&have_sha3); +} diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S index 22f1ded89bc8..ffd51acfd1ee 100644 --- a/lib/crypto/arm64/sha512-ce-core.S +++ b/lib/crypto/arm64/sha512-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions + * Core SHA-384/SHA-512 transform using v8 Crypto Extensions * * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> * diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h index ddb0d256f73a..7eb7ef04d268 100644 --- a/lib/crypto/arm64/sha512.h +++ b/lib/crypto/arm64/sha512.h @@ -4,7 +4,7 @@ * * Copyright 2025 Google LLC */ -#include <asm/neon.h> + #include <asm/simd.h> #include <linux/cpufeature.h> @@ -24,9 +24,9 @@ static void sha512_blocks(struct sha512_block_state *state, do { size_t rem; - kernel_neon_begin(); - rem = __sha512_ce_transform(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() + rem = __sha512_ce_transform(state, data, nblocks); + data += (nblocks - rem) * SHA512_BLOCK_SIZE; nblocks = rem; } while (nblocks); diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c new file mode 100644 index 000000000000..09c6d65d8a6e --- /dev/null +++ b/lib/crypto/blake2b.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright 2025 Google LLC + * + * This is an implementation of the BLAKE2b hash and PRF functions. 
+ * + * Information: https://blake2.net/ + */ + +#include <crypto/blake2b.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/types.h> + +static const u8 blake2b_sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + +static inline void blake2b_increment_counter(struct blake2b_ctx *ctx, u32 inc) +{ + ctx->t[0] += inc; + ctx->t[1] += (ctx->t[0] < inc); +} + +static void __maybe_unused +blake2b_compress_generic(struct blake2b_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) +{ + u64 m[16]; + u64 v[16]; + int i; + + WARN_ON(IS_ENABLED(DEBUG) && + (nblocks > 1 && inc != BLAKE2B_BLOCK_SIZE)); + + while (nblocks > 0) { + blake2b_increment_counter(ctx, inc); + memcpy(m, data, BLAKE2B_BLOCK_SIZE); + le64_to_cpu_array(m, ARRAY_SIZE(m)); + memcpy(v, ctx->h, 64); + v[ 8] = BLAKE2B_IV0; + v[ 9] = BLAKE2B_IV1; + v[10] = BLAKE2B_IV2; + v[11] = BLAKE2B_IV3; + v[12] = BLAKE2B_IV4 ^ ctx->t[0]; + v[13] = BLAKE2B_IV5 ^ ctx->t[1]; + v[14] = BLAKE2B_IV6 ^ ctx->f[0]; + v[15] = BLAKE2B_IV7 ^ ctx->f[1]; + +#define G(r, i, a, b, c, d) do { \ + a += b + m[blake2b_sigma[r][2 * i + 0]]; \ + d = ror64(d ^ a, 32); \ + c += d; \ + b = ror64(b ^ c, 24); \ + a += b + m[blake2b_sigma[r][2 * i + 1]]; \ + d = ror64(d ^ a, 16); \ + c += d; \ + b = ror64(b ^ c, 63); \ +} while (0) + +#define ROUND(r) do { \ + G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[2], v[ 6], v[10], v[14]); \ + G(r, 3, v[3], v[ 7], v[11], v[15]); \ + G(r, 4, v[0], v[ 5], v[10], v[15]); \ + G(r, 5, v[1], v[ 6], v[11], v[12]); \ + G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ +} while (0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + +#undef G +#undef ROUND + + for (i = 0; i < 8; ++i) + ctx->h[i] ^= v[i] ^ v[i + 8]; + + data += BLAKE2B_BLOCK_SIZE; + --nblocks; + } +} + +#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH +#include "blake2b.h" /* $(SRCARCH)/blake2b.h */ +#else +#define blake2b_compress blake2b_compress_generic +#endif + +static inline void blake2b_set_lastblock(struct blake2b_ctx *ctx) +{ + ctx->f[0] = -1; +} + +void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen) +{ + const size_t fill = BLAKE2B_BLOCK_SIZE - ctx->buflen; + + if (unlikely(!inlen)) + return; + if (inlen > fill) { + memcpy(ctx->buf + ctx->buflen, in, fill); + blake2b_compress(ctx, ctx->buf, 1, BLAKE2B_BLOCK_SIZE); + ctx->buflen = 0; + in += fill; + inlen -= fill; + } + if (inlen > BLAKE2B_BLOCK_SIZE) { + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2B_BLOCK_SIZE); + + blake2b_compress(ctx, in, nblocks - 1, BLAKE2B_BLOCK_SIZE); + in += BLAKE2B_BLOCK_SIZE * (nblocks - 1); + inlen -= BLAKE2B_BLOCK_SIZE * (nblocks - 1); 
+ } + memcpy(ctx->buf + ctx->buflen, in, inlen); + ctx->buflen += inlen; +} +EXPORT_SYMBOL(blake2b_update); + +void blake2b_final(struct blake2b_ctx *ctx, u8 *out) +{ + WARN_ON(IS_ENABLED(DEBUG) && !out); + blake2b_set_lastblock(ctx); + memset(ctx->buf + ctx->buflen, 0, + BLAKE2B_BLOCK_SIZE - ctx->buflen); /* Padding */ + blake2b_compress(ctx, ctx->buf, 1, ctx->buflen); + cpu_to_le64_array(ctx->h, ARRAY_SIZE(ctx->h)); + memcpy(out, ctx->h, ctx->outlen); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL(blake2b_final); + +#ifdef blake2b_mod_init_arch +static int __init blake2b_mod_init(void) +{ + blake2b_mod_init_arch(); + return 0; +} +subsys_initcall(blake2b_mod_init); + +static void __exit blake2b_mod_exit(void) +{ +} +module_exit(blake2b_mod_exit); +#endif + +MODULE_DESCRIPTION("BLAKE2b hash function"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 5638ed9d882d..6182c21ed943 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -29,16 +29,15 @@ static const u8 blake2s_sigma[10][16] = { { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, }; -static inline void blake2s_increment_counter(struct blake2s_state *state, - const u32 inc) +static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc) { - state->t[0] += inc; - state->t[1] += (state->t[0] < inc); + ctx->t[0] += inc; + ctx->t[1] += (ctx->t[0] < inc); } static void __maybe_unused -blake2s_compress_generic(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) +blake2s_compress_generic(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) { u32 m[16]; u32 v[16]; @@ -48,18 +47,18 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block, (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE)); while (nblocks > 0) { - blake2s_increment_counter(state, inc); - memcpy(m, block, BLAKE2S_BLOCK_SIZE); + blake2s_increment_counter(ctx, inc); + memcpy(m, data, BLAKE2S_BLOCK_SIZE); le32_to_cpu_array(m, ARRAY_SIZE(m)); - memcpy(v, state->h, 32); + memcpy(v, ctx->h, 32); v[ 8] = BLAKE2S_IV0; v[ 9] = BLAKE2S_IV1; v[10] = BLAKE2S_IV2; v[11] = BLAKE2S_IV3; - v[12] = BLAKE2S_IV4 ^ state->t[0]; - v[13] = BLAKE2S_IV5 ^ state->t[1]; - v[14] = BLAKE2S_IV6 ^ state->f[0]; - v[15] = BLAKE2S_IV7 ^ state->f[1]; + v[12] = BLAKE2S_IV4 ^ ctx->t[0]; + v[13] = BLAKE2S_IV5 ^ ctx->t[1]; + v[14] = BLAKE2S_IV6 ^ ctx->f[0]; + v[15] = BLAKE2S_IV7 ^ ctx->f[1]; #define G(r, i, a, b, c, d) do { \ a += b + m[blake2s_sigma[r][2 * i + 0]]; \ @@ -97,9 +96,9 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block, #undef ROUND for (i = 0; i < 8; ++i) - state->h[i] ^= v[i] ^ v[i + 8]; + ctx->h[i] ^= v[i] ^ v[i + 8]; - block += BLAKE2S_BLOCK_SIZE; + data += BLAKE2S_BLOCK_SIZE; --nblocks; } } @@ -110,45 +109,46 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block, #define blake2s_compress blake2s_compress_generic #endif -static inline void blake2s_set_lastblock(struct blake2s_state *state) +static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx) { - state->f[0] = -1; + ctx->f[0] = -1; } -void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen) +void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen) { - const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; + const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen; if (unlikely(!inlen)) return; if (inlen > fill) { - memcpy(state->buf + state->buflen, in, fill); - blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); - 
state->buflen = 0; + memcpy(ctx->buf + ctx->buflen, in, fill); + blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE); + ctx->buflen = 0; in += fill; inlen -= fill; } if (inlen > BLAKE2S_BLOCK_SIZE) { const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); - blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + + blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); } - memcpy(state->buf + state->buflen, in, inlen); - state->buflen += inlen; + memcpy(ctx->buf + ctx->buflen, in, inlen); + ctx->buflen += inlen; } EXPORT_SYMBOL(blake2s_update); -void blake2s_final(struct blake2s_state *state, u8 *out) +void blake2s_final(struct blake2s_ctx *ctx, u8 *out) { WARN_ON(IS_ENABLED(DEBUG) && !out); - blake2s_set_lastblock(state); - memset(state->buf + state->buflen, 0, - BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - blake2s_compress(state, state->buf, 1, state->buflen); - cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); - memcpy(out, state->h, state->outlen); - memzero_explicit(state, sizeof(*state)); + blake2s_set_lastblock(ctx); + memset(ctx->buf + ctx->buflen, 0, + BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */ + blake2s_compress(ctx, ctx->buf, 1, ctx->buflen); + cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h)); + memcpy(out, ctx->h, ctx->outlen); + memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(blake2s_final); diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c index 0b49d6aedefd..212ce33562af 100644 --- a/lib/crypto/chacha20poly1305.c +++ b/lib/crypto/chacha20poly1305.c @@ -89,7 +89,7 @@ __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]) + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]) { struct chacha_state chacha_state; u32 k[CHACHA_KEY_WORDS]; @@ -111,8 +111,8 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt); void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, - const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], - const u8 key[CHACHA20POLY1305_KEY_SIZE]) + const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE], + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]) { struct chacha_state chacha_state; @@ -170,7 +170,7 @@ __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]) + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]) { struct chacha_state chacha_state; u32 k[CHACHA_KEY_WORDS]; @@ -195,8 +195,8 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt); bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, - const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], - const u8 key[CHACHA20POLY1305_KEY_SIZE]) + const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE], + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]) { struct chacha_state chacha_state; @@ -211,7 +211,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE], + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE], int encrypt) { const u8 *pad0 = page_address(ZERO_PAGE(0)); @@ -335,7 +335,7 @@ bool 
chacha20poly1305_crypt_sg_inplace(struct scatterlist *src, bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]) + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]) { return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len, nonce, key, 1); @@ -345,7 +345,7 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace); bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]) + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]) { if (unlikely(src_len < POLY1305_DIGEST_SIZE)) return false; diff --git a/lib/crypto/fips.h b/lib/crypto/fips.h new file mode 100644 index 000000000000..023410c2e0db --- /dev/null +++ b/lib/crypto/fips.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: gen-fips-testvecs.py */ + +#include <linux/fips.h> + +static const u8 fips_test_data[] __initconst __maybe_unused = { + 0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73, + 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, +}; + +static const u8 fips_test_key[] __initconst __maybe_unused = { + 0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73, + 0x74, 0x20, 0x6b, 0x65, 0x79, 0x00, 0x00, 0x00, +}; + +static const u8 fips_test_hmac_sha1_value[] __initconst __maybe_unused = { + 0x29, 0xa9, 0x88, 0xb8, 0x5c, 0xb4, 0xaf, 0x4b, + 0x97, 0x2a, 0xee, 0x87, 0x5b, 0x0a, 0x02, 0x55, + 0x99, 0xbf, 0x86, 0x78, +}; + +static const u8 fips_test_hmac_sha256_value[] __initconst __maybe_unused = { + 0x59, 0x25, 0x85, 0xcc, 0x40, 0xe9, 0x64, 0x2f, + 0xe9, 0xbf, 0x82, 0xb7, 0xd3, 0x15, 0x3d, 0x43, + 0x22, 0x0b, 0x4c, 0x00, 0x90, 0x14, 0x25, 0xcf, + 0x9e, 0x13, 0x2b, 0xc2, 0x30, 0xe6, 0xe8, 0x93, +}; + +static const u8 fips_test_hmac_sha512_value[] __initconst __maybe_unused = { + 0x6b, 0xea, 0x5d, 0x27, 0x49, 0x5b, 0x3f, 0xea, + 0xde, 0x2d, 0xfa, 0x32, 0x75, 0xdb, 0x77, 0xc8, + 0x26, 0xe9, 0x4e, 0x95, 0x4d, 0xad, 0x88, 0x02, + 0x87, 0xf9, 0x52, 0x0a, 0xd1, 0x92, 0x80, 0x1d, + 0x92, 0x7e, 0x3c, 0xbd, 0xb1, 0x3c, 0x49, 0x98, + 0x44, 0x9c, 0x8f, 0xee, 0x3f, 0x02, 0x71, 0x51, + 0x57, 0x0b, 0x15, 0x38, 0x95, 0xd8, 0xa3, 0x81, + 0xba, 0xb3, 0x15, 0x37, 0x5c, 0x6d, 0x57, 0x2b, +}; + +static const u8 fips_test_sha3_256_value[] __initconst __maybe_unused = { + 0x77, 0xc4, 0x8b, 0x69, 0x70, 0x5f, 0x0a, 0xb1, + 0xb1, 0xa5, 0x82, 0x0a, 0x22, 0x2b, 0x49, 0x31, + 0xba, 0x9b, 0xb6, 0xaa, 0x32, 0xa7, 0x97, 0x00, + 0x98, 0xdb, 0xff, 0xe7, 0xc6, 0xde, 0xb5, 0x82, +}; diff --git a/lib/crypto/polyval.c b/lib/crypto/polyval.c new file mode 100644 index 000000000000..5796275f574a --- /dev/null +++ b/lib/crypto/polyval.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * POLYVAL library functions + * + * Copyright 2025 Google LLC + */ + +#include <crypto/polyval.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/unaligned.h> + +/* + * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL + * interprets the message as the coefficients of a polynomial in GF(2^128) and + * evaluates that polynomial at a secret point. POLYVAL has a simple + * mathematical relationship with GHASH, but it uses a better field convention + * which makes it easier and faster to implement. 
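+ *
+ * Both the generic and the arch-optimized code absorb the message
+ * Horner-style: for each 16-byte block B_i, the accumulator is updated as
+ *
+ *	acc = (acc ^ B_i) * H
+ *
+ * where '*' is the POLYVAL field multiplication, which folds in a factor of
+ * x^-128 (see polyval_mul_generic() below).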
+ * + * POLYVAL is not a cryptographic hash function, and it should be used only by + * algorithms that are specifically designed to use it. + * + * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated + * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452) + * + * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2" + * (https://eprint.iacr.org/2021/1441.pdf). + * + * This file provides a library API for POLYVAL. This API can delegate to + * either a generic implementation or an architecture-optimized implementation. + * + * For the generic implementation, we don't use the traditional table approach + * to GF(2^128) multiplication. That approach is not constant-time and requires + * a lot of memory. Instead, we use a different approach which emulates + * carryless multiplication using standard multiplications by spreading the data + * bits apart using "holes". This allows the carries to spill harmlessly. This + * approach is borrowed from BoringSSL, which in turn credits BearSSL's + * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the + * "holes" trick and a presentation by Shay Gueron + * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the + * 256-bit => 128-bit reduction algorithm. + */ + +#ifdef CONFIG_ARCH_SUPPORTS_INT128 + +/* Do a 64 x 64 => 128 bit carryless multiplication. */ +static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi) +{ + /* + * With 64-bit multiplicands and one term every 4 bits, there would be + * up to 64 / 4 = 16 one bits per column when each multiplication is + * written out as a series of additions in the schoolbook manner. + * Unfortunately, that doesn't work since the value 16 is 1 too large to + * fit in 4 bits. Carries would sometimes overflow into the next term. + * + * Using one term every 5 bits would work. However, that would cost + * 5 x 5 = 25 multiplications instead of 4 x 4 = 16. + * + * Instead, mask off 4 bits from one multiplicand, giving a max of 15 + * one bits per column. Then handle those 4 bits separately. + */ + u64 a0 = a & 0x1111111111111110; + u64 a1 = a & 0x2222222222222220; + u64 a2 = a & 0x4444444444444440; + u64 a3 = a & 0x8888888888888880; + + u64 b0 = b & 0x1111111111111111; + u64 b1 = b & 0x2222222222222222; + u64 b2 = b & 0x4444444444444444; + u64 b3 = b & 0x8888888888888888; + + /* Multiply the high 60 bits of @a by @b. */ + u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^ + (a2 * (u128)b2) ^ (a3 * (u128)b1); + u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^ + (a2 * (u128)b3) ^ (a3 * (u128)b2); + u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^ + (a2 * (u128)b0) ^ (a3 * (u128)b3); + u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^ + (a2 * (u128)b1) ^ (a3 * (u128)b0); + + /* Multiply the low 4 bits of @a by @b. */ + u64 e0 = -(a & 1) & b; + u64 e1 = -((a >> 1) & 1) & b; + u64 e2 = -((a >> 2) & 1) & b; + u64 e3 = -((a >> 3) & 1) & b; + u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3); + u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61); + + /* Add all the intermediate products together. 
*/ + *out_lo = (((u64)c0) & 0x1111111111111111) ^ + (((u64)c1) & 0x2222222222222222) ^ + (((u64)c2) & 0x4444444444444444) ^ + (((u64)c3) & 0x8888888888888888) ^ extra_lo; + *out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^ + (((u64)(c1 >> 64)) & 0x2222222222222222) ^ + (((u64)(c2 >> 64)) & 0x4444444444444444) ^ + (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi; +} + +#else /* CONFIG_ARCH_SUPPORTS_INT128 */ + +/* Do a 32 x 32 => 64 bit carryless multiplication. */ +static u64 clmul32(u32 a, u32 b) +{ + /* + * With 32-bit multiplicands and one term every 4 bits, there are up to + * 32 / 4 = 8 one bits per column when each multiplication is written + * out as a series of additions in the schoolbook manner. The value 8 + * fits in 4 bits, so the carries don't overflow into the next term. + */ + u32 a0 = a & 0x11111111; + u32 a1 = a & 0x22222222; + u32 a2 = a & 0x44444444; + u32 a3 = a & 0x88888888; + + u32 b0 = b & 0x11111111; + u32 b1 = b & 0x22222222; + u32 b2 = b & 0x44444444; + u32 b3 = b & 0x88888888; + + u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^ + (a2 * (u64)b2) ^ (a3 * (u64)b1); + u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^ + (a2 * (u64)b3) ^ (a3 * (u64)b2); + u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^ + (a2 * (u64)b0) ^ (a3 * (u64)b3); + u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^ + (a2 * (u64)b1) ^ (a3 * (u64)b0); + + /* Add all the intermediate products together. */ + return (c0 & 0x1111111111111111) ^ + (c1 & 0x2222222222222222) ^ + (c2 & 0x4444444444444444) ^ + (c3 & 0x8888888888888888); +} + +/* Do a 64 x 64 => 128 bit carryless multiplication. */ +static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi) +{ + u32 a_lo = (u32)a; + u32 a_hi = a >> 32; + u32 b_lo = (u32)b; + u32 b_hi = b >> 32; + + /* Karatsuba multiplication */ + u64 lo = clmul32(a_lo, b_lo); + u64 hi = clmul32(a_hi, b_hi); + u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi; + + *out_lo = lo ^ (mi << 32); + *out_hi = hi ^ (mi >> 32); +} +#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */ + +/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */ +static void __maybe_unused +polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b) +{ + u64 c0, c1, c2, c3, mi0, mi1; + + /* + * Carryless-multiply @a by @b using Karatsuba multiplication. Store + * the 256-bit product in @c0 (low) through @c3 (high). + */ + clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1); + clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3); + clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi), + &mi0, &mi1); + mi0 ^= c0 ^ c2; + mi1 ^= c1 ^ c3; + c1 ^= mi0; + c2 ^= mi1; + + /* + * Cancel out the low 128 bits of the product by adding multiples of + * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each + * of which cancels out 64 bits. Note that we break G(x) into three + * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1. + */ + + /* + * First, add G(x) times c0 as follows: + * + * (c0, c1, c2) = (0, + * c1 + (c0 * (x^63 + x^62 + x^57) mod x^64), + * c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64)) + */ + c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57); + c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7); + + /* + * Second, add G(x) times the new c1: + * + * (c1, c2, c3) = (0, + * c2 + (c1 * (x^63 + x^62 + x^57) mod x^64), + * c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64)) + */ + c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57); + c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7); + + /* Return (c2, c3). This implicitly multiplies by x^-128. 
*/ + a->lo = cpu_to_le64(c2); + a->hi = cpu_to_le64(c3); +} + +static void __maybe_unused +polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key, + const u8 *data, size_t nblocks) +{ + do { + acc->lo ^= get_unaligned((__le64 *)data); + acc->hi ^= get_unaligned((__le64 *)(data + 8)); + polyval_mul_generic(acc, key); + data += POLYVAL_BLOCK_SIZE; + } while (--nblocks); +} + +/* Include the arch-optimized implementation of POLYVAL, if one is available. */ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +#include "polyval.h" /* $(SRCARCH)/polyval.h */ +void polyval_preparekey(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + polyval_preparekey_arch(key, raw_key); +} +EXPORT_SYMBOL_GPL(polyval_preparekey); +#endif /* Else, polyval_preparekey() is an inline function. */ + +/* + * polyval_mul_generic() and polyval_blocks_generic() take the key as a + * polyval_elem rather than a polyval_key, so that arch-optimized + * implementations with a different key format can use it as a fallback (if they + * have H^1 stored somewhere in their struct). Thus, the following dispatch + * code is needed to pass the appropriate key argument. + */ + +static void polyval_mul(struct polyval_ctx *ctx) +{ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH + polyval_mul_arch(&ctx->acc, ctx->key); +#else + polyval_mul_generic(&ctx->acc, &ctx->key->h); +#endif +} + +static void polyval_blocks(struct polyval_ctx *ctx, + const u8 *data, size_t nblocks) +{ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH + polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks); +#else + polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks); +#endif +} + +void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len) +{ + if (unlikely(ctx->partial)) { + size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial); + + len -= n; + while (n--) + ctx->acc.bytes[ctx->partial++] ^= *data++; + if (ctx->partial < POLYVAL_BLOCK_SIZE) + return; + polyval_mul(ctx); + } + if (len >= POLYVAL_BLOCK_SIZE) { + size_t nblocks = len / POLYVAL_BLOCK_SIZE; + + polyval_blocks(ctx, data, nblocks); + data += len & ~(POLYVAL_BLOCK_SIZE - 1); + len &= POLYVAL_BLOCK_SIZE - 1; + } + for (size_t i = 0; i < len; i++) + ctx->acc.bytes[i] ^= data[i]; + ctx->partial = len; +} +EXPORT_SYMBOL_GPL(polyval_update); + +void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE]) +{ + if (unlikely(ctx->partial)) + polyval_mul(ctx); + memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(polyval_final); + +#ifdef polyval_mod_init_arch +static int __init polyval_mod_init(void) +{ + polyval_mod_init_arch(); + return 0; +} +subsys_initcall(polyval_mod_init); + +static void __exit polyval_mod_exit(void) +{ +} +module_exit(polyval_mod_exit); +#endif + +MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/s390/sha3.h b/lib/crypto/s390/sha3.h new file mode 100644 index 000000000000..85471404775a --- /dev/null +++ b/lib/crypto/s390/sha3.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-3 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include <asm/cpacf.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3_init_optim); + +static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size) +{ + if 
(static_branch_likely(&have_sha3)) { + /* + * Note that KIMD assumes little-endian order of the state + * words. sha3_state already uses that order, though, so + * there's no need for a byteswap. + */ + switch (block_size) { + case SHA3_224_BLOCK_SIZE: + cpacf_kimd(CPACF_KIMD_SHA3_224, state, + data, nblocks * block_size); + return; + case SHA3_256_BLOCK_SIZE: + /* + * This case handles both SHA3-256 and SHAKE256, since + * they have the same block size. + */ + cpacf_kimd(CPACF_KIMD_SHA3_256, state, + data, nblocks * block_size); + return; + case SHA3_384_BLOCK_SIZE: + cpacf_kimd(CPACF_KIMD_SHA3_384, state, + data, nblocks * block_size); + return; + case SHA3_512_BLOCK_SIZE: + cpacf_kimd(CPACF_KIMD_SHA3_512, state, + data, nblocks * block_size); + return; + } + } + sha3_absorb_blocks_generic(state, data, nblocks, block_size); +} + +static void sha3_keccakf(struct sha3_state *state) +{ + if (static_branch_likely(&have_sha3)) { + /* + * Passing zeroes into any of CPACF_KIMD_SHA3_* gives the plain + * Keccak-f permutation, which is what we want here. Use + * SHA3-512 since it has the smallest block size. + */ + static const u8 zeroes[SHA3_512_BLOCK_SIZE]; + + cpacf_kimd(CPACF_KIMD_SHA3_512, state, zeroes, sizeof(zeroes)); + } else { + sha3_keccakf_generic(state); + } +} + +static inline bool s390_sha3(int func, const u8 *in, size_t in_len, + u8 *out, size_t out_len) +{ + struct sha3_state state; + + if (!static_branch_likely(&have_sha3)) + return false; + + if (static_branch_likely(&have_sha3_init_optim)) + func |= CPACF_KLMD_NIP | CPACF_KLMD_DUFOP; + else + memset(&state, 0, sizeof(state)); + + cpacf_klmd(func, &state, in, in_len); + + if (static_branch_likely(&have_sha3_init_optim)) + kmsan_unpoison_memory(&state, out_len); + + memcpy(out, &state, out_len); + memzero_explicit(&state, sizeof(state)); + return true; +} + +#define sha3_224_arch sha3_224_arch +static bool sha3_224_arch(const u8 *in, size_t in_len, + u8 out[SHA3_224_DIGEST_SIZE]) +{ + return s390_sha3(CPACF_KLMD_SHA3_224, in, in_len, + out, SHA3_224_DIGEST_SIZE); +} + +#define sha3_256_arch sha3_256_arch +static bool sha3_256_arch(const u8 *in, size_t in_len, + u8 out[SHA3_256_DIGEST_SIZE]) +{ + return s390_sha3(CPACF_KLMD_SHA3_256, in, in_len, + out, SHA3_256_DIGEST_SIZE); +} + +#define sha3_384_arch sha3_384_arch +static bool sha3_384_arch(const u8 *in, size_t in_len, + u8 out[SHA3_384_DIGEST_SIZE]) +{ + return s390_sha3(CPACF_KLMD_SHA3_384, in, in_len, + out, SHA3_384_DIGEST_SIZE); +} + +#define sha3_512_arch sha3_512_arch +static bool sha3_512_arch(const u8 *in, size_t in_len, + u8 out[SHA3_512_DIGEST_SIZE]) +{ + return s390_sha3(CPACF_KLMD_SHA3_512, in, in_len, + out, SHA3_512_DIGEST_SIZE); +} + +#define sha3_mod_init_arch sha3_mod_init_arch +static void sha3_mod_init_arch(void) +{ + int num_present = 0; + int num_possible = 0; + + if (!cpu_have_feature(S390_CPU_FEATURE_MSA)) + return; + /* + * Since all the SHA-3 functions are in Message-Security-Assist + * Extension 6, just treat them as all or nothing. This way we need + * only one static_key. 
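	 * The per-function queries below exist only so that an unexpected
	 * partial implementation can be detected and warned about.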
+ */ +#define QUERY(opcode, func) \ + ({ num_present += !!cpacf_query_func(opcode, func); num_possible++; }) + QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_224); + QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_256); + QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_384); + QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_512); + QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_224); + QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_256); + QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_384); + QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_512); +#undef QUERY + + if (num_present == num_possible) { + static_branch_enable(&have_sha3); + if (test_facility(86)) + static_branch_enable(&have_sha3_init_optim); + } else if (num_present != 0) { + pr_warn("Unsupported combination of SHA-3 facilities\n"); + } +} diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index 5904e4ae85d2..52788278cd17 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/unaligned.h> #include <linux/wordpart.h> +#include "fips.h" static const struct sha1_block_state sha1_iv = { .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, @@ -330,10 +331,26 @@ void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len, } EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey); -#ifdef sha1_mod_init_arch +#if defined(sha1_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) static int __init sha1_mod_init(void) { +#ifdef sha1_mod_init_arch sha1_mod_init_arch(); +#endif + if (fips_enabled) { + /* + * FIPS cryptographic algorithm self-test. As per the FIPS + * Implementation Guidance, testing HMAC-SHA1 satisfies the test + * requirement for SHA-1 too. + */ + u8 mac[SHA1_DIGEST_SIZE]; + + hmac_sha1_usingrawkey(fips_test_key, sizeof(fips_test_key), + fips_test_data, sizeof(fips_test_data), + mac); + if (memcmp(fips_test_hmac_sha1_value, mac, sizeof(mac)) != 0) + panic("sha1: FIPS self-test failed\n"); + } return 0; } subsys_initcall(sha1_mod_init); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 881b935418ce..5d6b77e7e141 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -17,6 +17,7 @@ #include <linux/string.h> #include <linux/unaligned.h> #include <linux/wordpart.h> +#include "fips.h" static const struct sha256_block_state sha224_iv = { .h = { @@ -269,8 +270,8 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) EXPORT_SYMBOL(sha256); /* - * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) - * doesn't need either HMAC support or interleaved hashing support + * Pre-boot environments (as indicated by __DISABLE_EXPORTS being defined) just + * need the generic SHA-256 code. Omit all other features from them. */ #ifndef __DISABLE_EXPORTS @@ -477,12 +478,27 @@ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, hmac_sha256_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); -#endif /* !__DISABLE_EXPORTS */ -#ifdef sha256_mod_init_arch +#if defined(sha256_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) static int __init sha256_mod_init(void) { +#ifdef sha256_mod_init_arch sha256_mod_init_arch(); +#endif + if (fips_enabled) { + /* + * FIPS cryptographic algorithm self-test. As per the FIPS + * Implementation Guidance, testing HMAC-SHA256 satisfies the + * test requirement for SHA-224, SHA-256, and HMAC-SHA224 too. 
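+		 * A single keyed test exercises both the underlying
+		 * compression function and the HMAC construction itself.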
+ */ + u8 mac[SHA256_DIGEST_SIZE]; + + hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key), + fips_test_data, sizeof(fips_test_data), + mac); + if (memcmp(fips_test_hmac_sha256_value, mac, sizeof(mac)) != 0) + panic("sha256: FIPS self-test failed\n"); + } return 0; } subsys_initcall(sha256_mod_init); @@ -493,5 +509,7 @@ static void __exit sha256_mod_exit(void) module_exit(sha256_mod_exit); #endif +#endif /* !__DISABLE_EXPORTS */ + MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c new file mode 100644 index 000000000000..32b7074de792 --- /dev/null +++ b/lib/crypto/sha3.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-3, as specified in + * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf + * + * SHA-3 code by Jeff Garzik <jeff@garzik.org> + * Ard Biesheuvel <ard.biesheuvel@linaro.org> + * David Howells <dhowells@redhat.com> + * + * See also Documentation/crypto/sha3.rst + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <crypto/sha3.h> +#include <crypto/utils.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> +#include "fips.h" + +/* + * On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of + * stack if the round calculation gets inlined into the loop in + * sha3_keccakf_generic(). On the other hand, on 64-bit architectures with + * plenty of [64-bit wide] general purpose registers, not inlining it severely + * hurts performance. So let's use 64-bitness as a heuristic to decide whether + * to inline or not. + */ +#ifdef CONFIG_64BIT +#define SHA3_INLINE inline +#else +#define SHA3_INLINE noinline +#endif + +#define SHA3_KECCAK_ROUNDS 24 + +static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, + 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL, + 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL, + 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, + 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL, + 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL +}; + +/* + * Perform a single round of Keccak mixing. 
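+ *
+ * A round applies the five Keccak step mappings in order: Theta, Rho, Pi,
+ * Chi and Iota.  Rho and Pi are fused into a single pass below.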
+ */ +static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25], int round) +{ + u64 t[5], tt, bc[5]; + + /* Theta */ + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; + + t[0] = bc[4] ^ rol64(bc[1], 1); + t[1] = bc[0] ^ rol64(bc[2], 1); + t[2] = bc[1] ^ rol64(bc[3], 1); + t[3] = bc[2] ^ rol64(bc[4], 1); + t[4] = bc[3] ^ rol64(bc[0], 1); + + st[0] ^= t[0]; + + /* Rho Pi */ + tt = st[1]; + st[ 1] = rol64(st[ 6] ^ t[1], 44); + st[ 6] = rol64(st[ 9] ^ t[4], 20); + st[ 9] = rol64(st[22] ^ t[2], 61); + st[22] = rol64(st[14] ^ t[4], 39); + st[14] = rol64(st[20] ^ t[0], 18); + st[20] = rol64(st[ 2] ^ t[2], 62); + st[ 2] = rol64(st[12] ^ t[2], 43); + st[12] = rol64(st[13] ^ t[3], 25); + st[13] = rol64(st[19] ^ t[4], 8); + st[19] = rol64(st[23] ^ t[3], 56); + st[23] = rol64(st[15] ^ t[0], 41); + st[15] = rol64(st[ 4] ^ t[4], 27); + st[ 4] = rol64(st[24] ^ t[4], 14); + st[24] = rol64(st[21] ^ t[1], 2); + st[21] = rol64(st[ 8] ^ t[3], 55); + st[ 8] = rol64(st[16] ^ t[1], 45); + st[16] = rol64(st[ 5] ^ t[0], 36); + st[ 5] = rol64(st[ 3] ^ t[3], 28); + st[ 3] = rol64(st[18] ^ t[3], 21); + st[18] = rol64(st[17] ^ t[2], 15); + st[17] = rol64(st[11] ^ t[1], 10); + st[11] = rol64(st[ 7] ^ t[2], 6); + st[ 7] = rol64(st[10] ^ t[0], 3); + st[10] = rol64( tt ^ t[1], 1); + + /* Chi */ + bc[ 0] = ~st[ 1] & st[ 2]; + bc[ 1] = ~st[ 2] & st[ 3]; + bc[ 2] = ~st[ 3] & st[ 4]; + bc[ 3] = ~st[ 4] & st[ 0]; + bc[ 4] = ~st[ 0] & st[ 1]; + st[ 0] ^= bc[ 0]; + st[ 1] ^= bc[ 1]; + st[ 2] ^= bc[ 2]; + st[ 3] ^= bc[ 3]; + st[ 4] ^= bc[ 4]; + + bc[ 0] = ~st[ 6] & st[ 7]; + bc[ 1] = ~st[ 7] & st[ 8]; + bc[ 2] = ~st[ 8] & st[ 9]; + bc[ 3] = ~st[ 9] & st[ 5]; + bc[ 4] = ~st[ 5] & st[ 6]; + st[ 5] ^= bc[ 0]; + st[ 6] ^= bc[ 1]; + st[ 7] ^= bc[ 2]; + st[ 8] ^= bc[ 3]; + st[ 9] ^= bc[ 4]; + + bc[ 0] = ~st[11] & st[12]; + bc[ 1] = ~st[12] & st[13]; + bc[ 2] = ~st[13] & st[14]; + bc[ 3] = ~st[14] & st[10]; + bc[ 4] = ~st[10] & st[11]; + st[10] ^= bc[ 0]; + st[11] ^= bc[ 1]; + st[12] ^= bc[ 2]; + st[13] ^= bc[ 3]; + st[14] ^= bc[ 4]; + + bc[ 0] = ~st[16] & st[17]; + bc[ 1] = ~st[17] & st[18]; + bc[ 2] = ~st[18] & st[19]; + bc[ 3] = ~st[19] & st[15]; + bc[ 4] = ~st[15] & st[16]; + st[15] ^= bc[ 0]; + st[16] ^= bc[ 1]; + st[17] ^= bc[ 2]; + st[18] ^= bc[ 3]; + st[19] ^= bc[ 4]; + + bc[ 0] = ~st[21] & st[22]; + bc[ 1] = ~st[22] & st[23]; + bc[ 2] = ~st[23] & st[24]; + bc[ 3] = ~st[24] & st[20]; + bc[ 4] = ~st[20] & st[21]; + st[20] ^= bc[ 0]; + st[21] ^= bc[ 1]; + st[22] ^= bc[ 2]; + st[23] ^= bc[ 3]; + st[24] ^= bc[ 4]; + + /* Iota */ + st[0] ^= sha3_keccakf_rndc[round]; +} + +/* Generic implementation of the Keccak-f[1600] permutation */ +static void sha3_keccakf_generic(struct sha3_state *state) +{ + /* + * Temporarily convert the state words from little-endian to native- + * endian so that they can be operated on. Note that on little-endian + * machines this conversion is a no-op and is optimized out. 
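+	 * (struct sha3_state keeps the words in little-endian order so that
+	 * data can be absorbed with plain XORs; see
+	 * sha3_absorb_blocks_generic().)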
+ */ + + for (int i = 0; i < ARRAY_SIZE(state->words); i++) + state->native_words[i] = le64_to_cpu(state->words[i]); + + for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++) + sha3_keccakf_one_round_generic(state->native_words, round); + + for (int i = 0; i < ARRAY_SIZE(state->words); i++) + state->words[i] = cpu_to_le64(state->native_words[i]); +} + +/* + * Generic implementation of absorbing the given nonzero number of full blocks + * into the sponge function Keccak[r=8*block_size, c=1600-8*block_size]. + */ +static void __maybe_unused +sha3_absorb_blocks_generic(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size) +{ + do { + for (size_t i = 0; i < block_size; i += 8) + state->words[i / 8] ^= get_unaligned((__le64 *)&data[i]); + sha3_keccakf_generic(state); + data += block_size; + } while (--nblocks); +} + +#ifdef CONFIG_CRYPTO_LIB_SHA3_ARCH +#include "sha3.h" /* $(SRCARCH)/sha3.h */ +#else +#define sha3_keccakf sha3_keccakf_generic +#define sha3_absorb_blocks sha3_absorb_blocks_generic +#endif + +void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len) +{ + const size_t block_size = ctx->block_size; + size_t absorb_offset = ctx->absorb_offset; + + /* Warn if squeezing has already begun. */ + WARN_ON_ONCE(absorb_offset >= block_size); + + if (absorb_offset && absorb_offset + in_len >= block_size) { + crypto_xor(&ctx->state.bytes[absorb_offset], in, + block_size - absorb_offset); + in += block_size - absorb_offset; + in_len -= block_size - absorb_offset; + sha3_keccakf(&ctx->state); + absorb_offset = 0; + } + + if (in_len >= block_size) { + size_t nblocks = in_len / block_size; + + sha3_absorb_blocks(&ctx->state, in, nblocks, block_size); + in += nblocks * block_size; + in_len -= nblocks * block_size; + } + + if (in_len) { + crypto_xor(&ctx->state.bytes[absorb_offset], in, in_len); + absorb_offset += in_len; + } + ctx->absorb_offset = absorb_offset; +} +EXPORT_SYMBOL_GPL(__sha3_update); + +void sha3_final(struct sha3_ctx *sha3_ctx, u8 *out) +{ + struct __sha3_ctx *ctx = &sha3_ctx->ctx; + + ctx->state.bytes[ctx->absorb_offset] ^= 0x06; + ctx->state.bytes[ctx->block_size - 1] ^= 0x80; + sha3_keccakf(&ctx->state); + memcpy(out, ctx->state.bytes, ctx->digest_size); + sha3_zeroize_ctx(sha3_ctx); +} +EXPORT_SYMBOL_GPL(sha3_final); + +void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len) +{ + struct __sha3_ctx *ctx = &shake_ctx->ctx; + const size_t block_size = ctx->block_size; + size_t squeeze_offset = ctx->squeeze_offset; + + if (ctx->absorb_offset < block_size) { + /* First squeeze: */ + + /* Add the domain separation suffix and padding. */ + ctx->state.bytes[ctx->absorb_offset] ^= 0x1f; + ctx->state.bytes[block_size - 1] ^= 0x80; + + /* Indicate that squeezing has begun. */ + ctx->absorb_offset = block_size; + + /* + * Indicate that no output is pending yet, i.e. sha3_keccakf() + * will need to be called before the first copy. 
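+		 * Later shake_squeeze() calls resume from squeeze_offset, so
+		 * the output stream can be extracted incrementally in chunks
+		 * of any size.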
+ */ + squeeze_offset = block_size; + } + while (out_len) { + if (squeeze_offset == block_size) { + sha3_keccakf(&ctx->state); + squeeze_offset = 0; + } + size_t copy = min(out_len, block_size - squeeze_offset); + + memcpy(out, &ctx->state.bytes[squeeze_offset], copy); + out += copy; + out_len -= copy; + squeeze_offset += copy; + } + ctx->squeeze_offset = squeeze_offset; +} +EXPORT_SYMBOL_GPL(shake_squeeze); + +#ifndef sha3_224_arch +static inline bool sha3_224_arch(const u8 *in, size_t in_len, + u8 out[SHA3_224_DIGEST_SIZE]) +{ + return false; +} +#endif +#ifndef sha3_256_arch +static inline bool sha3_256_arch(const u8 *in, size_t in_len, + u8 out[SHA3_256_DIGEST_SIZE]) +{ + return false; +} +#endif +#ifndef sha3_384_arch +static inline bool sha3_384_arch(const u8 *in, size_t in_len, + u8 out[SHA3_384_DIGEST_SIZE]) +{ + return false; +} +#endif +#ifndef sha3_512_arch +static inline bool sha3_512_arch(const u8 *in, size_t in_len, + u8 out[SHA3_512_DIGEST_SIZE]) +{ + return false; +} +#endif + +void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]) +{ + struct sha3_ctx ctx; + + if (sha3_224_arch(in, in_len, out)) + return; + sha3_224_init(&ctx); + sha3_update(&ctx, in, in_len); + sha3_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha3_224); + +void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]) +{ + struct sha3_ctx ctx; + + if (sha3_256_arch(in, in_len, out)) + return; + sha3_256_init(&ctx); + sha3_update(&ctx, in, in_len); + sha3_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha3_256); + +void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]) +{ + struct sha3_ctx ctx; + + if (sha3_384_arch(in, in_len, out)) + return; + sha3_384_init(&ctx); + sha3_update(&ctx, in, in_len); + sha3_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha3_384); + +void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]) +{ + struct sha3_ctx ctx; + + if (sha3_512_arch(in, in_len, out)) + return; + sha3_512_init(&ctx); + sha3_update(&ctx, in, in_len); + sha3_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha3_512); + +void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len) +{ + struct shake_ctx ctx; + + shake128_init(&ctx); + shake_update(&ctx, in, in_len); + shake_squeeze(&ctx, out, out_len); + shake_zeroize_ctx(&ctx); +} +EXPORT_SYMBOL_GPL(shake128); + +void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len) +{ + struct shake_ctx ctx; + + shake256_init(&ctx); + shake_update(&ctx, in, in_len); + shake_squeeze(&ctx, out, out_len); + shake_zeroize_ctx(&ctx); +} +EXPORT_SYMBOL_GPL(shake256); + +#if defined(sha3_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) +static int __init sha3_mod_init(void) +{ +#ifdef sha3_mod_init_arch + sha3_mod_init_arch(); +#endif + if (fips_enabled) { + /* + * FIPS cryptographic algorithm self-test. As per the FIPS + * Implementation Guidance, testing any SHA-3 algorithm + * satisfies the test requirement for all of them. 
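+		 * All the SHA-3 and SHAKE variants share the same underlying
+		 * Keccak-f[1600] permutation.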
+ */ + u8 hash[SHA3_256_DIGEST_SIZE]; + + sha3_256(fips_test_data, sizeof(fips_test_data), hash); + if (memcmp(fips_test_sha3_256_value, hash, sizeof(hash)) != 0) + panic("sha3: FIPS self-test failed\n"); + } + return 0; +} +subsys_initcall(sha3_mod_init); + +static void __exit sha3_mod_exit(void) +{ +} +module_exit(sha3_mod_exit); +#endif + +MODULE_DESCRIPTION("SHA-3 library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c index d8062188be98..605eab51aabd 100644 --- a/lib/crypto/sha512.c +++ b/lib/crypto/sha512.c @@ -17,6 +17,7 @@ #include <linux/string.h> #include <linux/unaligned.h> #include <linux/wordpart.h> +#include "fips.h" static const struct sha512_block_state sha384_iv = { .h = { @@ -405,10 +406,26 @@ void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, } EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey); -#ifdef sha512_mod_init_arch +#if defined(sha512_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) static int __init sha512_mod_init(void) { +#ifdef sha512_mod_init_arch sha512_mod_init_arch(); +#endif + if (fips_enabled) { + /* + * FIPS cryptographic algorithm self-test. As per the FIPS + * Implementation Guidance, testing HMAC-SHA512 satisfies the + * test requirement for SHA-384, SHA-512, and HMAC-SHA384 too. + */ + u8 mac[SHA512_DIGEST_SIZE]; + + hmac_sha512_usingrawkey(fips_test_key, sizeof(fips_test_key), + fips_test_data, sizeof(fips_test_data), + mac); + if (memcmp(fips_test_hmac_sha512_value, mac, sizeof(mac)) != 0) + panic("sha512: FIPS self-test failed\n"); + } return 0; } subsys_initcall(sha512_mod_init); diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig index 578af717e13a..61d435c450bb 100644 --- a/lib/crypto/tests/Kconfig +++ b/lib/crypto/tests/Kconfig @@ -1,5 +1,14 @@ # SPDX-License-Identifier: GPL-2.0-or-later +config CRYPTO_LIB_BLAKE2B_KUNIT_TEST + tristate "KUnit tests for BLAKE2b" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_BLAKE2B + help + KUnit tests for the BLAKE2b cryptographic hash function. + config CRYPTO_LIB_BLAKE2S_KUNIT_TEST tristate "KUnit tests for BLAKE2s" if !KUNIT_ALL_TESTS depends on KUNIT @@ -38,6 +47,15 @@ config CRYPTO_LIB_POLY1305_KUNIT_TEST help KUnit tests for the Poly1305 library functions. +config CRYPTO_LIB_POLYVAL_KUNIT_TEST + tristate "KUnit tests for POLYVAL" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_POLYVAL + help + KUnit tests for the POLYVAL library functions. + config CRYPTO_LIB_SHA1_KUNIT_TEST tristate "KUnit tests for SHA-1" if !KUNIT_ALL_TESTS depends on KUNIT @@ -72,6 +90,17 @@ config CRYPTO_LIB_SHA512_KUNIT_TEST KUnit tests for the SHA-384 and SHA-512 cryptographic hash functions and their corresponding HMACs. +config CRYPTO_LIB_SHA3_KUNIT_TEST + tristate "KUnit tests for SHA-3" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_SHA3 + help + KUnit tests for the SHA3 cryptographic hash and XOF functions, + including SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128 and + SHAKE256. 
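+	  These tests cover the arch-optimized SHA-3 code too, when it is
+	  enabled.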
+ config CRYPTO_LIB_BENCHMARK_VISIBLE bool diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile index a71fad19922b..5109a0651925 100644 --- a/lib/crypto/tests/Makefile +++ b/lib/crypto/tests/Makefile @@ -1,9 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-or-later +obj-$(CONFIG_CRYPTO_LIB_BLAKE2B_KUNIT_TEST) += blake2b_kunit.o obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o +obj-$(CONFIG_CRYPTO_LIB_POLYVAL_KUNIT_TEST) += polyval_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST) += sha384_kunit.o sha512_kunit.o +obj-$(CONFIG_CRYPTO_LIB_SHA3_KUNIT_TEST) += sha3_kunit.o diff --git a/lib/crypto/tests/blake2b-testvecs.h b/lib/crypto/tests/blake2b-testvecs.h new file mode 100644 index 000000000000..9e407dbc219c --- /dev/null +++ b/lib/crypto/tests/blake2b-testvecs.h @@ -0,0 +1,342 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2b */ + +static const struct { + size_t data_len; + u8 digest[BLAKE2B_HASH_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0x78, 0x6a, 0x02, 0xf7, 0x42, 0x01, 0x59, 0x03, + 0xc6, 0xc6, 0xfd, 0x85, 0x25, 0x52, 0xd2, 0x72, + 0x91, 0x2f, 0x47, 0x40, 0xe1, 0x58, 0x47, 0x61, + 0x8a, 0x86, 0xe2, 0x17, 0xf7, 0x1f, 0x54, 0x19, + 0xd2, 0x5e, 0x10, 0x31, 0xaf, 0xee, 0x58, 0x53, + 0x13, 0x89, 0x64, 0x44, 0x93, 0x4e, 0xb0, 0x4b, + 0x90, 0x3a, 0x68, 0x5b, 0x14, 0x48, 0xb7, 0x55, + 0xd5, 0x6f, 0x70, 0x1a, 0xfe, 0x9b, 0xe2, 0xce, + }, + }, + { + .data_len = 1, + .digest = { + 0x6f, 0x2e, 0xcc, 0x83, 0x53, 0xa3, 0x20, 0x16, + 0x5b, 0xda, 0xd0, 0x04, 0xd3, 0xcb, 0xe4, 0x37, + 0x5b, 0xf0, 0x84, 0x36, 0xe1, 0xad, 0x45, 0xcc, + 0x4d, 0x7f, 0x09, 0x68, 0xb2, 0x62, 0x93, 0x7f, + 0x72, 0x32, 0xe8, 0xa7, 0x2f, 0x1f, 0x6f, 0xc6, + 0x14, 0xd6, 0x70, 0xae, 0x0c, 0xf0, 0xf3, 0xce, + 0x64, 0x4d, 0x22, 0xdf, 0xc7, 0xa7, 0xf8, 0xa8, + 0x18, 0x23, 0xd8, 0x6c, 0xaf, 0x65, 0xa2, 0x54, + }, + }, + { + .data_len = 2, + .digest = { + 0x04, 0x13, 0xe2, 0x10, 0xbe, 0x65, 0xde, 0xce, + 0x61, 0xa8, 0xe0, 0xd6, 0x35, 0xb1, 0xb8, 0x88, + 0xd2, 0xea, 0x45, 0x3a, 0xe1, 0x8d, 0x94, 0xb5, + 0x66, 0x06, 0x98, 0x96, 0x39, 0xf8, 0x0e, 0xcb, + 0x34, 0xa6, 0xa8, 0x17, 0xfe, 0x56, 0xbc, 0xa9, + 0x5e, 0x1b, 0xb1, 0xde, 0x3c, 0xc7, 0x78, 0x4f, + 0x39, 0xc6, 0xfc, 0xa8, 0xb3, 0x27, 0x66, 0x3e, + 0x4e, 0xb5, 0x5d, 0x08, 0x89, 0xee, 0xd1, 0xe0, + }, + }, + { + .data_len = 3, + .digest = { + 0x2b, 0x4a, 0xa3, 0x4e, 0x2b, 0x7a, 0x47, 0x20, + 0x30, 0x5b, 0x09, 0x17, 0x3a, 0xf4, 0xcc, 0xf0, + 0xf7, 0x7b, 0x97, 0x68, 0x98, 0x9f, 0x4f, 0x09, + 0x46, 0x25, 0xe7, 0xd6, 0x53, 0x6b, 0xf9, 0x68, + 0x48, 0x12, 0x44, 0x8c, 0x9a, 0xc8, 0xd4, 0x42, + 0xeb, 0x2c, 0x5f, 0x41, 0xba, 0x17, 0xd0, 0xc3, + 0xad, 0xfd, 0xfb, 0x42, 0x33, 0xcb, 0x08, 0x5d, + 0xd2, 0x5c, 0x3d, 0xde, 0x87, 0x4d, 0xd6, 0xe4, + }, + }, + { + .data_len = 16, + .digest = { + 0xbf, 0x40, 0xf2, 0x38, 0x44, 0x8e, 0x24, 0x5e, + 0xbc, 0x67, 0xbb, 0xf0, 0x10, 0x9a, 0x79, 0xbb, + 0x36, 0x55, 0xce, 0xd2, 0xba, 0x04, 0x0d, 0xe8, + 0x30, 0x29, 0x5c, 0x2a, 0xa6, 0x3a, 0x4f, 0x37, + 0xac, 0x5f, 0xd4, 0x13, 0xa2, 0xf4, 0xfe, 0x80, + 0x61, 0xd7, 0x58, 0x66, 0x0c, 0x7f, 0xa2, 0x56, + 0x6b, 0x52, 0x7c, 0x22, 0x73, 0x7f, 0x17, 0xaa, + 0x91, 0x5a, 0x22, 
0x06, 0xd9, 0x00, 0x48, 0x12, + }, + }, + { + .data_len = 32, + .digest = { + 0x41, 0x04, 0x65, 0x93, 0x81, 0x9a, 0x20, 0x0a, + 0x00, 0x60, 0x00, 0x64, 0x4c, 0x04, 0x3d, 0xe0, + 0x6b, 0x17, 0x0c, 0xe1, 0x0e, 0x28, 0x8b, 0xa0, + 0x76, 0xd2, 0x79, 0xb0, 0x33, 0x60, 0x61, 0x27, + 0xf2, 0x64, 0xf1, 0x8a, 0xe5, 0x3e, 0xaa, 0x37, + 0x60, 0xad, 0x2d, 0x75, 0x13, 0xae, 0xd8, 0x9e, + 0xec, 0xe0, 0xe4, 0x40, 0x2f, 0x59, 0x44, 0xb0, + 0x66, 0x7a, 0x68, 0x38, 0xce, 0x21, 0x99, 0x2a, + }, + }, + { + .data_len = 48, + .digest = { + 0x19, 0x6f, 0x9d, 0xc7, 0x87, 0x12, 0x5c, 0xa3, + 0xe2, 0xd3, 0xf1, 0x82, 0xec, 0xf3, 0x55, 0x9c, + 0x86, 0xd1, 0x6d, 0xde, 0xcf, 0x5b, 0xec, 0x4c, + 0x43, 0x25, 0x85, 0x90, 0xef, 0xe8, 0xe3, 0x5f, + 0x2c, 0x3a, 0x84, 0x07, 0xb8, 0x55, 0xfd, 0x5e, + 0xa4, 0x45, 0xf2, 0xac, 0xe4, 0xbd, 0xc7, 0x96, + 0x80, 0x59, 0x3e, 0xc9, 0xb1, 0x60, 0xb1, 0x2b, + 0x17, 0x49, 0x7d, 0x3e, 0x7d, 0x4d, 0x70, 0x24, + }, + }, + { + .data_len = 49, + .digest = { + 0x73, 0x72, 0xd5, 0x0a, 0x97, 0xb4, 0x7d, 0xdb, + 0x05, 0x14, 0x8e, 0x40, 0xc2, 0x9a, 0x8a, 0x74, + 0x4b, 0xda, 0x7e, 0xfc, 0x97, 0x57, 0x23, 0x39, + 0xdc, 0x57, 0x09, 0x13, 0x24, 0xfc, 0xf3, 0x23, + 0x55, 0x48, 0xdd, 0xe5, 0x07, 0x9a, 0x6f, 0x7b, + 0x62, 0xea, 0x4d, 0x79, 0xb4, 0xb9, 0xc5, 0x86, + 0xc0, 0x34, 0xd6, 0xd2, 0x6c, 0xc3, 0x94, 0xfb, + 0x34, 0xd6, 0x62, 0xae, 0xb8, 0x99, 0xf1, 0x38, + }, + }, + { + .data_len = 63, + .digest = { + 0x42, 0x3a, 0xe3, 0xa2, 0xae, 0x5a, 0x28, 0xce, + 0xf1, 0x3c, 0x97, 0xc2, 0x34, 0xf6, 0xb5, 0x1e, + 0xfc, 0x31, 0xb4, 0x04, 0x61, 0xb7, 0x54, 0x0b, + 0x0d, 0x1a, 0x22, 0x9c, 0x04, 0x67, 0x5c, 0x4c, + 0x75, 0x1b, 0x10, 0x0b, 0x99, 0xe2, 0xb1, 0x5e, + 0x5d, 0x4b, 0x7a, 0xe6, 0xf6, 0xb5, 0x62, 0xee, + 0x2d, 0x44, 0x57, 0xb2, 0x96, 0x73, 0x5e, 0xb9, + 0x6a, 0xb2, 0xb3, 0x16, 0xa3, 0xd9, 0x6a, 0x60, + }, + }, + { + .data_len = 64, + .digest = { + 0x50, 0xb9, 0xbe, 0xb2, 0x69, 0x07, 0x45, 0x5b, + 0x59, 0xde, 0x8d, 0xbf, 0x08, 0xdc, 0x2e, 0x7f, + 0x93, 0x29, 0xc1, 0x91, 0xe8, 0x74, 0x03, 0x89, + 0x20, 0xfb, 0xb2, 0x4b, 0xe8, 0x68, 0x6f, 0xe1, + 0xb4, 0x30, 0xbe, 0x11, 0x3c, 0x43, 0x19, 0x66, + 0x72, 0x78, 0xb7, 0xf4, 0xe9, 0x09, 0x18, 0x4e, + 0xae, 0x4a, 0x24, 0xe0, 0x6f, 0x44, 0x02, 0xe3, + 0xfd, 0xda, 0xb3, 0x3e, 0x3c, 0x6d, 0x54, 0x2e, + }, + }, + { + .data_len = 65, + .digest = { + 0xd6, 0xf2, 0xa9, 0x61, 0x3f, 0xce, 0x2a, 0x68, + 0x19, 0x86, 0xff, 0xd1, 0xee, 0x89, 0x3b, 0xa4, + 0x10, 0x9a, 0x91, 0x50, 0x35, 0x48, 0x9e, 0xf5, + 0x9c, 0x95, 0xe0, 0xfb, 0x92, 0x0f, 0xa8, 0xf7, + 0x6c, 0x43, 0x85, 0xf1, 0x6e, 0x11, 0x4e, 0x67, + 0x78, 0xd7, 0x53, 0x25, 0x0c, 0xf8, 0xce, 0x38, + 0x74, 0x08, 0xb0, 0x3c, 0x53, 0x20, 0x4d, 0xc4, + 0x9a, 0xf5, 0x78, 0xe8, 0x41, 0x8f, 0xed, 0x1f, + }, + }, + { + .data_len = 127, + .digest = { + 0xe8, 0xb2, 0xc5, 0xa7, 0xf5, 0xfa, 0xee, 0xa0, + 0x57, 0xba, 0x58, 0xf9, 0x0a, 0xf2, 0x64, 0x16, + 0xa8, 0xa6, 0x03, 0x85, 0x3b, 0xb8, 0x6f, 0xca, + 0x76, 0xc3, 0xa1, 0x2b, 0xec, 0xef, 0xc4, 0x66, + 0x11, 0xdf, 0x03, 0x85, 0x9d, 0x0c, 0x37, 0x7b, + 0xa9, 0x7b, 0x44, 0xfb, 0x11, 0x8f, 0x3f, 0x71, + 0xcd, 0x81, 0x43, 0x2e, 0x71, 0x5c, 0x54, 0x9f, + 0xca, 0x0f, 0x01, 0x91, 0xca, 0xaa, 0x93, 0xe9, + }, + }, + { + .data_len = 128, + .digest = { + 0x05, 0x8e, 0x9d, 0xdc, 0xe9, 0x36, 0x3e, 0x73, + 0x63, 0x59, 0x69, 0x81, 0x0b, 0x8c, 0xc7, 0x9e, + 0xcc, 0xe7, 0x9c, 0x19, 0x54, 0xa7, 0x2f, 0x86, + 0xb5, 0xea, 0xae, 0x6d, 0xfe, 0x4e, 0x6e, 0x83, + 0x8d, 0x1a, 0x1c, 0x70, 0x3f, 0x34, 0xa1, 0x04, + 0x59, 0xd1, 0xbb, 0xaa, 0x58, 0xf7, 0xce, 0xfb, + 0x86, 0x66, 0x22, 0xfc, 0x78, 0x74, 0x6e, 0x85, + 
0xf1, 0x59, 0x7d, 0x9e, 0x1c, 0x3b, 0xc6, 0x65, + }, + }, + { + .data_len = 129, + .digest = { + 0x6b, 0x1f, 0x7c, 0x9a, 0x65, 0x7f, 0x09, 0x61, + 0xe5, 0x04, 0x9a, 0xf1, 0x4b, 0x36, 0x8e, 0x41, + 0x86, 0xcf, 0x86, 0x19, 0xd8, 0xc9, 0x34, 0x70, + 0x67, 0xd1, 0x03, 0x72, 0x12, 0xf7, 0x27, 0x92, + 0x2e, 0x3d, 0x2b, 0x54, 0x9a, 0x48, 0xa4, 0xc2, + 0x61, 0xea, 0x6a, 0xe8, 0xdd, 0x07, 0x41, 0x85, + 0x58, 0x6d, 0xcd, 0x12, 0x0d, 0xbc, 0xb1, 0x23, + 0xb2, 0xdb, 0x24, 0x1f, 0xc4, 0xa7, 0xae, 0xda, + }, + }, + { + .data_len = 256, + .digest = { + 0x50, 0xd8, 0xdc, 0xb2, 0x50, 0x24, 0x7a, 0x49, + 0xb1, 0x00, 0x73, 0x16, 0x1f, 0xce, 0xf9, 0xe8, + 0x77, 0x0a, 0x27, 0x74, 0xc7, 0xeb, 0xf0, 0x62, + 0xb9, 0xf3, 0x24, 0xa6, 0x03, 0x18, 0x40, 0xde, + 0x9b, 0x1d, 0xa8, 0xd0, 0xbf, 0x66, 0xa3, 0xc1, + 0x31, 0x04, 0x95, 0xc7, 0xc3, 0xb7, 0x11, 0xe2, + 0x1e, 0x31, 0x49, 0x98, 0x06, 0xab, 0xf0, 0xe6, + 0x5c, 0xac, 0x88, 0x28, 0x0b, 0x3d, 0xb2, 0xc2, + }, + }, + { + .data_len = 511, + .digest = { + 0xd4, 0x2b, 0x6b, 0x9e, 0xfc, 0x44, 0xc0, 0x90, + 0x64, 0x77, 0x5d, 0xf3, 0x44, 0xb6, 0x92, 0x8f, + 0x80, 0xe2, 0xe4, 0x9b, 0xaf, 0x49, 0x04, 0xea, + 0x29, 0xf7, 0x4a, 0x33, 0x3f, 0xc7, 0x3b, 0xab, + 0xa1, 0x71, 0x7f, 0xa2, 0x8e, 0x03, 0xa0, 0xd6, + 0xa7, 0xcd, 0xe0, 0xf8, 0xd7, 0x3b, 0xa4, 0x0d, + 0x84, 0x79, 0x12, 0x72, 0x3f, 0x8e, 0x48, 0x35, + 0x76, 0x4f, 0x56, 0xe9, 0x21, 0x40, 0x19, 0xbe, + }, + }, + { + .data_len = 513, + .digest = { + 0x84, 0xd4, 0xd8, 0x6c, 0x60, 0x3d, 0x6e, 0xfd, + 0x84, 0xb7, 0xdf, 0xba, 0x13, 0x5e, 0x07, 0x94, + 0x5b, 0x6b, 0x62, 0x1d, 0x82, 0x02, 0xa7, 0xb3, + 0x21, 0xdf, 0x42, 0x20, 0x85, 0xa8, 0x6f, 0x30, + 0xf7, 0x03, 0xba, 0x66, 0x0e, 0xa6, 0x42, 0x21, + 0x37, 0xe8, 0xed, 0x5b, 0x22, 0xf5, 0x4e, 0xa5, + 0xe5, 0x80, 0x1b, 0x47, 0xf0, 0x49, 0xb3, 0xe5, + 0x6e, 0xd9, 0xd9, 0x95, 0x3d, 0x2e, 0x42, 0x13, + }, + }, + { + .data_len = 1000, + .digest = { + 0x71, 0x17, 0xab, 0x93, 0xfe, 0x3b, 0xa4, 0xe6, + 0xcb, 0xb0, 0xea, 0x95, 0xe7, 0x1a, 0x01, 0xc0, + 0x12, 0x33, 0xfe, 0xcc, 0x79, 0x15, 0xae, 0x56, + 0xd2, 0x70, 0x44, 0x60, 0x54, 0x42, 0xa8, 0x69, + 0x7e, 0xc3, 0x90, 0xa0, 0x0c, 0x63, 0x39, 0xff, + 0x55, 0x53, 0xb8, 0x46, 0xef, 0x06, 0xcb, 0xba, + 0x73, 0xf4, 0x76, 0x22, 0xf1, 0x60, 0x98, 0xbc, + 0xbf, 0x76, 0x95, 0x85, 0x13, 0x1d, 0x11, 0x3b, + }, + }, + { + .data_len = 3333, + .digest = { + 0x3a, 0xaa, 0x85, 0xa0, 0x8c, 0x8e, 0xe1, 0x9c, + 0x9b, 0x43, 0x72, 0x7f, 0x40, 0x88, 0x3b, 0xd1, + 0xc4, 0xd8, 0x2b, 0x69, 0xa6, 0x74, 0x47, 0x69, + 0x5f, 0x7d, 0xab, 0x75, 0xa9, 0xf9, 0x88, 0x54, + 0xce, 0x57, 0xcc, 0x9d, 0xac, 0x13, 0x91, 0xdb, + 0x6d, 0x5c, 0xd8, 0xf4, 0x35, 0xc9, 0x30, 0xf0, + 0x4b, 0x91, 0x25, 0xab, 0x92, 0xa8, 0xc8, 0x6f, + 0xa0, 0xeb, 0x71, 0x56, 0x95, 0xab, 0xfd, 0xd7, + }, + }, + { + .data_len = 4096, + .digest = { + 0xe1, 0xe9, 0xbe, 0x6c, 0x96, 0xe2, 0xe8, 0xa6, + 0x53, 0xcd, 0x79, 0x77, 0x57, 0x51, 0x2f, 0xb2, + 0x9f, 0xfc, 0x09, 0xaa, 0x2c, 0xbc, 0x6c, 0x5f, + 0xb0, 0xf2, 0x12, 0x39, 0x54, 0xd7, 0x27, 0xf8, + 0x33, 0x5d, 0xd4, 0x8a, 0xca, 0xd8, 0x2e, 0xbb, + 0x02, 0x82, 0xca, 0x1b, 0x54, 0xfa, 0xd6, 0xf4, + 0x49, 0x63, 0xfc, 0xc8, 0x73, 0xd4, 0x26, 0x8d, + 0x4f, 0x1c, 0x56, 0xa7, 0xf4, 0x58, 0x6f, 0x51, + }, + }, + { + .data_len = 4128, + .digest = { + 0xf2, 0xf6, 0xe1, 0x16, 0x98, 0x69, 0x74, 0x5f, + 0x6c, 0xc4, 0x9d, 0x34, 0xa2, 0x84, 0x5d, 0x47, + 0xac, 0x39, 0xe0, 0x14, 0x2d, 0x78, 0xfa, 0x27, + 0xd5, 0x18, 0xaf, 0x26, 0x89, 0xa4, 0x69, 0xd3, + 0x56, 0xde, 0xfe, 0x4b, 0x9f, 0x0c, 0x9d, 0x5a, + 0x9a, 0x73, 0x3e, 0x3c, 0x76, 0x4b, 0x96, 0xca, + 0x49, 0xda, 0x05, 0x8c, 
0x53, 0xbb, 0x85, 0x89, + 0x60, 0xc7, 0xe0, 0xb3, 0x51, 0x18, 0xd2, 0xd2, + }, + }, + { + .data_len = 4160, + .digest = { + 0xfc, 0x5c, 0xcf, 0xbf, 0x29, 0xe3, 0x01, 0xef, + 0x4b, 0x40, 0x70, 0x01, 0xca, 0x4d, 0x46, 0xce, + 0xa9, 0x95, 0x5d, 0xb4, 0xf1, 0x79, 0x29, 0xdb, + 0xac, 0x32, 0x3d, 0xd9, 0x60, 0x9e, 0x6b, 0xb8, + 0x28, 0x62, 0xb7, 0x4a, 0xbb, 0x33, 0xb9, 0xd0, + 0x83, 0xe0, 0xd7, 0x5a, 0x2d, 0x01, 0x4c, 0x61, + 0x9e, 0x7d, 0x2d, 0x2d, 0x60, 0x29, 0x5e, 0x60, + 0x10, 0xb7, 0x41, 0x00, 0x3f, 0xe5, 0xf7, 0x52, + }, + }, + { + .data_len = 4224, + .digest = { + 0xf8, 0xe5, 0x4b, 0xe5, 0x89, 0xf9, 0x1b, 0x43, + 0xbb, 0x65, 0x3d, 0xa0, 0xb4, 0xdc, 0x04, 0x26, + 0x68, 0x15, 0xae, 0x4d, 0xd6, 0x03, 0xb7, 0x27, + 0x06, 0x8c, 0x2a, 0x82, 0x51, 0x96, 0xbf, 0x83, + 0x38, 0x96, 0x21, 0x8a, 0xd9, 0xf9, 0x4e, 0x38, + 0xc6, 0xb3, 0xbd, 0xfe, 0xd3, 0x49, 0x90, 0xbc, + 0xa1, 0x77, 0xd0, 0xa0, 0x3c, 0x2b, 0x4e, 0x10, + 0x34, 0xc3, 0x17, 0x85, 0x3d, 0xec, 0xa8, 0x05, + }, + }, + { + .data_len = 16384, + .digest = { + 0x38, 0x56, 0xaf, 0x83, 0x68, 0x9c, 0xba, 0xe3, + 0xec, 0x51, 0xf5, 0xf4, 0x93, 0x48, 0x1d, 0xe6, + 0xad, 0xa8, 0x8c, 0x70, 0x2a, 0xd9, 0xaa, 0x43, + 0x04, 0x40, 0x95, 0xc1, 0xe6, 0x8a, 0xf5, 0x01, + 0x6b, 0x79, 0xd9, 0xb4, 0xd0, 0x1d, 0x93, 0x26, + 0xfe, 0xf5, 0x07, 0x57, 0xda, 0x08, 0x0a, 0x82, + 0xc9, 0x17, 0x13, 0x5b, 0x9e, 0x11, 0x96, 0xa5, + 0xd0, 0x92, 0xcd, 0xf1, 0xa3, 0x5b, 0x43, 0x21, + }, + }, +}; + +static const u8 hash_testvec_consolidated[BLAKE2B_HASH_SIZE] = { + 0xa4, 0xf8, 0xf6, 0xa1, 0x36, 0x89, 0xc0, 0x2a, + 0xc3, 0x42, 0x32, 0x71, 0xe5, 0xea, 0x14, 0x77, + 0xf3, 0x99, 0x91, 0x87, 0x49, 0xc2, 0x8d, 0xa5, + 0x2f, 0xed, 0x01, 0x35, 0x39, 0x64, 0x09, 0x25, + 0xe3, 0xa8, 0x50, 0x97, 0x35, 0x8b, 0xf5, 0x19, + 0x1e, 0xd5, 0x9f, 0x03, 0x0b, 0x65, 0x55, 0x0e, + 0xa0, 0xb7, 0xda, 0x18, 0x7b, 0x7f, 0x88, 0x55, + 0x1f, 0xdb, 0x82, 0x6b, 0x98, 0x90, 0x1c, 0xdd, +}; + +static const u8 blake2b_keyed_testvec_consolidated[BLAKE2B_HASH_SIZE] = { + 0x2b, 0x89, 0x36, 0x3a, 0x36, 0xe4, 0x18, 0x38, + 0xc4, 0x5b, 0x5c, 0xa5, 0x9a, 0xed, 0xf2, 0xee, + 0x5a, 0xb6, 0x82, 0x6c, 0x63, 0xf2, 0x29, 0x57, + 0xc7, 0xd5, 0x32, 0x27, 0xba, 0x88, 0xb1, 0xab, + 0xf2, 0x2a, 0xc1, 0xea, 0xf3, 0x91, 0x89, 0x66, + 0x47, 0x1e, 0x5b, 0xc6, 0x98, 0x12, 0xe9, 0x25, + 0xbf, 0x72, 0xd2, 0x3f, 0x88, 0x97, 0x17, 0x51, + 0xed, 0x96, 0xfb, 0xe9, 0xca, 0x52, 0x42, 0xc9, +}; diff --git a/lib/crypto/tests/blake2b_kunit.c b/lib/crypto/tests/blake2b_kunit.c new file mode 100644 index 000000000000..bc0be7da1e76 --- /dev/null +++ b/lib/crypto/tests/blake2b_kunit.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/blake2b.h> +#include "blake2b-testvecs.h" + +/* + * The following are compatibility functions that present BLAKE2b as an unkeyed + * hash function that produces hashes of fixed length BLAKE2B_HASH_SIZE, so that + * hash-test-template.h can be reused to test it. + */ + +static void blake2b_default(const u8 *data, size_t len, + u8 out[BLAKE2B_HASH_SIZE]) +{ + blake2b(NULL, 0, data, len, out, BLAKE2B_HASH_SIZE); +} + +static void blake2b_init_default(struct blake2b_ctx *ctx) +{ + blake2b_init(ctx, BLAKE2B_HASH_SIZE); +} + +/* + * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2b + * with a key length of 0 and a hash length of BLAKE2B_HASH_SIZE. 
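+ * Keyed hashing and the other output lengths are covered separately by the
+ * BLAKE2b-specific test cases below.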
+ */ +#define HASH blake2b_default +#define HASH_CTX blake2b_ctx +#define HASH_SIZE BLAKE2B_HASH_SIZE +#define HASH_INIT blake2b_init_default +#define HASH_UPDATE blake2b_update +#define HASH_FINAL blake2b_final +#include "hash-test-template.h" + +/* + * BLAKE2b specific test case which tests all possible combinations of key + * length and hash length. + */ +static void test_blake2b_all_key_and_hash_lens(struct kunit *test) +{ + const size_t data_len = 100; + u8 *data = &test_buf[0]; + u8 *key = data + data_len; + u8 *hash = key + BLAKE2B_KEY_SIZE; + struct blake2b_ctx main_ctx; + u8 main_hash[BLAKE2B_HASH_SIZE]; + + rand_bytes_seeded_from_len(data, data_len); + blake2b_init(&main_ctx, BLAKE2B_HASH_SIZE); + for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) { + rand_bytes_seeded_from_len(key, key_len); + for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) { + blake2b(key, key_len, data, data_len, hash, out_len); + blake2b_update(&main_ctx, hash, out_len); + } + } + blake2b_final(&main_ctx, main_hash); + KUNIT_ASSERT_MEMEQ(test, main_hash, blake2b_keyed_testvec_consolidated, + BLAKE2B_HASH_SIZE); +} + +/* + * BLAKE2b specific test case which tests using a guarded buffer for all allowed + * key lengths. Also tests both blake2b() and blake2b_init_key(). + */ +static void test_blake2b_with_guarded_key_buf(struct kunit *test) +{ + const size_t data_len = 100; + + rand_bytes(test_buf, data_len); + for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) { + u8 key[BLAKE2B_KEY_SIZE]; + u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len]; + u8 hash1[BLAKE2B_HASH_SIZE]; + u8 hash2[BLAKE2B_HASH_SIZE]; + struct blake2b_ctx ctx; + + rand_bytes(key, key_len); + memcpy(guarded_key, key, key_len); + + blake2b(key, key_len, test_buf, data_len, + hash1, BLAKE2B_HASH_SIZE); + blake2b(guarded_key, key_len, test_buf, data_len, + hash2, BLAKE2B_HASH_SIZE); + KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE); + + blake2b_init_key(&ctx, BLAKE2B_HASH_SIZE, guarded_key, key_len); + blake2b_update(&ctx, test_buf, data_len); + blake2b_final(&ctx, hash2); + KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE); + } +} + +/* + * BLAKE2b specific test case which tests using a guarded output buffer for all + * allowed output lengths. 
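+ * The output buffer is placed at the very end of test_buf, so a write of
+ * even one byte past the requested length would land out of bounds.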
+ */ +static void test_blake2b_with_guarded_out_buf(struct kunit *test) +{ + const size_t data_len = 100; + + rand_bytes(test_buf, data_len); + for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) { + u8 hash[BLAKE2B_HASH_SIZE]; + u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len]; + + blake2b(NULL, 0, test_buf, data_len, hash, out_len); + blake2b(NULL, 0, test_buf, data_len, guarded_hash, out_len); + KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len); + } +} + +static struct kunit_case blake2b_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(test_blake2b_all_key_and_hash_lens), + KUNIT_CASE(test_blake2b_with_guarded_key_buf), + KUNIT_CASE(test_blake2b_with_guarded_out_buf), + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite blake2b_test_suite = { + .name = "blake2b", + .test_cases = blake2b_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(blake2b_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2b"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c index 057c40132246..6832d9aa7b82 100644 --- a/lib/crypto/tests/blake2s_kunit.c +++ b/lib/crypto/tests/blake2s_kunit.c @@ -14,12 +14,12 @@ static void blake2s_default(const u8 *data, size_t len, u8 out[BLAKE2S_HASH_SIZE]) { - blake2s(out, data, NULL, BLAKE2S_HASH_SIZE, len, 0); + blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE); } -static void blake2s_init_default(struct blake2s_state *state) +static void blake2s_init_default(struct blake2s_ctx *ctx) { - blake2s_init(state, BLAKE2S_HASH_SIZE); + blake2s_init(ctx, BLAKE2S_HASH_SIZE); } /* @@ -27,7 +27,7 @@ static void blake2s_init_default(struct blake2s_state *state) * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE. 
*/ #define HASH blake2s_default -#define HASH_CTX blake2s_state +#define HASH_CTX blake2s_ctx #define HASH_SIZE BLAKE2S_HASH_SIZE #define HASH_INIT blake2s_init_default #define HASH_UPDATE blake2s_update @@ -44,19 +44,19 @@ static void test_blake2s_all_key_and_hash_lens(struct kunit *test) u8 *data = &test_buf[0]; u8 *key = data + data_len; u8 *hash = key + BLAKE2S_KEY_SIZE; - struct blake2s_state main_state; + struct blake2s_ctx main_ctx; u8 main_hash[BLAKE2S_HASH_SIZE]; rand_bytes_seeded_from_len(data, data_len); - blake2s_init(&main_state, BLAKE2S_HASH_SIZE); + blake2s_init(&main_ctx, BLAKE2S_HASH_SIZE); for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) { rand_bytes_seeded_from_len(key, key_len); for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) { - blake2s(hash, data, key, out_len, data_len, key_len); - blake2s_update(&main_state, hash, out_len); + blake2s(key, key_len, data, data_len, hash, out_len); + blake2s_update(&main_ctx, hash, out_len); } } - blake2s_final(&main_state, main_hash); + blake2s_final(&main_ctx, main_hash); KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated, BLAKE2S_HASH_SIZE); } @@ -75,21 +75,20 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test) u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len]; u8 hash1[BLAKE2S_HASH_SIZE]; u8 hash2[BLAKE2S_HASH_SIZE]; - struct blake2s_state state; + struct blake2s_ctx ctx; rand_bytes(key, key_len); memcpy(guarded_key, key, key_len); - blake2s(hash1, test_buf, key, - BLAKE2S_HASH_SIZE, data_len, key_len); - blake2s(hash2, test_buf, guarded_key, - BLAKE2S_HASH_SIZE, data_len, key_len); + blake2s(key, key_len, test_buf, data_len, + hash1, BLAKE2S_HASH_SIZE); + blake2s(guarded_key, key_len, test_buf, data_len, + hash2, BLAKE2S_HASH_SIZE); KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); - blake2s_init_key(&state, BLAKE2S_HASH_SIZE, - guarded_key, key_len); - blake2s_update(&state, test_buf, data_len); - blake2s_final(&state, hash2); + blake2s_init_key(&ctx, BLAKE2S_HASH_SIZE, guarded_key, key_len); + blake2s_update(&ctx, test_buf, data_len); + blake2s_final(&ctx, hash2); KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); } } @@ -107,8 +106,8 @@ static void test_blake2s_with_guarded_out_buf(struct kunit *test) u8 hash[BLAKE2S_HASH_SIZE]; u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len]; - blake2s(hash, test_buf, NULL, out_len, data_len, 0); - blake2s(guarded_hash, test_buf, NULL, out_len, data_len, 0); + blake2s(NULL, 0, test_buf, data_len, hash, out_len); + blake2s(NULL, 0, test_buf, data_len, guarded_hash, out_len); KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len); } } diff --git a/lib/crypto/tests/polyval-testvecs.h b/lib/crypto/tests/polyval-testvecs.h new file mode 100644 index 000000000000..3d33f60d58bb --- /dev/null +++ b/lib/crypto/tests/polyval-testvecs.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py polyval */ + +static const struct { + size_t data_len; + u8 digest[POLYVAL_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }, + }, + { + .data_len = 1, + .digest = { + 0xb5, 0x51, 0x69, 0x89, 0xd4, 0x3c, 0x59, 0xca, + 0x6a, 0x1c, 0x2a, 0xe9, 0xa1, 0x9c, 0x6c, 0x83, + }, + }, + { + .data_len = 2, + .digest = { + 0xf4, 0x50, 0xaf, 0x07, 0xda, 0x42, 0xa7, 0x41, + 0x4d, 0x24, 0x88, 0x87, 0xe3, 0x40, 0x73, 0x7c, + }, + }, + { + 
.data_len = 3, + .digest = { + 0x9e, 0x88, 0x78, 0x71, 0x4c, 0x55, 0x87, 0xe8, + 0xb4, 0x96, 0x3d, 0x56, 0xc8, 0xb2, 0xe1, 0x68, + }, + }, + { + .data_len = 16, + .digest = { + 0x9e, 0x81, 0x37, 0x8f, 0x49, 0xf7, 0xa2, 0xe4, + 0x04, 0x45, 0x12, 0x78, 0x45, 0x42, 0x27, 0xad, + }, + }, + { + .data_len = 32, + .digest = { + 0x60, 0x19, 0xd0, 0xa4, 0xf0, 0xde, 0x9e, 0xe7, + 0x6a, 0x89, 0x1a, 0xea, 0x80, 0x14, 0xa9, 0xa3, + }, + }, + { + .data_len = 48, + .digest = { + 0x0c, 0xa2, 0x70, 0x4d, 0x7c, 0x89, 0xac, 0x41, + 0xc2, 0x9e, 0x0d, 0x07, 0x07, 0x6a, 0x7f, 0xd5, + }, + }, + { + .data_len = 49, + .digest = { + 0x91, 0xd3, 0xa9, 0x5c, 0x79, 0x3d, 0x6b, 0x84, + 0x99, 0x54, 0xa7, 0xb4, 0x06, 0x66, 0xfd, 0x1c, + }, + }, + { + .data_len = 63, + .digest = { + 0x29, 0x37, 0xb8, 0xe5, 0xd8, 0x27, 0x4d, 0xfb, + 0x83, 0x4f, 0x67, 0xf7, 0xf9, 0xc1, 0x0a, 0x9d, + }, + }, + { + .data_len = 64, + .digest = { + 0x17, 0xa9, 0x06, 0x2c, 0xf3, 0xe8, 0x2e, 0xa6, + 0x6b, 0xb2, 0x1f, 0x5d, 0x94, 0x3c, 0x02, 0xa2, + }, + }, + { + .data_len = 65, + .digest = { + 0x7c, 0x80, 0x74, 0xd7, 0xa1, 0x37, 0x30, 0x64, + 0x3b, 0xa4, 0xa3, 0x98, 0xde, 0x47, 0x10, 0x23, + }, + }, + { + .data_len = 127, + .digest = { + 0x27, 0x3a, 0xcf, 0xf5, 0xaf, 0x9f, 0xd8, 0xd8, + 0x2d, 0x6a, 0x91, 0xfb, 0xb8, 0xfa, 0xbe, 0x0c, + }, + }, + { + .data_len = 128, + .digest = { + 0x97, 0x6e, 0xc4, 0xbe, 0x6b, 0x15, 0xa6, 0x7c, + 0xc4, 0xa2, 0xb8, 0x0a, 0x0e, 0x9c, 0xc7, 0x3a, + }, + }, + { + .data_len = 129, + .digest = { + 0x2b, 0xc3, 0x98, 0xba, 0x6e, 0x42, 0xf8, 0x18, + 0x85, 0x69, 0x15, 0x37, 0x10, 0x60, 0xe6, 0xac, + }, + }, + { + .data_len = 256, + .digest = { + 0x88, 0x21, 0x77, 0x89, 0xd7, 0x93, 0x90, 0xfc, + 0xf3, 0xb0, 0xe3, 0xfb, 0x14, 0xe2, 0xcf, 0x74, + }, + }, + { + .data_len = 511, + .digest = { + 0x66, 0x3d, 0x3e, 0x08, 0xa0, 0x49, 0x81, 0x68, + 0x3e, 0x3b, 0xc8, 0x80, 0x55, 0xd4, 0x15, 0xe9, + }, + }, + { + .data_len = 513, + .digest = { + 0x05, 0xf5, 0x06, 0x66, 0xe7, 0x11, 0x08, 0x84, + 0xff, 0x94, 0x50, 0x85, 0x65, 0x95, 0x2a, 0x20, + }, + }, + { + .data_len = 1000, + .digest = { + 0xd3, 0xa0, 0x51, 0x69, 0xb5, 0x38, 0xae, 0x1b, + 0xe1, 0xa2, 0x89, 0xc6, 0x8d, 0x2b, 0x62, 0x37, + }, + }, + { + .data_len = 3333, + .digest = { + 0x37, 0x6d, 0x6a, 0x14, 0xdc, 0xa5, 0x37, 0xfc, + 0xfe, 0x67, 0x76, 0xb2, 0x64, 0x68, 0x64, 0x05, + }, + }, + { + .data_len = 4096, + .digest = { + 0xe3, 0x12, 0x0c, 0x58, 0x46, 0x45, 0x27, 0x7a, + 0x0e, 0xa2, 0xfa, 0x2c, 0x35, 0x73, 0x6c, 0x94, + }, + }, + { + .data_len = 4128, + .digest = { + 0x63, 0x0d, 0xa1, 0xbc, 0x6e, 0x3e, 0xd3, 0x1d, + 0x28, 0x52, 0xd2, 0xf4, 0x30, 0x2d, 0xff, 0xc4, + }, + }, + { + .data_len = 4160, + .digest = { + 0xb2, 0x91, 0x49, 0xe2, 0x02, 0x98, 0x00, 0x79, + 0x71, 0xb9, 0xd7, 0xd4, 0xb5, 0x94, 0x6d, 0x7d, + }, + }, + { + .data_len = 4224, + .digest = { + 0x58, 0x96, 0x48, 0x69, 0x05, 0x17, 0xe1, 0x6d, + 0xbc, 0xf2, 0x3d, 0x10, 0x96, 0x00, 0x74, 0x58, + }, + }, + { + .data_len = 16384, + .digest = { + 0x99, 0x3c, 0xcb, 0x4d, 0x64, 0xc9, 0xa9, 0x41, + 0x52, 0x93, 0xfd, 0x65, 0xc4, 0xcc, 0xa5, 0xe5, + }, + }, +}; + +static const u8 hash_testvec_consolidated[POLYVAL_DIGEST_SIZE] = { + 0xdf, 0x68, 0x52, 0x99, 0x92, 0xc3, 0xe8, 0x88, + 0x29, 0x13, 0xc8, 0x35, 0x67, 0xa3, 0xd3, 0xad, +}; + +static const u8 polyval_allones_hashofhashes[POLYVAL_DIGEST_SIZE] = { + 0xd5, 0xf7, 0xfd, 0xb2, 0xa6, 0xef, 0x0b, 0x85, + 0x0d, 0x0a, 0x06, 0x10, 0xbc, 0x64, 0x94, 0x73, +}; diff --git a/lib/crypto/tests/polyval_kunit.c b/lib/crypto/tests/polyval_kunit.c new file mode 100644 index 
000000000000..e59f598c1572 --- /dev/null +++ b/lib/crypto/tests/polyval_kunit.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/polyval.h> +#include "polyval-testvecs.h" + +/* + * A fixed key used when presenting POLYVAL as an unkeyed hash function in order + * to reuse hash-test-template.h. At the beginning of the test suite, this is + * initialized to a key prepared from bytes generated from a fixed seed. + */ +static struct polyval_key test_key; + +static void polyval_init_withtestkey(struct polyval_ctx *ctx) +{ + polyval_init(ctx, &test_key); +} + +static void polyval_withtestkey(const u8 *data, size_t len, + u8 out[POLYVAL_BLOCK_SIZE]) +{ + polyval(&test_key, data, len, out); +} + +/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */ +#define HASH polyval_withtestkey +#define HASH_CTX polyval_ctx +#define HASH_SIZE POLYVAL_BLOCK_SIZE +#define HASH_INIT polyval_init_withtestkey +#define HASH_UPDATE polyval_update +#define HASH_FINAL polyval_final +#include "hash-test-template.h" + +/* + * Test an example from RFC8452 ("AES-GCM-SIV: Nonce Misuse-Resistant + * Authenticated Encryption") to ensure compatibility with that. + */ +static void test_polyval_rfc8452_testvec(struct kunit *test) +{ + static const u8 raw_key[POLYVAL_BLOCK_SIZE] = + "\x31\x07\x28\xd9\x91\x1f\x1f\x38" + "\x37\xb2\x43\x16\xc3\xfa\xb9\xa0"; + static const u8 data[48] = + "\x65\x78\x61\x6d\x70\x6c\x65\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x48\x65\x6c\x6c\x6f\x20\x77\x6f" + "\x72\x6c\x64\x00\x00\x00\x00\x00" + "\x38\x00\x00\x00\x00\x00\x00\x00" + "\x58\x00\x00\x00\x00\x00\x00\x00"; + static const u8 expected_hash[POLYVAL_BLOCK_SIZE] = + "\xad\x7f\xcf\x0b\x51\x69\x85\x16" + "\x62\x67\x2f\x3c\x5f\x95\x13\x8f"; + u8 hash[POLYVAL_BLOCK_SIZE]; + struct polyval_key key; + + polyval_preparekey(&key, raw_key); + polyval(&key, data, sizeof(data), hash); + KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, sizeof(hash)); +} + +/* + * Test a key and messages containing all one bits. This is useful to detect + * overflow bugs in implementations that emulate carryless multiplication using + * a series of standard multiplications with the bits spread out. + */ +static void test_polyval_allones_key_and_message(struct kunit *test) +{ + struct polyval_key key; + struct polyval_ctx hashofhashes_ctx; + u8 hash[POLYVAL_BLOCK_SIZE]; + + static_assert(TEST_BUF_LEN >= 4096); + memset(test_buf, 0xff, 4096); + + polyval_preparekey(&key, test_buf); + polyval_init(&hashofhashes_ctx, &key); + for (size_t len = 0; len <= 4096; len += 16) { + polyval(&key, test_buf, len, hash); + polyval_update(&hashofhashes_ctx, hash, sizeof(hash)); + } + polyval_final(&hashofhashes_ctx, hash); + KUNIT_ASSERT_MEMEQ(test, hash, polyval_allones_hashofhashes, + sizeof(hash)); +} + +#define MAX_LEN_FOR_KEY_CHECK 1024 + +/* + * Given two prepared keys which should be identical (but may differ in + * alignment and/or whether they are followed by a guard page or not), verify + * that they produce consistent results on various data lengths. 
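+ * The keys are first compared byte-for-byte, since a prepared key is just the
+ * precomputed powers of h; the hash comparisons then catch any difference in
+ * how the key is read at hashing time.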
+ */
+static void check_key_consistency(struct kunit *test,
+				   const struct polyval_key *key1,
+				   const struct polyval_key *key2)
+{
+	u8 *data = test_buf;
+	u8 hash1[POLYVAL_BLOCK_SIZE];
+	u8 hash2[POLYVAL_BLOCK_SIZE];
+
+	rand_bytes(data, MAX_LEN_FOR_KEY_CHECK);
+	KUNIT_ASSERT_MEMEQ(test, key1, key2, sizeof(*key1));
+
+	for (int i = 0; i < 100; i++) {
+		size_t len = rand_length(MAX_LEN_FOR_KEY_CHECK);
+
+		polyval(key1, data, len, hash1);
+		polyval(key2, data, len, hash2);
+		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, sizeof(hash1));
+	}
+}
+
+/* Test that no buffer overreads occur on either raw_key or polyval_key. */
+static void test_polyval_with_guarded_key(struct kunit *test)
+{
+	u8 raw_key[POLYVAL_BLOCK_SIZE];
+	u8 *guarded_raw_key = &test_buf[TEST_BUF_LEN - sizeof(raw_key)];
+	struct polyval_key key1, key2;
+	struct polyval_key *guarded_key =
+		(struct polyval_key *)&test_buf[TEST_BUF_LEN - sizeof(key1)];
+
+	/* Prepare with regular buffers. */
+	rand_bytes(raw_key, sizeof(raw_key));
+	polyval_preparekey(&key1, raw_key);
+
+	/* Prepare with guarded raw_key, then check that it works. */
+	memcpy(guarded_raw_key, raw_key, sizeof(raw_key));
+	polyval_preparekey(&key2, guarded_raw_key);
+	check_key_consistency(test, &key1, &key2);
+
+	/* Prepare guarded polyval_key, then check that it works. */
+	polyval_preparekey(guarded_key, raw_key);
+	check_key_consistency(test, &key1, guarded_key);
+}
+
+/*
+ * Test that polyval_key only needs to be aligned to
+ * __alignof__(struct polyval_key), i.e. 8 bytes. The assembly code may prefer
+ * 16-byte or higher alignment, but it mustn't require it.
+ */
+static void test_polyval_with_minimally_aligned_key(struct kunit *test)
+{
+	u8 raw_key[POLYVAL_BLOCK_SIZE];
+	struct polyval_key key;
+	struct polyval_key *minaligned_key =
+		(struct polyval_key *)&test_buf[MAX_LEN_FOR_KEY_CHECK +
+						__alignof__(struct polyval_key)];
+
+	KUNIT_ASSERT_TRUE(test, IS_ALIGNED((uintptr_t)minaligned_key,
+					   __alignof__(struct polyval_key)));
+	KUNIT_ASSERT_TRUE(test,
+			  !IS_ALIGNED((uintptr_t)minaligned_key,
+				      2 * __alignof__(struct polyval_key)));
+
+	rand_bytes(raw_key, sizeof(raw_key));
+	polyval_preparekey(&key, raw_key);
+	polyval_preparekey(minaligned_key, raw_key);
+	check_key_consistency(test, &key, minaligned_key);
+}
+
+struct polyval_irq_test_state {
+	struct polyval_key expected_key;
+	u8 raw_key[POLYVAL_BLOCK_SIZE];
+};
+
+static bool polyval_irq_test_func(void *state_)
+{
+	struct polyval_irq_test_state *state = state_;
+	struct polyval_key key;
+
+	polyval_preparekey(&key, state->raw_key);
+	return memcmp(&key, &state->expected_key, sizeof(key)) == 0;
+}
+
+/*
+ * Test that polyval_preparekey() produces the same output regardless of whether
+ * FPU or vector registers are usable when it is called.
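+ * This matters because the x86 implementation precomputes the key powers with
+ * PCLMULQDQ when irq_fpu_usable() allows it and falls back to the generic
+ * carryless multiplication otherwise; both paths must produce the same key.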
+ */ +static void test_polyval_preparekey_in_irqs(struct kunit *test) +{ + struct polyval_irq_test_state state; + + rand_bytes(state.raw_key, sizeof(state.raw_key)); + polyval_preparekey(&state.expected_key, state.raw_key); + kunit_run_irq_test(test, polyval_irq_test_func, 20000, &state); +} + +static int polyval_suite_init(struct kunit_suite *suite) +{ + u8 raw_key[POLYVAL_BLOCK_SIZE]; + + rand_bytes_seeded_from_len(raw_key, sizeof(raw_key)); + polyval_preparekey(&test_key, raw_key); + return hash_suite_init(suite); +} + +static void polyval_suite_exit(struct kunit_suite *suite) +{ + hash_suite_exit(suite); +} + +static struct kunit_case polyval_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(test_polyval_rfc8452_testvec), + KUNIT_CASE(test_polyval_allones_key_and_message), + KUNIT_CASE(test_polyval_with_guarded_key), + KUNIT_CASE(test_polyval_with_minimally_aligned_key), + KUNIT_CASE(test_polyval_preparekey_in_irqs), + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite polyval_test_suite = { + .name = "polyval", + .test_cases = polyval_test_cases, + .suite_init = polyval_suite_init, + .suite_exit = polyval_suite_exit, +}; +kunit_test_suite(polyval_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for POLYVAL"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha3-testvecs.h b/lib/crypto/tests/sha3-testvecs.h new file mode 100644 index 000000000000..8d614a5fa0c3 --- /dev/null +++ b/lib/crypto/tests/sha3-testvecs.h @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha3 */ + +/* SHA3-256 test vectors */ + +static const struct { + size_t data_len; + u8 digest[SHA3_256_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xa7, 0xff, 0xc6, 0xf8, 0xbf, 0x1e, 0xd7, 0x66, + 0x51, 0xc1, 0x47, 0x56, 0xa0, 0x61, 0xd6, 0x62, + 0xf5, 0x80, 0xff, 0x4d, 0xe4, 0x3b, 0x49, 0xfa, + 0x82, 0xd8, 0x0a, 0x4b, 0x80, 0xf8, 0x43, 0x4a, + }, + }, + { + .data_len = 1, + .digest = { + 0x11, 0x03, 0xe7, 0x84, 0x51, 0x50, 0x86, 0x35, + 0x71, 0x8a, 0x70, 0xe3, 0xc4, 0x26, 0x7b, 0x21, + 0x02, 0x13, 0xa0, 0x81, 0xe8, 0xe6, 0x14, 0x25, + 0x07, 0x34, 0xe5, 0xc5, 0x40, 0x06, 0xf2, 0x8b, + }, + }, + { + .data_len = 2, + .digest = { + 0x2f, 0x6f, 0x6d, 0x47, 0x48, 0x52, 0x11, 0xb9, + 0xe4, 0x3d, 0xc8, 0x71, 0xcf, 0xb2, 0xee, 0xae, + 0x5b, 0xf4, 0x12, 0x84, 0x5b, 0x1c, 0xec, 0x6c, + 0xc1, 0x66, 0x88, 0xaa, 0xc3, 0x40, 0xbd, 0x7e, + }, + }, + { + .data_len = 3, + .digest = { + 0xec, 0x02, 0xe8, 0x81, 0x4f, 0x84, 0x41, 0x69, + 0x06, 0xd8, 0xdc, 0x1d, 0x01, 0x78, 0xd7, 0xcb, + 0x39, 0xdf, 0xd3, 0x12, 0x1c, 0x99, 0xfd, 0xf3, + 0x5c, 0x83, 0xc9, 0xc2, 0x7a, 0x7b, 0x6a, 0x05, + }, + }, + { + .data_len = 16, + .digest = { + 0xff, 0x6f, 0xc3, 0x41, 0xc3, 0x5f, 0x34, 0x6d, + 0xa7, 0xdf, 0x3e, 0xc2, 0x8b, 0x29, 0xb6, 0xf1, + 0xf8, 0x67, 0xfd, 0xcd, 0xb1, 0x9f, 0x38, 0x08, + 0x1d, 0x8d, 0xd9, 0xc2, 0x43, 0x66, 0x18, 0x6c, + }, + }, + { + .data_len = 32, + .digest = { + 0xe4, 0xb1, 0x06, 0x17, 0xf8, 0x8b, 0x91, 0x95, + 0xe7, 0x57, 0x66, 0xac, 0x08, 0xb2, 0x03, 0x3e, + 0xf7, 0x84, 0x1f, 0xe3, 0x25, 0xa3, 0x11, 0xd2, + 0x11, 0xa4, 0x78, 0x74, 0x2a, 0x43, 0x20, 0xa5, + }, + }, + { + .data_len = 48, + .digest = { + 0xeb, 0x57, 0x5f, 0x20, 0xa3, 0x6b, 0xc7, 0xb4, + 0x66, 0x2a, 0xa0, 0x30, 0x3b, 0x52, 0x00, 0xc9, + 0xce, 0x6a, 0xd8, 0x1e, 0xbe, 0xed, 0xa1, 0xd1, + 0xbe, 0x63, 0xc7, 0xe1, 0xe2, 0x66, 0x67, 0x0c, + }, + }, + { + .data_len = 49, + .digest = { + 0xf0, 0x67, 0xad, 0x66, 0xbe, 0xec, 0x5a, 
0xfd, + 0x29, 0xd2, 0x4f, 0x1d, 0xb2, 0x24, 0xb8, 0x90, + 0x05, 0x28, 0x0e, 0x66, 0x67, 0x74, 0x2d, 0xee, + 0x66, 0x25, 0x11, 0xd1, 0x76, 0xa2, 0xfc, 0x3a, + }, + }, + { + .data_len = 63, + .digest = { + 0x57, 0x56, 0x21, 0xb3, 0x2d, 0x2d, 0xe1, 0x9d, + 0xbf, 0x2c, 0x82, 0xa8, 0xad, 0x7e, 0x6c, 0x46, + 0xfb, 0x30, 0xeb, 0xce, 0xcf, 0xed, 0x2d, 0x65, + 0xe7, 0xe4, 0x96, 0x69, 0xe0, 0x48, 0xd2, 0xb6, + }, + }, + { + .data_len = 64, + .digest = { + 0x7b, 0xba, 0x67, 0x15, 0xe5, 0x21, 0xc4, 0x69, + 0xd3, 0xef, 0x5c, 0x97, 0x9f, 0x5b, 0xba, 0x9c, + 0xfa, 0x55, 0x64, 0xec, 0xb5, 0x37, 0x53, 0x1b, + 0x3f, 0x4c, 0x0a, 0xed, 0x51, 0x98, 0x2b, 0x52, + }, + }, + { + .data_len = 65, + .digest = { + 0x44, 0xb6, 0x6b, 0x83, 0x09, 0x83, 0x55, 0x83, + 0xde, 0x1f, 0xcc, 0x33, 0xef, 0xdc, 0x05, 0xbb, + 0x3b, 0x63, 0x76, 0x45, 0xe4, 0x8e, 0x14, 0x7a, + 0x2d, 0xae, 0x90, 0xce, 0x68, 0xc3, 0xa4, 0xf2, + }, + }, + { + .data_len = 127, + .digest = { + 0x50, 0x3e, 0x99, 0x4e, 0x28, 0x2b, 0xc9, 0xf4, + 0xf5, 0xeb, 0x2b, 0x16, 0x04, 0x2d, 0xf5, 0xbe, + 0xc0, 0x91, 0x41, 0x2a, 0x8e, 0x69, 0x5e, 0x39, + 0x53, 0x2c, 0xc1, 0x18, 0xa5, 0xeb, 0xd8, 0xda, + }, + }, + { + .data_len = 128, + .digest = { + 0x90, 0x0b, 0xa6, 0x92, 0x84, 0x30, 0xaf, 0xee, + 0x38, 0x59, 0x83, 0x83, 0xe9, 0xfe, 0xab, 0x86, + 0x79, 0x1b, 0xcd, 0xe7, 0x0a, 0x0f, 0x58, 0x53, + 0x36, 0xab, 0x12, 0xe1, 0x5c, 0x97, 0xc1, 0xfb, + }, + }, + { + .data_len = 129, + .digest = { + 0x2b, 0x52, 0x1e, 0x54, 0xbe, 0x38, 0x4c, 0x3e, + 0x73, 0x37, 0x18, 0xf5, 0x25, 0x2c, 0xc8, 0xc7, + 0xda, 0x7e, 0xb6, 0x47, 0x9d, 0xf4, 0x46, 0xce, + 0xfa, 0x80, 0x20, 0x6b, 0xbd, 0xfd, 0x2a, 0xd8, + }, + }, + { + .data_len = 256, + .digest = { + 0x45, 0xf0, 0xf5, 0x9b, 0xd9, 0x91, 0x26, 0xd5, + 0x91, 0x3b, 0xf8, 0x87, 0x8b, 0x34, 0x02, 0x31, + 0x64, 0xab, 0xf4, 0x1c, 0x6e, 0x34, 0x72, 0xdf, + 0x32, 0x6d, 0xe5, 0xd2, 0x67, 0x5e, 0x86, 0x93, + }, + }, + { + .data_len = 511, + .digest = { + 0xb3, 0xaf, 0x71, 0x64, 0xfa, 0xd4, 0xf1, 0x07, + 0x38, 0xef, 0x04, 0x8e, 0x89, 0xf4, 0x02, 0xd2, + 0xa5, 0xaf, 0x3b, 0xf5, 0x67, 0x56, 0xcf, 0xa9, + 0x8e, 0x43, 0xf5, 0xb5, 0xe3, 0x91, 0x8e, 0xe7, + }, + }, + { + .data_len = 513, + .digest = { + 0x51, 0xac, 0x0a, 0x65, 0xb7, 0x96, 0x20, 0xcf, + 0x88, 0xf6, 0x97, 0x35, 0x89, 0x0d, 0x31, 0x0f, + 0xbe, 0x17, 0xbe, 0x62, 0x03, 0x67, 0xc0, 0xee, + 0x4f, 0xc1, 0xe3, 0x7f, 0x6f, 0xab, 0xac, 0xb4, + }, + }, + { + .data_len = 1000, + .digest = { + 0x7e, 0xea, 0xa8, 0xd7, 0xde, 0x20, 0x1b, 0x58, + 0x24, 0xd8, 0x26, 0x40, 0x36, 0x5f, 0x3f, 0xaa, + 0xe5, 0x5a, 0xea, 0x98, 0x58, 0xd4, 0xd6, 0xfc, + 0x20, 0x4c, 0x5c, 0x4f, 0xaf, 0x56, 0xc7, 0xc3, + }, + }, + { + .data_len = 3333, + .digest = { + 0x61, 0xb1, 0xb1, 0x3e, 0x0e, 0x7e, 0x90, 0x3d, + 0x31, 0x54, 0xbd, 0xc9, 0x0d, 0x53, 0x62, 0xf1, + 0xcd, 0x18, 0x80, 0xf9, 0x91, 0x75, 0x41, 0xb3, + 0x51, 0x39, 0x57, 0xa7, 0xa8, 0x1e, 0xfb, 0xc9, + }, + }, + { + .data_len = 4096, + .digest = { + 0xab, 0x29, 0xda, 0x10, 0xc4, 0x11, 0x2d, 0x5c, + 0xd1, 0xce, 0x1c, 0x95, 0xfa, 0xc6, 0xc7, 0xb0, + 0x1b, 0xd1, 0xdc, 0x6f, 0xa0, 0x9d, 0x1b, 0x23, + 0xfb, 0x6e, 0x90, 0x97, 0xd0, 0x75, 0x44, 0x7a, + }, + }, + { + .data_len = 4128, + .digest = { + 0x02, 0x45, 0x95, 0xf4, 0x19, 0xb5, 0x93, 0x29, + 0x90, 0xf2, 0x63, 0x3f, 0x89, 0xe8, 0xa5, 0x31, + 0x76, 0xf2, 0x89, 0x79, 0x66, 0xd3, 0x96, 0xdf, + 0x33, 0xd1, 0xa6, 0x17, 0x73, 0xb1, 0xd0, 0x45, + }, + }, + { + .data_len = 4160, + .digest = { + 0xd1, 0x8e, 0x22, 0xea, 0x44, 0x87, 0x6e, 0x9d, + 0xfb, 0x36, 0x02, 0x20, 0x63, 0xb7, 0x69, 0x45, + 0x25, 0x41, 0x69, 0xe0, 0x9b, 0x87, 
0xcf, 0xa3, + 0x51, 0xbb, 0xfc, 0x8d, 0xf7, 0x29, 0xa7, 0xea, + }, + }, + { + .data_len = 4224, + .digest = { + 0x11, 0x86, 0x7d, 0x84, 0xf9, 0x8c, 0x6e, 0xc4, + 0x64, 0x36, 0xc6, 0xf3, 0x42, 0x92, 0x31, 0x2b, + 0x1e, 0x12, 0xe6, 0x4d, 0xbe, 0xfa, 0x77, 0x3f, + 0x89, 0x41, 0x33, 0x58, 0x1c, 0x98, 0x16, 0x0a, + }, + }, + { + .data_len = 16384, + .digest = { + 0xb2, 0xba, 0x0c, 0x8c, 0x9d, 0xbb, 0x1e, 0xb0, + 0x03, 0xb5, 0xdf, 0x4f, 0xf5, 0x35, 0xdb, 0xec, + 0x60, 0xf2, 0x5b, 0xb6, 0xd0, 0x49, 0xd3, 0xed, + 0x55, 0xc0, 0x7a, 0xd7, 0xaf, 0xa1, 0xea, 0x53, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA3_256_DIGEST_SIZE] = { + 0x3b, 0x33, 0x67, 0xf8, 0xea, 0x92, 0x78, 0x62, + 0xdd, 0xbe, 0x72, 0x15, 0xbd, 0x6f, 0xfa, 0xe5, + 0x5e, 0xab, 0x9f, 0xb1, 0xe4, 0x23, 0x7c, 0x2c, + 0x80, 0xcf, 0x09, 0x75, 0xf8, 0xe2, 0xfa, 0x30, +}; + +/* SHAKE test vectors */ + +static const u8 shake128_testvec_consolidated[SHA3_256_DIGEST_SIZE] = { + 0x89, 0x88, 0x3a, 0x44, 0xec, 0xfe, 0x3c, 0xeb, + 0x2f, 0x1c, 0x1d, 0xda, 0x9e, 0x36, 0x64, 0xf0, + 0x85, 0x4c, 0x49, 0x12, 0x76, 0x5a, 0x4d, 0xe7, + 0xa8, 0xfd, 0xcd, 0xbe, 0x45, 0xb4, 0x6f, 0xb0, +}; + +static const u8 shake256_testvec_consolidated[SHA3_256_DIGEST_SIZE] = { + 0x5a, 0xfd, 0x66, 0x62, 0x5c, 0x37, 0x2b, 0x41, + 0x77, 0x1c, 0x01, 0x5d, 0x64, 0x7c, 0x63, 0x7a, + 0x7c, 0x76, 0x9e, 0xa8, 0xd1, 0xb0, 0x8e, 0x02, + 0x16, 0x9b, 0xfe, 0x0e, 0xb5, 0xd8, 0x6a, 0xb5, +}; diff --git a/lib/crypto/tests/sha3_kunit.c b/lib/crypto/tests/sha3_kunit.c new file mode 100644 index 000000000000..ed5fbe80337f --- /dev/null +++ b/lib/crypto/tests/sha3_kunit.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ +#include <crypto/sha3.h> +#include "sha3-testvecs.h" + +#define HASH sha3_256 +#define HASH_CTX sha3_ctx +#define HASH_SIZE SHA3_256_DIGEST_SIZE +#define HASH_INIT sha3_256_init +#define HASH_UPDATE sha3_update +#define HASH_FINAL sha3_final +#include "hash-test-template.h" + +/* + * Sample message and the output generated for various algorithms by passing it + * into "openssl sha3-224" etc.. 
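+ * Each expected digest below is embedded between 8-byte zero guard regions so
+ * that these basic tests also catch out-of-bounds writes by the one-shot
+ * functions.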
+ */ +static const u8 test_sha3_sample[] = + "The quick red fox jumped over the lazy brown dog!\n" + "The quick red fox jumped over the lazy brown dog!\n" + "The quick red fox jumped over the lazy brown dog!\n" + "The quick red fox jumped over the lazy brown dog!\n"; + +static const u8 test_sha3_224[8 + SHA3_224_DIGEST_SIZE + 8] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */ + 0xd6, 0xe8, 0xd8, 0x80, 0xfa, 0x42, 0x80, 0x70, + 0x7e, 0x7f, 0xd7, 0xd2, 0xd7, 0x7a, 0x35, 0x65, + 0xf0, 0x0b, 0x4f, 0x9f, 0x2a, 0x33, 0xca, 0x0a, + 0xef, 0xa6, 0x4c, 0xb8, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */ +}; + +static const u8 test_sha3_256[8 + SHA3_256_DIGEST_SIZE + 8] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */ + 0xdb, 0x3b, 0xb0, 0xb8, 0x8d, 0x15, 0x78, 0xe5, + 0x78, 0x76, 0x8e, 0x39, 0x7e, 0x89, 0x86, 0xb9, + 0x14, 0x3a, 0x1e, 0xe7, 0x96, 0x7c, 0xf3, 0x25, + 0x70, 0xbd, 0xc3, 0xa9, 0xae, 0x63, 0x71, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */ +}; + +static const u8 test_sha3_384[8 + SHA3_384_DIGEST_SIZE + 8] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */ + 0x2d, 0x4b, 0x29, 0x85, 0x19, 0x94, 0xaa, 0x31, + 0x9b, 0x04, 0x9d, 0x6e, 0x79, 0x66, 0xc7, 0x56, + 0x8a, 0x2e, 0x99, 0x84, 0x06, 0xcf, 0x10, 0x2d, + 0xec, 0xf0, 0x03, 0x04, 0x1f, 0xd5, 0x99, 0x63, + 0x2f, 0xc3, 0x2b, 0x0d, 0xd9, 0x45, 0xf7, 0xbb, + 0x0a, 0xc3, 0x46, 0xab, 0xfe, 0x4d, 0x94, 0xc2, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */ +}; + +static const u8 test_sha3_512[8 + SHA3_512_DIGEST_SIZE + 8] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */ + 0xdd, 0x71, 0x3b, 0x44, 0xb6, 0x6c, 0xd7, 0x78, + 0xe7, 0x93, 0xa1, 0x4c, 0xd7, 0x24, 0x16, 0xf1, + 0xfd, 0xa2, 0x82, 0x4e, 0xed, 0x59, 0xe9, 0x83, + 0x15, 0x38, 0x89, 0x7d, 0x39, 0x17, 0x0c, 0xb2, + 0xcf, 0x12, 0x80, 0x78, 0xa1, 0x78, 0x41, 0xeb, + 0xed, 0x21, 0x4c, 0xa4, 0x4a, 0x5f, 0x30, 0x1a, + 0x70, 0x98, 0x4f, 0x14, 0xa2, 0xd1, 0x64, 0x1b, + 0xc2, 0x0a, 0xff, 0x3b, 0xe8, 0x26, 0x41, 0x8f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */ +}; + +static const u8 test_shake128[8 + SHAKE128_DEFAULT_SIZE + 8] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */ + 0x41, 0xd6, 0xb8, 0x9c, 0xf8, 0xe8, 0x54, 0xf2, + 0x5c, 0xde, 0x51, 0x12, 0xaf, 0x9e, 0x0d, 0x91, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */ +}; + +static const u8 test_shake256[8 + SHAKE256_DEFAULT_SIZE + 8] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */ + 0xab, 0x06, 0xd4, 0xf9, 0x8b, 0xfd, 0xb2, 0xc4, + 0xfe, 0xf1, 0xcc, 0xe2, 0x40, 0x45, 0xdd, 0x15, + 0xcb, 0xdd, 0x02, 0x8d, 0xb7, 0x9f, 0x1e, 0x67, + 0xd6, 0x7f, 0x98, 0x5e, 0x1b, 0x19, 0xf8, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */ +}; + +static void test_sha3_224_basic(struct kunit *test) +{ + u8 out[8 + SHA3_224_DIGEST_SIZE + 8]; + + BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_224)); + + memset(out, 0, sizeof(out)); + sha3_224(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8); + + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_224, sizeof(test_sha3_224), + "SHA3-224 gives wrong output"); +} + +static void test_sha3_256_basic(struct kunit *test) +{ + u8 out[8 + SHA3_256_DIGEST_SIZE + 8]; + + BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_256)); + + memset(out, 0, sizeof(out)); + sha3_256(test_sha3_sample, 
sizeof(test_sha3_sample) - 1, out + 8); + + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_256, sizeof(test_sha3_256), + "SHA3-256 gives wrong output"); +} + +static void test_sha3_384_basic(struct kunit *test) +{ + u8 out[8 + SHA3_384_DIGEST_SIZE + 8]; + + BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_384)); + + memset(out, 0, sizeof(out)); + sha3_384(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8); + + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_384, sizeof(test_sha3_384), + "SHA3-384 gives wrong output"); +} + +static void test_sha3_512_basic(struct kunit *test) +{ + u8 out[8 + SHA3_512_DIGEST_SIZE + 8]; + + BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_512)); + + memset(out, 0, sizeof(out)); + sha3_512(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8); + + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_512, sizeof(test_sha3_512), + "SHA3-512 gives wrong output"); +} + +static void test_shake128_basic(struct kunit *test) +{ + u8 out[8 + SHAKE128_DEFAULT_SIZE + 8]; + + BUILD_BUG_ON(sizeof(out) != sizeof(test_shake128)); + + memset(out, 0, sizeof(out)); + shake128(test_sha3_sample, sizeof(test_sha3_sample) - 1, + out + 8, sizeof(out) - 16); + + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128, sizeof(test_shake128), + "SHAKE128 gives wrong output"); +} + +static void test_shake256_basic(struct kunit *test) +{ + u8 out[8 + SHAKE256_DEFAULT_SIZE + 8]; + + BUILD_BUG_ON(sizeof(out) != sizeof(test_shake256)); + + memset(out, 0, sizeof(out)); + shake256(test_sha3_sample, sizeof(test_sha3_sample) - 1, + out + 8, sizeof(out) - 16); + + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256, sizeof(test_shake256), + "SHAKE256 gives wrong output"); +} + +/* + * Usable NIST tests. + * + * From: https://csrc.nist.gov/projects/cryptographic-standards-and-guidelines/example-values + */ +static const u8 test_nist_1600_sample[] = { + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, + 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3 +}; + +static const u8 test_shake128_nist_0[] = { + 0x7f, 0x9c, 0x2b, 0xa4, 0xe8, 0x8f, 0x82, 0x7d, + 0x61, 0x60, 0x45, 0x50, 0x76, 0x05, 0x85, 0x3e +}; + +static const u8 test_shake128_nist_1600[] = { + 0x13, 0x1a, 0xb8, 0xd2, 0xb5, 0x94, 0x94, 0x6b, + 0x9c, 0x81, 0x33, 0x3f, 0x9b, 0xb6, 0xe0, 0xce, +}; + +static const u8 test_shake256_nist_0[] = { + 0x46, 0xb9, 0xdd, 0x2b, 0x0b, 0xa8, 0x8d, 0x13, + 0x23, 0x3b, 0x3f, 0xeb, 0x74, 0x3e, 0xeb, 0x24, + 
0x3f, 0xcd, 0x52, 0xea, 0x62, 0xb8, 0x1b, 0x82, + 0xb5, 0x0c, 0x27, 0x64, 0x6e, 0xd5, 0x76, 0x2f +}; + +static const u8 test_shake256_nist_1600[] = { + 0xcd, 0x8a, 0x92, 0x0e, 0xd1, 0x41, 0xaa, 0x04, + 0x07, 0xa2, 0x2d, 0x59, 0x28, 0x86, 0x52, 0xe9, + 0xd9, 0xf1, 0xa7, 0xee, 0x0c, 0x1e, 0x7c, 0x1c, + 0xa6, 0x99, 0x42, 0x4d, 0xa8, 0x4a, 0x90, 0x4d, +}; + +static void test_shake128_nist(struct kunit *test) +{ + u8 out[SHAKE128_DEFAULT_SIZE]; + + shake128("", 0, out, sizeof(out)); + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_0, sizeof(out), + "SHAKE128 gives wrong output for NIST.0"); + + shake128(test_nist_1600_sample, sizeof(test_nist_1600_sample), + out, sizeof(out)); + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_1600, sizeof(out), + "SHAKE128 gives wrong output for NIST.1600"); +} + +static void test_shake256_nist(struct kunit *test) +{ + u8 out[SHAKE256_DEFAULT_SIZE]; + + shake256("", 0, out, sizeof(out)); + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_0, sizeof(out), + "SHAKE256 gives wrong output for NIST.0"); + + shake256(test_nist_1600_sample, sizeof(test_nist_1600_sample), + out, sizeof(out)); + KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_1600, sizeof(out), + "SHAKE256 gives wrong output for NIST.1600"); +} + +static void shake(int alg, const u8 *in, size_t in_len, u8 *out, size_t out_len) +{ + if (alg == 0) + shake128(in, in_len, out, out_len); + else + shake256(in, in_len, out, out_len); +} + +static void shake_init(struct shake_ctx *ctx, int alg) +{ + if (alg == 0) + shake128_init(ctx); + else + shake256_init(ctx); +} + +/* + * Test each of SHAKE128 and SHAKE256 with all input lengths 0 through 4096, for + * both input and output. The input and output lengths cycle through the values + * together, so we do 4096 tests total. To verify all the SHAKE outputs, + * compute and verify the SHA3-256 digest of all of them concatenated together. + */ +static void test_shake_all_lens_up_to_4096(struct kunit *test) +{ + struct sha3_ctx main_ctx; + const size_t max_len = 4096; + u8 *const in = test_buf; + u8 *const out = &test_buf[TEST_BUF_LEN - max_len]; + u8 main_hash[SHA3_256_DIGEST_SIZE]; + + KUNIT_ASSERT_LE(test, 2 * max_len, TEST_BUF_LEN); + + rand_bytes_seeded_from_len(in, max_len); + for (int alg = 0; alg < 2; alg++) { + sha3_256_init(&main_ctx); + for (size_t in_len = 0; in_len <= max_len; in_len++) { + size_t out_len = (in_len * 293) % (max_len + 1); + + shake(alg, in, in_len, out, out_len); + sha3_update(&main_ctx, out, out_len); + } + sha3_final(&main_ctx, main_hash); + if (alg == 0) + KUNIT_ASSERT_MEMEQ_MSG(test, main_hash, + shake128_testvec_consolidated, + sizeof(main_hash), + "shake128() gives wrong output"); + else + KUNIT_ASSERT_MEMEQ_MSG(test, main_hash, + shake256_testvec_consolidated, + sizeof(main_hash), + "shake256() gives wrong output"); + } +} + +/* + * Test that a sequence of SHAKE squeezes gives the same output as a single + * squeeze of the same total length. 
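+ * The squeeze lengths are chosen at random, so this also covers squeezes that
+ * stop and resume partway through a Keccak block.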
+ */ +static void test_shake_multiple_squeezes(struct kunit *test) +{ + const size_t max_len = 512; + u8 *ref_out; + + KUNIT_ASSERT_GE(test, TEST_BUF_LEN, 2 * max_len); + + ref_out = kunit_kzalloc(test, max_len, GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ref_out); + + for (int i = 0; i < 2000; i++) { + const int alg = rand32() % 2; + const size_t in_len = rand_length(max_len); + const size_t out_len = rand_length(max_len); + const size_t in_offs = rand_offset(max_len - in_len); + const size_t out_offs = rand_offset(max_len - out_len); + u8 *const in = &test_buf[in_offs]; + u8 *const out = &test_buf[out_offs]; + struct shake_ctx ctx; + size_t remaining_len, j, num_parts; + + rand_bytes(in, in_len); + rand_bytes(out, out_len); + + /* Compute the output using the one-shot function. */ + shake(alg, in, in_len, ref_out, out_len); + + /* Compute the output using a random sequence of squeezes. */ + shake_init(&ctx, alg); + shake_update(&ctx, in, in_len); + remaining_len = out_len; + j = 0; + num_parts = 0; + while (rand_bool()) { + size_t part_len = rand_length(remaining_len); + + shake_squeeze(&ctx, &out[j], part_len); + num_parts++; + j += part_len; + remaining_len -= part_len; + } + if (remaining_len != 0 || rand_bool()) { + shake_squeeze(&ctx, &out[j], remaining_len); + num_parts++; + } + + /* Verify that the outputs are the same. */ + KUNIT_ASSERT_MEMEQ_MSG( + test, out, ref_out, out_len, + "Multi-squeeze test failed with in_len=%zu in_offs=%zu out_len=%zu out_offs=%zu num_parts=%zu alg=%d", + in_len, in_offs, out_len, out_offs, num_parts, alg); + } +} + +/* + * Test that SHAKE operations on buffers immediately followed by an unmapped + * page work as expected. This catches out-of-bounds memory accesses even if + * they occur in assembly code. + */ +static void test_shake_with_guarded_bufs(struct kunit *test) +{ + const size_t max_len = 512; + u8 *reg_buf; + + KUNIT_ASSERT_GE(test, TEST_BUF_LEN, max_len); + + reg_buf = kunit_kzalloc(test, max_len, GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, reg_buf); + + for (int alg = 0; alg < 2; alg++) { + for (size_t len = 0; len <= max_len; len++) { + u8 *guarded_buf = &test_buf[TEST_BUF_LEN - len]; + + rand_bytes(reg_buf, len); + memcpy(guarded_buf, reg_buf, len); + + shake(alg, reg_buf, len, reg_buf, len); + shake(alg, guarded_buf, len, guarded_buf, len); + + KUNIT_ASSERT_MEMEQ_MSG( + test, reg_buf, guarded_buf, len, + "Guard page test failed with len=%zu alg=%d", + len, alg); + } + } +} + +static struct kunit_case sha3_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(test_sha3_224_basic), + KUNIT_CASE(test_sha3_256_basic), + KUNIT_CASE(test_sha3_384_basic), + KUNIT_CASE(test_sha3_512_basic), + KUNIT_CASE(test_shake128_basic), + KUNIT_CASE(test_shake256_basic), + KUNIT_CASE(test_shake128_nist), + KUNIT_CASE(test_shake256_nist), + KUNIT_CASE(test_shake_all_lens_up_to_4096), + KUNIT_CASE(test_shake_multiple_squeezes), + KUNIT_CASE(test_shake_with_guarded_bufs), + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite sha3_test_suite = { + .name = "sha3", + .test_cases = sha3_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(sha3_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA3"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S index ef8e9f427aab..7b1d98ca7482 100644 --- a/lib/crypto/x86/blake2s-core.S +++ b/lib/crypto/x86/blake2s-core.S @@ -6,19 +6,25 @@ #include <linux/linkage.h> -.section .rodata.cst32.BLAKE2S_IV, "aM", 
@progbits, 32 +.section .rodata.cst32.iv, "aM", @progbits, 32 .align 32 -IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 +.Liv: + .octa 0xA54FF53A3C6EF372BB67AE856A09E667 .octa 0x5BE0CD191F83D9AB9B05688C510E527F -.section .rodata.cst16.ROT16, "aM", @progbits, 16 + +.section .rodata.cst16.ror16, "aM", @progbits, 16 .align 16 -ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 -.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.Lror16: + .octa 0x0D0C0F0E09080B0A0504070601000302 + +.section .rodata.cst16.ror8, "aM", @progbits, 16 .align 16 -ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 -.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.Lror8: + .octa 0x0C0F0E0D080B0A090407060500030201 + +.section .rodata.cst64.sigma, "aM", @progbits, 160 .align 64 -SIGMA: +.Lsigma: .byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 .byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 .byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 @@ -29,9 +35,10 @@ SIGMA: .byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 .byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 .byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160 + +.section .rodata.cst64.sigma2, "aM", @progbits, 160 .align 64 -SIGMA2: +.Lsigma2: .byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 .byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 .byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 @@ -43,36 +50,52 @@ SIGMA2: .byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 .byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 +#define CTX %rdi +#define DATA %rsi +#define NBLOCKS %rdx +#define INC %ecx + .text +// +// void blake2s_compress_ssse3(struct blake2s_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2s_ctx are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// SYM_FUNC_START(blake2s_compress_ssse3) - testq %rdx,%rdx - je .Lendofloop - movdqu (%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqa ROT16(%rip),%xmm12 - movdqa ROR328(%rip),%xmm13 - movdqu 0x20(%rdi),%xmm14 - movq %rcx,%xmm15 - leaq SIGMA+0xa0(%rip),%r8 - jmp .Lbeginofloop + movdqu (CTX),%xmm0 // Load h[0..3] + movdqu 16(CTX),%xmm1 // Load h[4..7] + movdqa .Lror16(%rip),%xmm12 + movdqa .Lror8(%rip),%xmm13 + movdqu 32(CTX),%xmm14 // Load t and f + movd INC,%xmm15 // Load inc + leaq .Lsigma+160(%rip),%r8 + jmp .Lssse3_mainloop + .align 32 -.Lbeginofloop: - movdqa %xmm0,%xmm10 - movdqa %xmm1,%xmm11 - paddq %xmm15,%xmm14 - movdqa IV(%rip),%xmm2 +.Lssse3_mainloop: + // Main loop: each iteration processes one 64-byte block. + movdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3] + movdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7] + paddq %xmm15,%xmm14 // t += inc (64-bit addition) + movdqa .Liv(%rip),%xmm2 // v[8..11] = iv[0..3] movdqa %xmm14,%xmm3 - pxor IV+0x10(%rip),%xmm3 - leaq SIGMA(%rip),%rcx -.Lroundloop: + pxor .Liv+16(%rip),%xmm3 // v[12..15] = iv[4..7] ^ [t, f] + leaq .Lsigma(%rip),%rcx + +.Lssse3_roundloop: + // Round loop: each iteration does 1 round (of 10 rounds total). 
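+	// The sigma bytes select the message words for this round; each word
+	// is gathered with movd and combined with punpck before the G function
+	// is applied column-wise and then diagonal-wise.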
movzbl (%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0x1(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x2(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x3(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 + movd (DATA,%rax,4),%xmm4 + movzbl 1(%rcx),%eax + movd (DATA,%rax,4),%xmm5 + movzbl 2(%rcx),%eax + movd (DATA,%rax,4),%xmm6 + movzbl 3(%rcx),%eax + movd (DATA,%rax,4),%xmm7 punpckldq %xmm5,%xmm4 punpckldq %xmm7,%xmm6 punpcklqdq %xmm6,%xmm4 @@ -83,17 +106,17 @@ SYM_FUNC_START(blake2s_compress_ssse3) paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 + psrld $12,%xmm1 + pslld $20,%xmm8 por %xmm8,%xmm1 - movzbl 0x4(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x5(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x6(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0x7(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 + movzbl 4(%rcx),%eax + movd (DATA,%rax,4),%xmm5 + movzbl 5(%rcx),%eax + movd (DATA,%rax,4),%xmm6 + movzbl 6(%rcx),%eax + movd (DATA,%rax,4),%xmm7 + movzbl 7(%rcx),%eax + movd (DATA,%rax,4),%xmm4 punpckldq %xmm6,%xmm5 punpckldq %xmm4,%xmm7 punpcklqdq %xmm7,%xmm5 @@ -104,20 +127,20 @@ SYM_FUNC_START(blake2s_compress_ssse3) paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 + psrld $7,%xmm1 + pslld $25,%xmm8 por %xmm8,%xmm1 pshufd $0x93,%xmm0,%xmm0 pshufd $0x4e,%xmm3,%xmm3 pshufd $0x39,%xmm2,%xmm2 - movzbl 0x8(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x9(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xa(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xb(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 + movzbl 8(%rcx),%eax + movd (DATA,%rax,4),%xmm6 + movzbl 9(%rcx),%eax + movd (DATA,%rax,4),%xmm7 + movzbl 10(%rcx),%eax + movd (DATA,%rax,4),%xmm4 + movzbl 11(%rcx),%eax + movd (DATA,%rax,4),%xmm5 punpckldq %xmm7,%xmm6 punpckldq %xmm5,%xmm4 punpcklqdq %xmm4,%xmm6 @@ -128,17 +151,17 @@ SYM_FUNC_START(blake2s_compress_ssse3) paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 + psrld $12,%xmm1 + pslld $20,%xmm8 por %xmm8,%xmm1 - movzbl 0xc(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xd(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xe(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0xf(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 + movzbl 12(%rcx),%eax + movd (DATA,%rax,4),%xmm7 + movzbl 13(%rcx),%eax + movd (DATA,%rax,4),%xmm4 + movzbl 14(%rcx),%eax + movd (DATA,%rax,4),%xmm5 + movzbl 15(%rcx),%eax + movd (DATA,%rax,4),%xmm6 punpckldq %xmm4,%xmm7 punpckldq %xmm6,%xmm5 punpcklqdq %xmm5,%xmm7 @@ -149,53 +172,68 @@ SYM_FUNC_START(blake2s_compress_ssse3) paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 + psrld $7,%xmm1 + pslld $25,%xmm8 por %xmm8,%xmm1 pshufd $0x39,%xmm0,%xmm0 pshufd $0x4e,%xmm3,%xmm3 pshufd $0x93,%xmm2,%xmm2 - addq $0x10,%rcx + addq $16,%rcx cmpq %r8,%rcx - jnz .Lroundloop + jnz .Lssse3_roundloop + + // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15] pxor %xmm2,%xmm0 pxor %xmm3,%xmm1 pxor %xmm10,%xmm0 pxor %xmm11,%xmm1 - addq $0x40,%rsi - decq %rdx - jnz .Lbeginofloop - movdqu %xmm0,(%rdi) - movdqu %xmm1,0x10(%rdi) - movdqu %xmm14,0x20(%rdi) -.Lendofloop: + addq $64,DATA + decq NBLOCKS + jnz .Lssse3_mainloop + + movdqu %xmm0,(CTX) // Store new h[0..3] + movdqu %xmm1,16(CTX) // Store new h[4..7] + movq %xmm14,32(CTX) // Store new t (f is unchanged) RET SYM_FUNC_END(blake2s_compress_ssse3) +// +// void blake2s_compress_avx512(struct blake2s_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); +// +// Only the first three fields 
of struct blake2s_ctx are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// SYM_FUNC_START(blake2s_compress_avx512) - vmovdqu (%rdi),%xmm0 - vmovdqu 0x10(%rdi),%xmm1 - vmovdqu 0x20(%rdi),%xmm4 - vmovq %rcx,%xmm5 - vmovdqa IV(%rip),%xmm14 - vmovdqa IV+16(%rip),%xmm15 - jmp .Lblake2s_compress_avx512_mainloop -.align 32 -.Lblake2s_compress_avx512_mainloop: - vmovdqa %xmm0,%xmm10 - vmovdqa %xmm1,%xmm11 - vpaddq %xmm5,%xmm4,%xmm4 - vmovdqa %xmm14,%xmm2 - vpxor %xmm15,%xmm4,%xmm3 - vmovdqu (%rsi),%ymm6 - vmovdqu 0x20(%rsi),%ymm7 - addq $0x40,%rsi - leaq SIGMA2(%rip),%rax - movb $0xa,%cl -.Lblake2s_compress_avx512_roundloop: + vmovdqu (CTX),%xmm0 // Load h[0..3] + vmovdqu 16(CTX),%xmm1 // Load h[4..7] + vmovdqu 32(CTX),%xmm4 // Load t and f + vmovd INC,%xmm5 // Load inc + vmovdqa .Liv(%rip),%xmm14 // Load iv[0..3] + vmovdqa .Liv+16(%rip),%xmm15 // Load iv[4..7] + jmp .Lavx512_mainloop + + .align 32 +.Lavx512_mainloop: + // Main loop: each iteration processes one 64-byte block. + vmovdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3] + vmovdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7] + vpaddq %xmm5,%xmm4,%xmm4 // t += inc (64-bit addition) + vmovdqa %xmm14,%xmm2 // v[8..11] = iv[0..3] + vpxor %xmm15,%xmm4,%xmm3 // v[12..15] = iv[4..7] ^ [t, f] + vmovdqu (DATA),%ymm6 // Load first 8 data words + vmovdqu 32(DATA),%ymm7 // Load second 8 data words + addq $64,DATA + leaq .Lsigma2(%rip),%rax + movb $10,%cl // Set num rounds remaining + +.Lavx512_roundloop: + // Round loop: each iteration does 1 round (of 10 rounds total). vpmovzxbd (%rax),%ymm8 - vpmovzxbd 0x8(%rax),%ymm9 - addq $0x10,%rax + vpmovzxbd 8(%rax),%ymm9 + addq $16,%rax vpermi2d %ymm7,%ymm6,%ymm8 vpermi2d %ymm7,%ymm6,%ymm9 vmovdqa %ymm8,%ymm6 @@ -203,50 +241,51 @@ SYM_FUNC_START(blake2s_compress_avx512) vpaddd %xmm8,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 + vprord $16,%xmm3,%xmm3 vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm8,%xmm8 + vprord $12,%xmm1,%xmm1 + vextracti128 $1,%ymm8,%xmm8 vpaddd %xmm8,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 + vprord $8,%xmm3,%xmm3 vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 + vprord $7,%xmm1,%xmm1 vpshufd $0x93,%xmm0,%xmm0 vpshufd $0x4e,%xmm3,%xmm3 vpshufd $0x39,%xmm2,%xmm2 vpaddd %xmm9,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 + vprord $16,%xmm3,%xmm3 vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm9,%xmm9 + vprord $12,%xmm1,%xmm1 + vextracti128 $1,%ymm9,%xmm9 vpaddd %xmm9,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 + vprord $8,%xmm3,%xmm3 vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 + vprord $7,%xmm1,%xmm1 vpshufd $0x39,%xmm0,%xmm0 vpshufd $0x4e,%xmm3,%xmm3 vpshufd $0x93,%xmm2,%xmm2 decb %cl - jne .Lblake2s_compress_avx512_roundloop - vpxor %xmm10,%xmm0,%xmm0 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm2,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - decq %rdx - jne .Lblake2s_compress_avx512_mainloop - vmovdqu %xmm0,(%rdi) - vmovdqu %xmm1,0x10(%rdi) - vmovdqu %xmm4,0x20(%rdi) + jne .Lavx512_roundloop + + // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15] + vpternlogd $0x96,%xmm10,%xmm2,%xmm0 + vpternlogd $0x96,%xmm11,%xmm3,%xmm1 + decq NBLOCKS + jne .Lavx512_mainloop + + vmovdqu %xmm0,(CTX) // Store new h[0..3] + vmovdqu 
%xmm1,16(CTX) // Store new h[4..7] + vmovq %xmm4,32(CTX) // Store new t (f is unchanged) vzeroupper RET SYM_FUNC_END(blake2s_compress_avx512) diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h index b6d30d2fa045..f8eed6cb042e 100644 --- a/lib/crypto/x86/blake2s.h +++ b/lib/crypto/x86/blake2s.h @@ -11,24 +11,22 @@ #include <linux/kernel.h> #include <linux/sizes.h> -asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); -asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); +asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); -static void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) +static void blake2s_compress(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) { /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(state, block, nblocks, inc); + blake2s_compress_generic(ctx, data, nblocks, inc); return; } @@ -38,13 +36,13 @@ static void blake2s_compress(struct blake2s_state *state, const u8 *block, kernel_fpu_begin(); if (static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(state, block, blocks, inc); + blake2s_compress_avx512(ctx, data, blocks, inc); else - blake2s_compress_ssse3(state, block, blocks, inc); + blake2s_compress_ssse3(ctx, data, blocks, inc); kernel_fpu_end(); + data += blocks * BLAKE2S_BLOCK_SIZE; nblocks -= blocks; - block += blocks * BLAKE2S_BLOCK_SIZE; } while (nblocks); } diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/lib/crypto/x86/polyval-pclmul-avx.S index a6ebe4e7dd2b..7f739465ad35 100644 --- a/arch/x86/crypto/polyval-clmulni_asm.S +++ b/lib/crypto/x86/polyval-pclmul-avx.S @@ -36,10 +36,10 @@ #define MI %xmm14 #define SUM %xmm15 -#define KEY_POWERS %rdi -#define MSG %rsi -#define BLOCKS_LEFT %rdx -#define ACCUMULATOR %rcx +#define ACCUMULATOR %rdi +#define KEY_POWERS %rsi +#define MSG %rdx +#define BLOCKS_LEFT %rcx #define TMP %rax .section .rodata.cst16.gstar, "aM", @progbits, 16 @@ -234,7 +234,7 @@ movups (MSG), %xmm0 pxor SUM, %xmm0 - movaps (KEY_POWERS), %xmm1 + movups (KEY_POWERS), %xmm1 schoolbook1_noload dec BLOCKS_LEFT addq $16, MSG @@ -261,15 +261,12 @@ .endm /* - * Perform montgomery multiplication in GF(2^128) and store result in op1. + * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1. * - * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 - * If op1, op2 are in montgomery form, this computes the montgomery - * form of op1*op2. - * - * void clmul_polyval_mul(u8 *op1, const u8 *op2); + * void polyval_mul_pclmul_avx(struct polyval_elem *a, + * const struct polyval_elem *b); */ -SYM_FUNC_START(clmul_polyval_mul) +SYM_FUNC_START(polyval_mul_pclmul_avx) FRAME_BEGIN vmovdqa .Lgstar(%rip), GSTAR movups (%rdi), %xmm0 @@ -280,22 +277,23 @@ SYM_FUNC_START(clmul_polyval_mul) movups SUM, (%rdi) FRAME_END RET -SYM_FUNC_END(clmul_polyval_mul) +SYM_FUNC_END(polyval_mul_pclmul_avx) /* * Perform polynomial evaluation as specified by POLYVAL. 
This computes: * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} * where n=nblocks, h is the hash key, and m_i are the message blocks. * - * rdi - pointer to precomputed key powers h^8 ... h^1 - * rsi - pointer to message blocks - * rdx - number of blocks to hash - * rcx - pointer to the accumulator + * rdi - pointer to the accumulator + * rsi - pointer to precomputed key powers h^8 ... h^1 + * rdx - pointer to message blocks + * rcx - number of blocks to hash * - * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, - * const u8 *in, size_t nblocks, u8 *accumulator); + * void polyval_blocks_pclmul_avx(struct polyval_elem *acc, + * const struct polyval_key *key, + * const u8 *data, size_t nblocks); */ -SYM_FUNC_START(clmul_polyval_update) +SYM_FUNC_START(polyval_blocks_pclmul_avx) FRAME_BEGIN vmovdqa .Lgstar(%rip), GSTAR movups (ACCUMULATOR), SUM @@ -318,4 +316,4 @@ SYM_FUNC_START(clmul_polyval_update) movups SUM, (ACCUMULATOR) FRAME_END RET -SYM_FUNC_END(clmul_polyval_update) +SYM_FUNC_END(polyval_blocks_pclmul_avx) diff --git a/lib/crypto/x86/polyval.h b/lib/crypto/x86/polyval.h new file mode 100644 index 000000000000..ef8797521420 --- /dev/null +++ b/lib/crypto/x86/polyval.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * POLYVAL library functions, x86_64 optimized + * + * Copyright 2025 Google LLC + */ +#include <asm/fpu/api.h> +#include <linux/cpufeature.h> + +#define NUM_H_POWERS 8 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx); + +asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a, + const struct polyval_elem *b); +asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc, + const struct polyval_key *key, + const u8 *data, size_t nblocks); + +static void polyval_preparekey_arch(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS); + memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE); + if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) { + kernel_fpu_begin(); + for (int i = NUM_H_POWERS - 2; i >= 0; i--) { + key->h_powers[i] = key->h_powers[i + 1]; + polyval_mul_pclmul_avx( + &key->h_powers[i], + &key->h_powers[NUM_H_POWERS - 1]); + } + kernel_fpu_end(); + } else { + for (int i = NUM_H_POWERS - 2; i >= 0; i--) { + key->h_powers[i] = key->h_powers[i + 1]; + polyval_mul_generic(&key->h_powers[i], + &key->h_powers[NUM_H_POWERS - 1]); + } + } +} + +static void polyval_mul_arch(struct polyval_elem *acc, + const struct polyval_key *key) +{ + if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) { + kernel_fpu_begin(); + polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]); + kernel_fpu_end(); + } else { + polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]); + } +} + +static void polyval_blocks_arch(struct polyval_elem *acc, + const struct polyval_key *key, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) { + do { + /* Allow rescheduling every 4 KiB. 
*/ + size_t n = min_t(size_t, nblocks, + 4096 / POLYVAL_BLOCK_SIZE); + + kernel_fpu_begin(); + polyval_blocks_pclmul_avx(acc, key, data, n); + kernel_fpu_end(); + data += n * POLYVAL_BLOCK_SIZE; + nblocks -= n; + } while (nblocks); + } else { + polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1], + data, nblocks); + } +} + +#define polyval_mod_init_arch polyval_mod_init_arch +static void polyval_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) && + boot_cpu_has(X86_FEATURE_AVX)) + static_branch_enable(&have_pclmul_avx); +} diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c index 0a2e76035ea9..6d9474ce6da9 100644 --- a/lib/raid6/neon.c +++ b/lib/raid6/neon.c @@ -8,10 +8,9 @@ #include <linux/raid/pq.h> #ifdef __KERNEL__ -#include <asm/neon.h> +#include <asm/simd.h> #else -#define kernel_neon_begin() -#define kernel_neon_end() +#define scoped_ksimd() #define cpu_has_neon() (1) #endif @@ -32,10 +31,9 @@ { \ void raid6_neon ## _n ## _gen_syndrome_real(int, \ unsigned long, void**); \ - kernel_neon_begin(); \ - raid6_neon ## _n ## _gen_syndrome_real(disks, \ + scoped_ksimd() \ + raid6_neon ## _n ## _gen_syndrome_real(disks, \ (unsigned long)bytes, ptrs); \ - kernel_neon_end(); \ } \ static void raid6_neon ## _n ## _xor_syndrome(int disks, \ int start, int stop, \ @@ -43,10 +41,9 @@ { \ void raid6_neon ## _n ## _xor_syndrome_real(int, \ int, int, unsigned long, void**); \ - kernel_neon_begin(); \ - raid6_neon ## _n ## _xor_syndrome_real(disks, \ - start, stop, (unsigned long)bytes, ptrs); \ - kernel_neon_end(); \ + scoped_ksimd() \ + raid6_neon ## _n ## _xor_syndrome_real(disks, \ + start, stop, (unsigned long)bytes, ptrs);\ } \ struct raid6_calls const raid6_neonx ## _n = { \ raid6_neon ## _n ## _gen_syndrome, \ diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c index 70e1404c1512..9d99aeabd31a 100644 --- a/lib/raid6/recov_neon.c +++ b/lib/raid6/recov_neon.c @@ -7,11 +7,10 @@ #include <linux/raid/pq.h> #ifdef __KERNEL__ -#include <asm/neon.h> +#include <asm/simd.h> #include "neon.h" #else -#define kernel_neon_begin() -#define kernel_neon_end() +#define scoped_ksimd() #define cpu_has_neon() (1) #endif @@ -55,9 +54,8 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; - kernel_neon_begin(); - __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); - kernel_neon_end(); + scoped_ksimd() + __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); } static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, @@ -86,9 +84,8 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, /* Now, pick the proper data tables */ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; - kernel_neon_begin(); - __raid6_datap_recov_neon(bytes, p, q, dq, qmul); - kernel_neon_end(); + scoped_ksimd() + __raid6_datap_recov_neon(bytes, p, q, dq, qmul); } const struct raid6_recov_calls raid6_recov_neon = { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6cba1cb14b23..1192e62531cd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1642,17 +1642,30 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +/** + * touch_pmd - Mark page table pmd entry as accessed and dirty (for write) + * @vma: The VMA covering @addr + * @addr: The virtual address + * @pmd: pmd pointer into 
the page table mapping @addr
+ * @write: Whether it's a write access
+ *
+ * Return: whether the pmd entry is changed
+ */
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	       pmd_t *pmd, bool write)
 {
-	pmd_t _pmd;
+	pmd_t entry;
 
-	_pmd = pmd_mkyoung(*pmd);
+	entry = pmd_mkyoung(*pmd);
 	if (write)
-		_pmd = pmd_mkdirty(_pmd);
+		entry = pmd_mkdirty(entry);
 	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-				  pmd, _pmd, write))
+				  pmd, entry, write)) {
 		update_mmu_cache_pmd(vma, addr, pmd);
+		return true;
+	}
+
+	return false;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1842,18 +1855,14 @@ unlock:
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-void huge_pmd_set_accessed(struct vm_fault *vmf)
+bool huge_pmd_set_accessed(struct vm_fault *vmf)
 {
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
-	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
-		goto unlock;
-
-	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
+		return false;
 
-unlock:
-	spin_unlock(vmf->ptl);
+	return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
 }
 
 static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..27ad37a41868 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1402,7 +1402,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
  */
 void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 	       pud_t *pud, bool write);
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	       pmd_t *pmd, bool write);
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index b59ae7ce42eb..aad432e71251 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6134,6 +6134,45 @@ split:
 }
 
 /*
+ * Page faults may be spurious because of racy access to the page
+ * table. For example, if a non-populated virtual page is accessed on
+ * 2 CPUs simultaneously, page faults are triggered on both CPUs.
+ * However, it's possible that one CPU (say CPU A) cannot find the
+ * reason for the page fault if the other CPU (say CPU B) has changed
+ * the page table before the PTE is checked on CPU A. Most of the
+ * time, spurious page faults can be ignored safely. However, if the
+ * page fault is for a write access, it's possible that a stale
+ * read-only TLB entry exists in the local CPU and needs to be
+ * flushed on some architectures. This is called spurious page fault
+ * fixing.
+ *
+ * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
+ * by default and used as such on most architectures, while
+ * flush_tlb_fix_spurious_fault_pmd() is defined as a NOP by default
+ * and used as such on most architectures.
+ */
+static void fix_spurious_fault(struct vm_fault *vmf,
+			       enum pgtable_level ptlevel)
+{
+	/* Skip spurious TLB flush for retried page fault */
+	if (vmf->flags & FAULT_FLAG_TRIED)
+		return;
+	/*
+	 * This is needed only for protection faults but the arch code
+	 * is not yet telling us if this is a protection fault or not.
+	 * This still avoids useless tlb flushes for .text page faults
+	 * with threads.
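+	 * The same reasoning applies at PMD level, where
+	 * flush_tlb_fix_spurious_fault_pmd() is used instead.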
+ */ + if (vmf->flags & FAULT_FLAG_WRITE) { + if (ptlevel == PGTABLE_LEVEL_PTE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, + vmf->pte); + else + flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address, + vmf->pmd); + } +} +/* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. @@ -6214,23 +6253,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, - vmf->flags & FAULT_FLAG_WRITE)) { + vmf->flags & FAULT_FLAG_WRITE)) update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); - } else { - /* Skip spurious TLB flush for retried page fault */ - if (vmf->flags & FAULT_FLAG_TRIED) - goto unlock; - /* - * This is needed only for protection faults but the arch code - * is not yet telling us if this is a protection fault or not. - * This still avoids useless tlb flushes for .text page faults - * with threads. - */ - if (vmf->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, - vmf->pte); - } + else + fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -6327,7 +6354,10 @@ retry_pud: if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&vmf); + vmf.ptl = pmd_lock(mm, vmf.pmd); + if (!huge_pmd_set_accessed(&vmf)) + fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD); + spin_unlock(vmf.ptl); return 0; } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0be83039c3b5..238a6712738e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1088,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible) + struct zone *zone) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1097,15 +1097,6 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; - /* - * Memory block is accessible at this stage and hence poison the struct - * pages now. If the memory block is accessible during memory hotplug - * addition phase, then page poisining is already performed in - * sparse_add_section(). 
- */ - if (mhp_off_inaccessible) - page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE, false); @@ -1444,7 +1435,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size, mhp_t mhp_flags) + u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1460,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); - if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) - mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1555,7 +1544,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory()) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); + ret = create_altmaps_and_memory_blocks(nid, group, start, size); if (ret) goto error; } else { diff --git a/mm/sparse.c b/mm/sparse.c index 17c50a6415c2..b5b2b6f7041b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -951,8 +951,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - if (!altmap || !altmap->inaccessible) - page_init_poison(memmap, sizeof(struct page) * nr_pages); + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); diff --git a/rust/kernel/acpi.rs b/rust/kernel/acpi.rs index 37e1161c1298..9b8efa623130 100644 --- a/rust/kernel/acpi.rs +++ b/rust/kernel/acpi.rs @@ -39,9 +39,7 @@ impl DeviceId { pub const fn new(id: &'static CStr) -> Self { let src = id.to_bytes_with_nul(); build_assert!(src.len() <= Self::ACPI_ID_LEN, "ID exceeds 16 bytes"); - // Replace with `bindings::acpi_device_id::default()` once stabilized for `const`. - // SAFETY: FFI type is valid to be zero-initialized. - let mut acpi: bindings::acpi_device_id = unsafe { core::mem::zeroed() }; + let mut acpi: bindings::acpi_device_id = pin_init::zeroed(); let mut i = 0; while i < src.len() { acpi.id[i] = src[i]; diff --git a/rust/kernel/opp.rs b/rust/kernel/opp.rs index 2c763fa9276d..f9641c639fff 100644 --- a/rust/kernel/opp.rs +++ b/rust/kernel/opp.rs @@ -87,7 +87,7 @@ use core::{marker::PhantomData, ptr}; use macros::vtable; -/// Creates a null-terminated slice of pointers to [`Cstring`]s. +/// Creates a null-terminated slice of pointers to [`CString`]s. fn to_c_str_array(names: &[CString]) -> Result<KVec<*const u8>> { // Allocated a null-terminated vector of pointers. let mut list = KVec::with_capacity(names.len() + 1, GFP_KERNEL)?; @@ -443,66 +443,70 @@ impl<T: ConfigOps + Default> Config<T> { /// /// The returned [`ConfigToken`] will remove the configuration when dropped. 
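The mm/memory.c comment above says that flush_tlb_fix_spurious_fault() is flush_tlb_page() by default while the PMD variant is a no-op by default. A minimal sketch of those generic fallbacks, following the usual override-by-#define convention of include/linux/pgtable.h (the exact argument lists are an assumption here):

#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address, ptep) \
	flush_tlb_page(vma, address)
#endif

#ifndef flush_tlb_fix_spurious_fault_pmd
/* Architectures that do not need a PMD-level fixup get a no-op. */
#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \
	do { } while (0)
#endif

An architecture that needs a different fixup simply defines either macro before the generic header is pulled in.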
pub fn set(self, dev: &Device) -> Result<ConfigToken> { - let (_clk_list, clk_names) = match &self.clk_names { - Some(x) => { - let list = to_c_str_array(x)?; - let ptr = list.as_ptr(); - (Some(list), ptr) - } - None => (None, ptr::null()), - }; + let clk_names = self.clk_names.as_deref().map(to_c_str_array).transpose()?; + let regulator_names = self + .regulator_names + .as_deref() + .map(to_c_str_array) + .transpose()?; + + let set_config = || { + let clk_names = clk_names.as_ref().map_or(ptr::null(), |c| c.as_ptr()); + let regulator_names = regulator_names.as_ref().map_or(ptr::null(), |c| c.as_ptr()); + + let prop_name = self + .prop_name + .as_ref() + .map_or(ptr::null(), |p| p.as_char_ptr()); + + let (supported_hw, supported_hw_count) = self + .supported_hw + .as_ref() + .map_or((ptr::null(), 0), |hw| (hw.as_ptr(), hw.len() as u32)); + + let (required_dev, required_dev_index) = self + .required_dev + .as_ref() + .map_or((ptr::null_mut(), 0), |(dev, idx)| (dev.as_raw(), *idx)); + + let mut config = bindings::dev_pm_opp_config { + clk_names, + config_clks: if T::HAS_CONFIG_CLKS { + Some(Self::config_clks) + } else { + None + }, + prop_name, + regulator_names, + config_regulators: if T::HAS_CONFIG_REGULATORS { + Some(Self::config_regulators) + } else { + None + }, + supported_hw, + supported_hw_count, - let (_regulator_list, regulator_names) = match &self.regulator_names { - Some(x) => { - let list = to_c_str_array(x)?; - let ptr = list.as_ptr(); - (Some(list), ptr) - } - None => (None, ptr::null()), - }; + required_dev, + required_dev_index, + }; - let prop_name = self - .prop_name - .as_ref() - .map_or(ptr::null(), |p| p.as_char_ptr()); - - let (supported_hw, supported_hw_count) = self - .supported_hw - .as_ref() - .map_or((ptr::null(), 0), |hw| (hw.as_ptr(), hw.len() as u32)); - - let (required_dev, required_dev_index) = self - .required_dev - .as_ref() - .map_or((ptr::null_mut(), 0), |(dev, idx)| (dev.as_raw(), *idx)); - - let mut config = bindings::dev_pm_opp_config { - clk_names, - config_clks: if T::HAS_CONFIG_CLKS { - Some(Self::config_clks) - } else { - None - }, - prop_name, - regulator_names, - config_regulators: if T::HAS_CONFIG_REGULATORS { - Some(Self::config_regulators) - } else { - None - }, - supported_hw, - supported_hw_count, + // SAFETY: The requirements are satisfied by the existence of [`Device`] and its safety + // requirements. The OPP core guarantees not to access fields of [`Config`] after this + // call and so we don't need to save a copy of them for future use. + let ret = unsafe { bindings::dev_pm_opp_set_config(dev.as_raw(), &mut config) }; - required_dev, - required_dev_index, + to_result(ret).map(|()| ConfigToken(ret)) }; - // SAFETY: The requirements are satisfied by the existence of [`Device`] and its safety - // requirements. The OPP core guarantees not to access fields of [`Config`] after this call - // and so we don't need to save a copy of them for future use. - let ret = unsafe { bindings::dev_pm_opp_set_config(dev.as_raw(), &mut config) }; + // Ensure the closure does not accidentally drop owned data; if violated, the compiler + // produces E0525 with e.g.: + // + // ``` + // closure is `FnOnce` because it moves the variable `clk_names` out of its environment + // ``` + let _: &dyn Fn() -> _ = &set_config; - to_result(ret).map(|()| ConfigToken(ret)) + set_config() } /// Config's clk callback. 
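The reworked Config::set() above stages the temporary name arrays first, then builds a bindings::dev_pm_opp_config and makes a single dev_pm_opp_set_config() call, keeping the temporaries alive across it. For comparison, a minimal C user of the same interface might look like the sketch below (the "core" clock name is a made-up example; the struct fields and the token-returning call are the ones used above, and dev_pm_opp_clear_config() mirrors what ConfigToken does when dropped):

#include <linux/pm_opp.h>

static int example_opp_config(struct device *dev)
{
	/* NULL-terminated, as built by to_c_str_array() above */
	static const char * const clk_names[] = { "core", NULL };
	struct dev_pm_opp_config config = {
		.clk_names = clk_names,
	};
	int token;

	token = dev_pm_opp_set_config(dev, &config);
	if (token < 0)
		return token;	/* negative errno on failure */

	/* ... perform OPP transitions ... */

	dev_pm_opp_clear_config(token);
	return 0;
}

The `let _: &dyn Fn() -> _` assertion in the Rust version has no C analogue; it only pins down that the closure borrows, rather than consumes, the buffers whose pointers were handed to C.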
diff --git a/scripts/crypto/gen-fips-testvecs.py b/scripts/crypto/gen-fips-testvecs.py new file mode 100755 index 000000000000..db873f88619a --- /dev/null +++ b/scripts/crypto/gen-fips-testvecs.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Script that generates lib/crypto/fips.h +# +# Copyright 2025 Google LLC + +import hashlib +import hmac + +fips_test_data = b"fips test data\0\0" +fips_test_key = b"fips test key\0\0\0" + +def print_static_u8_array_definition(name, value): + print('') + print(f'static const u8 {name}[] __initconst __maybe_unused = {{') + for i in range(0, len(value), 8): + line = '\t' + ''.join(f'0x{b:02x}, ' for b in value[i:i+8]) + print(f'{line.rstrip()}') + print('};') + +print('/* SPDX-License-Identifier: GPL-2.0-or-later */') +print(f'/* This file was generated by: gen-fips-testvecs.py */') +print() +print('#include <linux/fips.h>') + +print_static_u8_array_definition("fips_test_data", fips_test_data) +print_static_u8_array_definition("fips_test_key", fips_test_key) + +for alg in 'sha1', 'sha256', 'sha512': + ctx = hmac.new(fips_test_key, digestmod=alg) + ctx.update(fips_test_data) + print_static_u8_array_definition(f'fips_test_hmac_{alg}_value', ctx.digest()) + +print_static_u8_array_definition(f'fips_test_sha3_256_value', + hashlib.sha3_256(fips_test_data).digest()) diff --git a/scripts/crypto/gen-hash-testvecs.py b/scripts/crypto/gen-hash-testvecs.py index fc063f2ee95f..c1d0517140bd 100755 --- a/scripts/crypto/gen-hash-testvecs.py +++ b/scripts/crypto/gen-hash-testvecs.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0-or-later # -# Script that generates test vectors for the given cryptographic hash function. +# Script that generates test vectors for the given hash function. # # Copyright 2025 Google LLC @@ -50,11 +50,42 @@ class Poly1305: m = (self.h + self.s) % 2**128 return m.to_bytes(16, byteorder='little') +POLYVAL_POLY = sum((1 << i) for i in [128, 127, 126, 121, 0]) +POLYVAL_BLOCK_SIZE = 16 + +# A straightforward, unoptimized implementation of POLYVAL. +# Reference: https://datatracker.ietf.org/doc/html/rfc8452 +class Polyval: + def __init__(self, key): + assert len(key) == 16 + self.h = int.from_bytes(key, byteorder='little') + self.acc = 0 + + # Note: this supports partial blocks only at the end. + def update(self, data): + for i in range(0, len(data), 16): + # acc += block + self.acc ^= int.from_bytes(data[i:i+16], byteorder='little') + # acc = (acc * h * x^-128) mod POLYVAL_POLY + product = 0 + for j in range(128): + if (self.h & (1 << j)) != 0: + product ^= self.acc << j + if (product & (1 << j)) != 0: + product ^= POLYVAL_POLY << j + self.acc = product >> 128 + return self + + def digest(self): + return self.acc.to_bytes(16, byteorder='little') + def hash_init(alg): if alg == 'poly1305': # Use a fixed random key here, to present Poly1305 as an unkeyed hash. # This allows all the test cases for unkeyed hashes to work on Poly1305. 
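One aside before the gen-hash-testvecs.py changes continue below: since fips_test_data in gen-fips-testvecs.py is the fixed 16-byte string "fips test data\0\0", the first array the new script emits is fully determined by inspection (the HMAC and SHA3-256 arrays that follow depend on the computed digests and are not reproduced here):

static const u8 fips_test_data[] __initconst __maybe_unused = {
	0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
	0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00,
};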
return Poly1305(rand_bytes(POLY1305_KEY_SIZE)) + if alg == 'polyval': + return Polyval(rand_bytes(POLYVAL_BLOCK_SIZE)) return hashlib.new(alg) def hash_update(ctx, data): @@ -85,9 +116,9 @@ def print_c_struct_u8_array_field(name, value): print('\t\t},') def alg_digest_size_const(alg): - if alg == 'blake2s': - return 'BLAKE2S_HASH_SIZE' - return f'{alg.upper()}_DIGEST_SIZE' + if alg.startswith('blake2'): + return f'{alg.upper()}_HASH_SIZE' + return f'{alg.upper().replace('-', '_')}_DIGEST_SIZE' def gen_unkeyed_testvecs(alg): print('') @@ -111,6 +142,18 @@ def gen_unkeyed_testvecs(alg): f'hash_testvec_consolidated[{alg_digest_size_const(alg)}]', hash_final(ctx)) +def gen_additional_sha3_testvecs(): + max_len = 4096 + in_data = rand_bytes(max_len) + for alg in ['shake128', 'shake256']: + ctx = hashlib.new('sha3-256') + for in_len in range(max_len + 1): + out_len = (in_len * 293) % (max_len + 1) + out = hashlib.new(alg, data=in_data[:in_len]).digest(out_len) + ctx.update(out) + print_static_u8_array_definition(f'{alg}_testvec_consolidated[SHA3_256_DIGEST_SIZE]', + ctx.digest()) + def gen_hmac_testvecs(alg): ctx = hmac.new(rand_bytes(32), digestmod=alg) data = rand_bytes(4096) @@ -124,19 +167,22 @@ def gen_hmac_testvecs(alg): f'hmac_testvec_consolidated[{alg.upper()}_DIGEST_SIZE]', ctx.digest()) -BLAKE2S_KEY_SIZE = 32 -BLAKE2S_HASH_SIZE = 32 - -def gen_additional_blake2s_testvecs(): +def gen_additional_blake2_testvecs(alg): + if alg == 'blake2s': + (max_key_size, max_hash_size) = (32, 32) + elif alg == 'blake2b': + (max_key_size, max_hash_size) = (64, 64) + else: + raise ValueError(f'Unsupported alg: {alg}') hashes = b'' - for key_len in range(BLAKE2S_KEY_SIZE + 1): - for out_len in range(1, BLAKE2S_HASH_SIZE + 1): - h = hashlib.blake2s(digest_size=out_len, key=rand_bytes(key_len)) + for key_len in range(max_key_size + 1): + for out_len in range(1, max_hash_size + 1): + h = hashlib.new(alg, digest_size=out_len, key=rand_bytes(key_len)) h.update(rand_bytes(100)) hashes += h.digest() print_static_u8_array_definition( - 'blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE]', - compute_hash('blake2s', hashes)) + f'{alg}_keyed_testvec_consolidated[{alg_digest_size_const(alg)}]', + compute_hash(alg, hashes)) def gen_additional_poly1305_testvecs(): key = b'\xff' * POLY1305_KEY_SIZE @@ -150,19 +196,40 @@ def gen_additional_poly1305_testvecs(): 'poly1305_allones_macofmacs[POLY1305_DIGEST_SIZE]', Poly1305(key).update(data).digest()) +def gen_additional_polyval_testvecs(): + key = b'\xff' * POLYVAL_BLOCK_SIZE + hashes = b'' + for data_len in range(0, 4097, 16): + hashes += Polyval(key).update(b'\xff' * data_len).digest() + print_static_u8_array_definition( + 'polyval_allones_hashofhashes[POLYVAL_DIGEST_SIZE]', + Polyval(key).update(hashes).digest()) + if len(sys.argv) != 2: sys.stderr.write('Usage: gen-hash-testvecs.py ALGORITHM\n') - sys.stderr.write('ALGORITHM may be any supported by Python hashlib, or poly1305.\n') + sys.stderr.write('ALGORITHM may be any supported by Python hashlib; or poly1305, polyval, or sha3.\n') sys.stderr.write('Example: gen-hash-testvecs.py sha512\n') sys.exit(1) alg = sys.argv[1] print('/* SPDX-License-Identifier: GPL-2.0-or-later */') print(f'/* This file was generated by: {sys.argv[0]} {" ".join(sys.argv[1:])} */') -gen_unkeyed_testvecs(alg) -if alg == 'blake2s': - gen_additional_blake2s_testvecs() +if alg.startswith('blake2'): + gen_unkeyed_testvecs(alg) + gen_additional_blake2_testvecs(alg) elif alg == 'poly1305': + gen_unkeyed_testvecs(alg) 
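As an aside on the Polyval class above: its per-block update transcribes almost mechanically into C. A hedged sketch, relying on the GCC/Clang unsigned __int128 extension and mirroring the unoptimized reference rather than any real kernel implementation (acc, h, and block are the little-endian interpretations of the 16-byte values, as in RFC 8452):

typedef unsigned __int128 u128;

/*
 * One POLYVAL block update, as in Polyval.update() above:
 * acc = (acc ^ block) * h * x^-128 mod p, where
 * p = x^128 + x^127 + x^126 + x^121 + 1.
 */
static u128 polyval_step(u128 acc, u128 h, u128 block)
{
	/* p minus its x^128 term; that term is folded in separately */
	const u128 poly_lo = ((u128)1 << 127) | ((u128)1 << 126) |
			     ((u128)1 << 121) | 1;
	u128 lo = 0, hi = 0;	/* 256-bit carryless product, hi:lo */
	int j;

	acc ^= block;
	for (j = 0; j < 128; j++) {
		if ((h >> j) & 1) {		/* product ^= acc << j */
			lo ^= acc << j;
			if (j)
				hi ^= acc >> (128 - j);
		}
		if ((lo >> j) & 1) {		/* product ^= p << j */
			lo ^= poly_lo << j;
			if (j)
				hi ^= poly_lo >> (128 - j);
			hi ^= (u128)1 << j;	/* the x^(128+j) term of p */
		}
	}
	/* All 128 low bits were cleared, so product >> 128 is the result. */
	return hi;
}

Interleaving the reduction with the multiply, exactly as the Python does, keeps the running product's low bits zeroed, so the final shift by 128 implements the x^-128 factor.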
gen_additional_poly1305_testvecs() +elif alg == 'polyval': + gen_unkeyed_testvecs(alg) + gen_additional_polyval_testvecs() +elif alg == 'sha3': + print() + print('/* SHA3-256 test vectors */') + gen_unkeyed_testvecs('sha3-256') + print() + print('/* SHAKE test vectors */') + gen_additional_sha3_testvecs() else: + gen_unkeyed_testvecs(alg) gen_hmac_testvecs(alg) diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 139d5e87dc95..b35d954d50c3 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -245,7 +245,7 @@ #define MIDR_FUJITSU_ERRATUM_010001_MASK (~MIDR_CPU_VAR_REV(1, 0)) #define TCR_CLEAR_FUJITSU_ERRATUM_010001 (TCR_NFD1 | TCR_NFD0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/sysreg.h> @@ -338,6 +338,6 @@ static inline u32 __attribute_const__ read_cpuid_cachetype(void) { return read_cpuid(CTR_EL0); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h index bd592ca81571..bbfbd1497a2f 100644 --- a/tools/arch/arm64/include/asm/esr.h +++ b/tools/arch/arm64/include/asm/esr.h @@ -385,7 +385,7 @@ #define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) #define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/types.h> static inline unsigned long esr_brk_comment(unsigned long esr) @@ -450,6 +450,6 @@ static inline bool esr_iss_is_eretab(unsigned long esr) } const char *esr_get_class_string(unsigned long esr); -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ESR_H */ diff --git a/tools/arch/arm64/include/asm/gpr-num.h b/tools/arch/arm64/include/asm/gpr-num.h index 05da4a7c5788..a114e4f8209b 100644 --- a/tools/arch/arm64/include/asm/gpr-num.h +++ b/tools/arch/arm64/include/asm/gpr-num.h @@ -2,7 +2,7 @@ #ifndef __ASM_GPR_NUM_H #define __ASM_GPR_NUM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 .equ .L__gpr_num_x\num, \num @@ -11,7 +11,7 @@ .equ .L__gpr_num_xzr, 31 .equ .L__gpr_num_wzr, 31 -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __DEFINE_ASM_GPR_NUMS \ " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \ @@ -21,6 +21,6 @@ " .equ .L__gpr_num_xzr, 31\n" \ " .equ .L__gpr_num_wzr, 31\n" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_GPR_NUM_H */ diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 65f2759ea27a..178b7322bf04 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -51,7 +51,7 @@ #ifndef CONFIG_BROKEN_GAS_INST -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ // The space separator is omitted so that __emit_inst(x) can be parsed as // either an assembler directive or an assembler macro argument. 
#define __emit_inst(x) .inst(x) @@ -70,11 +70,11 @@ (((x) >> 24) & 0x000000ff)) #endif /* CONFIG_CPU_BIG_ENDIAN */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __emit_inst(x) .long __INSTR_BSWAP(x) -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __emit_inst(x) ".long " __stringify(__INSTR_BSWAP(x)) "\n\t" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_BROKEN_GAS_INST */ @@ -1078,9 +1078,7 @@ #define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \ GCS_CAP_VALID_TOKEN) -#define ARM64_FEATURE_FIELD_BITS 4 - -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro mrs_s, rt, sreg __emit_inst(0xd5200000|(\sreg)|(.L__gpr_num_\rt)) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index ed5f3892674c..a792a599b9d6 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -31,7 +31,7 @@ #define KVM_SPSR_FIQ 4 #define KVM_NR_SPSR 5 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/psci.h> #include <linux/types.h> #include <asm/ptrace.h> diff --git a/tools/arch/s390/include/uapi/asm/bitsperlong.h b/tools/arch/s390/include/uapi/asm/bitsperlong.h index d2bb620119bf..a226a1686a53 100644 --- a/tools/arch/s390/include/uapi/asm/bitsperlong.h +++ b/tools/arch/s390/include/uapi/asm/bitsperlong.h @@ -2,11 +2,7 @@ #ifndef __ASM_S390_BITSPERLONG_H #define __ASM_S390_BITSPERLONG_H -#ifndef __s390x__ -#define __BITS_PER_LONG 32 -#else #define __BITS_PER_LONG 64 -#endif #include <asm-generic/bitsperlong.h> diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 245cf6b3ec57..ccc01ad6ff7c 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -407,9 +407,12 @@ #define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ #define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ -/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ +/* + * Linux-defined word for use with scattered/synthetic bits. 
+ */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ + #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index df4c3cc713ac..0a39bee261b9 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -143,13 +143,8 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( -#ifdef __s390x__ "lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ "aghi %r15, -160\n" /* allocate new stackframe */ -#else - "lr %r2, %r15\n" - "ahi %r15, -96\n" -#endif "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index 426c89198135..ef4743aad188 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -27,7 +27,7 @@ #include "arch-powerpc.h" #elif defined(__riscv) #include "arch-riscv.h" -#elif defined(__s390x__) || defined(__s390__) +#elif defined(__s390x__) #include "arch-s390.h" #elif defined(__loongarch__) #include "arch-loongarch.h" diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index dd3b2f57082d..85abc357da31 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11325,8 +11325,6 @@ static const char *arch_specific_syscall_pfx(void) return "ia32"; #elif defined(__s390x__) return "s390x"; -#elif defined(__s390__) - return "s390"; #elif defined(__arm__) return "arm"; #elif defined(__aarch64__) @@ -12113,8 +12111,6 @@ static const char *arch_specific_lib_paths(void) return "/lib/i386-linux-gnu"; #elif defined(__s390x__) return "/lib/s390x-linux-gnu"; -#elif defined(__s390__) - return "/lib/s390-linux-gnu"; #elif defined(__arm__) && defined(__SOFTFP__) return "/lib/arm-linux-gnueabi"; #elif defined(__arm__) && !defined(__SOFTFP__) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index c174b4086673..d1524f6f54ae 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -1376,8 +1376,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec #elif defined(__s390x__) -/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. 
*/ - static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { unsigned int reg; diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c index 44a9ecbd91e8..4d9b0177c312 100644 --- a/tools/power/acpi/tools/pfrut/pfrut.c +++ b/tools/power/acpi/tools/pfrut/pfrut.c @@ -222,6 +222,7 @@ int main(int argc, char *argv[]) fd_update_log = open("/dev/acpi_pfr_telemetry0", O_RDWR); if (fd_update_log < 0) { printf("PFRT device not supported - Quit...\n"); + close(fd_update); return 1; } @@ -265,7 +266,8 @@ int main(int argc, char *argv[]) printf("chunk2_size:%d\n", data_info.chunk2_size); printf("rollover_cnt:%d\n", data_info.rollover_cnt); printf("reset_cnt:%d\n", data_info.reset_cnt); - + close(fd_update); + close(fd_update_log); return 0; } @@ -358,6 +360,7 @@ int main(int argc, char *argv[]) if (ret == -1) { perror("Failed to load capsule file"); + munmap(addr_map_capsule, st.st_size); close(fd_capsule); close(fd_update); close(fd_update_log); @@ -420,7 +423,7 @@ int main(int argc, char *argv[]) if (p_mmap == MAP_FAILED) { perror("mmap error."); close(fd_update_log); - + free(log_buf); return 1; } diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index c43db1c41205..a1df9196dc45 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -37,9 +37,7 @@ NLS ?= true # cpufreq-bench benchmarking tool CPUFREQ_BENCH ?= true -# Do not build libraries, but build the code in statically -# Libraries are still built, otherwise the Makefile code would -# be rather ugly. +# Build the code, including libraries, statically. export STATIC ?= false # Prefix to the directories we're installing to @@ -207,14 +205,25 @@ $(OUTPUT)lib/%.o: $(LIB_SRC) $(LIB_HEADERS) $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -fPIC -o $@ -c lib/$*.c -$(OUTPUT)libcpupower.so.$(LIB_VER): $(LIB_OBJS) +ifeq ($(strip $(STATIC)),true) +LIBCPUPOWER := libcpupower.a +else +LIBCPUPOWER := libcpupower.so.$(LIB_VER) +endif + +$(OUTPUT)$(LIBCPUPOWER): $(LIB_OBJS) +ifeq ($(strip $(STATIC)),true) + $(ECHO) " AR " $@ + $(QUIET) $(AR) rcs $@ $(LIB_OBJS) +else $(ECHO) " LD " $@ $(QUIET) $(CC) -shared $(CFLAGS) $(LDFLAGS) -o $@ \ -Wl,-soname,libcpupower.so.$(LIB_MAJ) $(LIB_OBJS) @ln -sf $(@F) $(OUTPUT)libcpupower.so @ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MAJ) +endif -libcpupower: $(OUTPUT)libcpupower.so.$(LIB_VER) +libcpupower: $(OUTPUT)$(LIBCPUPOWER) # Let all .o files depend on its .c file and all headers # Might be worth to put this into utils/Makefile at some point of time @@ -224,7 +233,7 @@ $(OUTPUT)%.o: %.c $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -I./lib -I ./utils -o $@ -c $*.c -$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_VER) +$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)$(LIBCPUPOWER) $(ECHO) " CC " $@ ifeq ($(strip $(STATIC)),true) $(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lrt -lpci -L$(OUTPUT) -o $@ @@ -269,7 +278,7 @@ update-po: $(OUTPUT)po/$(PACKAGE).pot done; endif -compile-bench: $(OUTPUT)libcpupower.so.$(LIB_VER) +compile-bench: $(OUTPUT)$(LIBCPUPOWER) @V=$(V) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) # we compile into subdirectories. 
if the target directory is not the @@ -287,6 +296,7 @@ clean: -find $(OUTPUT) \( -not -type d \) -and \( -name '*~' -o -name '*.[oas]' \) -type f -print \ | xargs rm -f -rm -f $(OUTPUT)cpupower + -rm -f $(OUTPUT)libcpupower.a -rm -f $(OUTPUT)libcpupower.so* -rm -rf $(OUTPUT)po/*.gmo -rm -rf $(OUTPUT)po/*.pot @@ -295,7 +305,11 @@ clean: install-lib: libcpupower $(INSTALL) -d $(DESTDIR)${libdir} +ifeq ($(strip $(STATIC)),true) + $(CP) $(OUTPUT)libcpupower.a $(DESTDIR)${libdir}/ +else $(CP) $(OUTPUT)libcpupower.so* $(DESTDIR)${libdir}/ +endif $(INSTALL) -d $(DESTDIR)${includedir} $(INSTALL_DATA) lib/cpufreq.h $(DESTDIR)${includedir}/cpufreq.h $(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h @@ -336,11 +350,7 @@ install-bench: compile-bench @#DESTDIR must be set from outside to survive @sbindir=$(sbindir) bindir=$(bindir) docdir=$(docdir) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) install -ifeq ($(strip $(STATIC)),true) -install: all install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH) -else install: all install-lib install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH) -endif uninstall: - rm -f $(DESTDIR)${libdir}/libcpupower.* diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index a85c19e9524e..0114108ab25f 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1071,7 +1071,7 @@ static bool sve_write_supported(struct test_config *config) static bool sve_write_fpsimd_supported(struct test_config *config) { - if (!sve_supported()) + if (!sve_supported() && !sme_supported()) return false; if ((config->svcr_in & SVCR_ZA) != (config->svcr_expected & SVCR_ZA)) @@ -1231,9 +1231,6 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config) vl = vl_expected(config); vq = __sve_vq_from_vl(vl); - if (!vl) - return; - iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD); iov.iov_base = malloc(iov.iov_len); if (!iov.iov_base) { diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index e0fc3a001e28..f44d44618575 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -394,6 +394,58 @@ out: free(svebuf); } +/* Write the FPSIMD registers via the SVE regset when SVE is not supported */ +static void ptrace_sve_fpsimd_no_sve(pid_t child) +{ + void *svebuf; + struct user_sve_header *sve; + struct user_fpsimd_state *fpsimd, new_fpsimd; + unsigned int i, j; + unsigned char *p; + int ret; + + svebuf = malloc(SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD)); + if (!svebuf) { + ksft_test_result_fail("Failed to allocate FPSIMD buffer\n"); + return; + } + + /* On a system without SVE the VL should be set to 0 */ + memset(svebuf, 0, SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD)); + sve = svebuf; + sve->flags = SVE_PT_REGS_FPSIMD; + sve->size = SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD); + sve->vl = 0; + + /* Try to set a known FPSIMD state via PT_REGS_SVE */ + fpsimd = (struct user_fpsimd_state *)((char *)sve + + SVE_PT_FPSIMD_OFFSET); + for (i = 0; i < 32; ++i) { + p = (unsigned char *)&fpsimd->vregs[i]; + + for (j = 0; j < sizeof(fpsimd->vregs[i]); ++j) + p[j] = j; + } + + ret = set_sve(child, &vec_types[0], sve); + ksft_test_result(ret == 0, "FPSIMD write via SVE\n"); + if (ret) { + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + goto out; + } + + /* Verify via the FPSIMD regset */ + if (get_fpsimd(child, &new_fpsimd)) { + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); 
+ goto out; + } + ksft_test_result(memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0, + "Verify FPSIMD write via SVE\n"); + +out: + free(svebuf); +} + /* Validate attempting to set SVE data and read SVE data */ static void ptrace_set_sve_get_sve_data(pid_t child, const struct vec_type *type, @@ -826,6 +878,15 @@ static int do_parent(pid_t child) } } + /* We support SVE writes of FPSMID format on SME only systems */ + if (!(getauxval(AT_HWCAP) & HWCAP_SVE) && + (getauxval(AT_HWCAP2) & HWCAP2_SME)) { + ptrace_sve_fpsimd_no_sve(child); + } else { + ksft_test_result_skip("FPSIMD write via SVE\n"); + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + } + ret = EXIT_SUCCESS; error: diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index 38080f3c3280..a8df05771670 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -276,7 +276,7 @@ function barf bl putdec puts ", iteration=" mov x0, x22 - bl putdec + bl putdecn puts "\tExpected [" mov x0, x10 mov x1, x12 diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 5e24f77868b5..c4815d365816 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -268,7 +268,9 @@ static void guest_code(void) /* Return a safe value to a given ftr_bits an ftr value */ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift; + + TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features"); if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { @@ -320,7 +322,9 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) /* Return an invalid value to a given ftr_bits an ftr value */ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift; + + TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features"); if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { @@ -672,7 +676,7 @@ static void test_clidr(struct kvm_vcpu *vcpu) clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1)); /* find the first empty level in the cache hierarchy */ - for (level = 1; level < 7; level++) { + for (level = 1; level <= 7; level++) { if (!CLIDR_CTYPE(clidr, level)) break; } diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index 330e000baeb1..9416ae952e18 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -87,7 +87,6 @@ IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32 = arch/riscv/boot/Image IMAGE_riscv64 = arch/riscv/boot/Image IMAGE_s390x = arch/s390/boot/bzImage -IMAGE_s390 = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE_sparc32 = arch/sparc/boot/image IMAGE_sparc64 = arch/sparc/boot/image @@ -117,7 +116,6 @@ DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32 = rv32_defconfig DEFCONFIG_riscv64 = defconfig DEFCONFIG_s390x = defconfig -DEFCONFIG_s390 = defconfig compat.config DEFCONFIG_loongarch = defconfig DEFCONFIG_sparc32 = sparc32_defconfig DEFCONFIG_sparc64 = sparc64_defconfig @@ -156,7 +154,6 @@ QEMU_ARCH_riscv = riscv64 QEMU_ARCH_riscv32 = riscv32 
QEMU_ARCH_riscv64 = riscv64 QEMU_ARCH_s390x = s390x -QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH_sparc32 = sparc QEMU_ARCH_sparc64 = sparc64 @@ -197,7 +194,6 @@ QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_T QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sparc32 = -M SS-5 -m 256M -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sparc64 = -M sun4u -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -223,7 +219,6 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390x = -m64 -CFLAGS_s390 = -m31 CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_mipsn32le = -EL -mabi=n32 -fPIC -march=mips64r2 diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index e8af1fb505cf..210abe715ed9 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -23,7 +23,7 @@ all_archs=( mips32le mips32be mipsn32le mipsn32be mips64le mips64be ppc ppc64 ppc64le riscv32 riscv64 - s390x s390 + s390x loongarch sparc32 sparc64 m68k @@ -185,10 +185,6 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" - if [ "$arch" = "s390" ] && ([ "$llvm" = "1" ] || [ "$test_mode" = "user" ]); then - echo "Unsupported configuration" - return - fi if [ "$arch" = "m68k" -o "$arch" = "sh4" ] && [ "$llvm" = "1" ]; then echo "Unsupported configuration" return diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h index 33baaa9f9997..e7b858cd3736 100644 --- a/tools/testing/selftests/rseq/rseq-s390.h +++ b/tools/testing/selftests/rseq/rseq-s390.h @@ -28,8 +28,6 @@ do { \ RSEQ_WRITE_ONCE(*(p), v); \ } while (0) -#ifdef __s390x__ - #define LONG_L "lg" #define LONG_S "stg" #define LONG_LT_R "ltgr" @@ -63,43 +61,6 @@ do { \ ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \ ".popsection\n\t" -#elif __s390__ - -#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \ - start_ip, post_commit_offset, abort_ip) \ - ".pushsection __rseq_cs, \"aw\"\n\t" \ - ".balign 32\n\t" \ - __rseq_str(label) ":\n\t" \ - ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \ - ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \ - ".popsection\n\t" \ - ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \ - ".long 0x0, " __rseq_str(label) "b\n\t" \ - ".popsection\n\t" - -/* - * Exit points of a rseq critical section consist of all instructions outside - * of the critical section where a critical section can either branch to or - * reach through the normal course of its execution. The abort IP and the - * post-commit IP are already part of the __rseq_cs section and should not be - * explicitly defined as additional exit points. Knowing all exit points is - * useful to assist debuggers stepping over the critical section. 
- */ -#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \ - ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \ - ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \ - ".popsection\n\t" - -#define LONG_L "l" -#define LONG_S "st" -#define LONG_LT_R "ltr" -#define LONG_CMP "c" -#define LONG_CMP_R "cr" -#define LONG_ADDI "ahi" -#define LONG_ADD_R "ar" - -#endif - #define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \ (post_commit_ip - start_ip), abort_ip) diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 5fdd0f362337..50c261005111 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -25,10 +25,6 @@ #define VDSO_VERSION 1 #define VDSO_NAMES 0 #define VDSO_32BIT 1 -#elif defined (__s390__) && !defined(__s390x__) -#define VDSO_VERSION 2 -#define VDSO_NAMES 0 -#define VDSO_32BIT 1 #elif defined (__s390x__) #define VDSO_VERSION 2 #define VDSO_NAMES 0 diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 05e1e6774fba..918eaec8bfbe 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -308,12 +308,13 @@ static void test_getcpu(int cpu) #ifdef __x86_64__ static jmp_buf jmpbuf; -static volatile unsigned long segv_err; +static volatile unsigned long segv_err, segv_trapno; static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t *)ctx_void; + segv_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; segv_err = ctx->uc_mcontext.gregs[REG_ERR]; siglongjmp(jmpbuf, 1); } @@ -336,7 +337,8 @@ static void test_vsys_r(void) else if (can_read) ksft_test_result_pass("We have read access\n"); else - ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err); + ksft_test_result_pass("We do not have read access (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); } static void test_vsys_x(void) @@ -347,7 +349,7 @@ static void test_vsys_x(void) return; } - ksft_print_msg("Make sure that vsyscalls really page fault\n"); + ksft_print_msg("Make sure that vsyscalls really cause a fault\n"); bool can_exec; if (sigsetjmp(jmpbuf, 1) == 0) { @@ -358,13 +360,14 @@ static void test_vsys_x(void) } if (can_exec) - ksft_test_result_fail("Executing the vsyscall did not page fault\n"); - else if (segv_err & (1 << 4)) /* INSTR */ - ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n", - segv_err); + ksft_test_result_fail("Executing the vsyscall did not fault\n"); + /* #GP or #PF (with X86_PF_INSTR) */ + else if ((segv_trapno == 13) || ((segv_trapno == 14) && (segv_err & (1 << 4)))) + ksft_test_result_pass("Executing the vsyscall page failed (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); else - ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n", - segv_err); + ksft_test_result_fail("Execution failed with the wrong error (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); } /* diff --git a/tools/thermal/thermal-engine/thermal-engine.c b/tools/thermal/thermal-engine/thermal-engine.c index 0764dc754771..66b0ba1fcd23 100644 --- a/tools/thermal/thermal-engine/thermal-engine.c +++ b/tools/thermal/thermal-engine/thermal-engine.c @@ -374,7 +374,7 @@ int main(int argc, char *argv[]) } if (options.daemonize && daemon(0, 0)) { - ERROR("Failed to daemonize: %p\n"); + ERROR("Failed to daemonize: %m\n"); return 
THERMAL_ENGINE_DAEMON_ERROR;
}
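The thermal-engine hunk closes the series with a genuine bug fix: the old call passed "%p" with no matching argument, which is undefined behavior, while "%m" consumes no argument and expands to strerror(errno). A small standalone illustration of the glibc-specific conversion (EACCES is just an arbitrary example errno):

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	errno = EACCES;
	/* %m consumes no argument; glibc expands it to strerror(errno) */
	fprintf(stderr, "Failed to daemonize: %m\n");
	/* portable equivalent */
	fprintf(stderr, "Failed to daemonize: %s\n", strerror(errno));
	return 0;
}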
