385 files changed, 16831 insertions, 5989 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 51c0bc4c2dc5..0e6c67ac585a 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -15,6 +15,9 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou .. CONTENTS + [Whenever any new section is added to this document, please also add + an entry here.] + 1. Introduction 1-1. Terminology 1-2. What is cgroup? @@ -25,9 +28,10 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou 2-2-2. Threads 2-3. [Un]populated Notification 2-4. Controlling Controllers - 2-4-1. Enabling and Disabling - 2-4-2. Top-down Constraint - 2-4-3. No Internal Process Constraint + 2-4-1. Availability + 2-4-2. Enabling and Disabling + 2-4-3. Top-down Constraint + 2-4-4. No Internal Process Constraint 2-5. Delegation 2-5-1. Model of Delegation 2-5-2. Delegation Containment @@ -61,14 +65,15 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou 5-4-1. PID Interface Files 5-5. Cpuset 5.5-1. Cpuset Interface Files - 5-6. Device + 5-6. Device controller 5-7. RDMA 5-7-1. RDMA Interface Files 5-8. DMEM + 5-8-1. DMEM Interface Files 5-9. HugeTLB 5.9-1. HugeTLB Interface Files 5-10. Misc - 5.10-1 Miscellaneous cgroup Interface Files + 5.10-1 Misc Interface Files 5.10-2 Migration and Ownership 5-11. Others 5-11-1. perf_event @@ -1001,6 +1006,24 @@ All cgroup core files are prefixed with "cgroup." Total number of dying cgroup subsystems (e.g. memory cgroup) at and beneath the current cgroup. + cgroup.stat.local + A read-only flat-keyed file which exists in non-root cgroups. + The following entry is defined: + + frozen_usec + Cumulative time that this cgroup has spent between freezing and + thawing, regardless of whether by self or ancestor groups. + NB: (not) reaching "frozen" state is not accounted here. + + Using the following ASCII representation of a cgroup's freezer + state, :: + + 1 _____ + frozen 0 __/ \__ + ab cd + + the duration being measured is the span between a and c. + cgroup.freeze A read-write single value file which exists on non-root cgroups. Allowed values are "0" and "1". The default is "0". diff --git a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst index 5964901d66e3..d0bdbd81dcf9 100644 --- a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst +++ b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst @@ -218,6 +218,7 @@ SRSO X X X X SSB X TAA X X X X * (Note 2) TSA X X X X +VMSCAPE X =============== ============== ============ ============= ============== ============ ======== Notes: diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3f358fbcbfc..e019db1633fd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2606,6 +2606,11 @@ for it. Intended to get systems with badly broken firmware running. + irqhandler.duration_warn_us= [KNL] + Warn if an IRQ handler exceeds the specified duration + threshold in microseconds. Useful for identifying + long-running IRQs in the system. + irqpoll [HW] When an interrupt is not handled search all handlers for it. Also check all handlers each timer @@ -3767,8 +3772,16 @@ mga= [HW,DRM] - microcode.force_minrev= [X86] - Format: <bool> + microcode= [X86] Control the behavior of the microcode loader. 
+ Available options, comma separated: + + base_rev=X - where <X> has the format: <u32> + Set the base microcode revision of each thread when in + debug mode. + + dis_ucode_ldr: disable the microcode loader + + force_minrev: Enable or disable the microcode minimal revision enforcement for the runtime microcode loader. @@ -6155,7 +6168,7 @@ rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, - mba, smba, bmec. + mba, smba, bmec, abmc. E.g. to turn on cmt and turn off mba use: rdt=cmt,!mba diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index c12837e61bda..86bec8ac2c4d 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -141,6 +141,197 @@ Thread-related topology information in the kernel: +System topology enumeration +=========================== + +The topology on x86 systems can be discovered using a combination of vendor +specific CPUID leaves which enumerate the processor topology and the cache +hierarchy. + +The CPUID leaves in their preferred order of parsing for each x86 vendor are as +follows: + +1) AMD + + 1) CPUID leaf 0x80000026 [Extended CPU Topology] (Core::X86::Cpuid::ExCpuTopology) + + The extended CPUID leaf 0x80000026 is the extension of the CPUID leaf 0xB + and provides the topology information of Core, Complex, CCD (Die), and + Socket in each level. + + Support for the leaf is discovered by checking if the maximum extended + CPUID level is >= 0x80000026 and then checking if `LogProcAtThisLevel` + in `EBX[15:0]` at a particular level (starting from 0) is non-zero. + + The `LevelType` in `ECX[15:8]` at the level provides the topology domain + the level describes - Core, Complex, CCD(Die), or the Socket. + + The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the + number of bits that need to be right-shifted from `ExtendedLocalApicId` + in `EDX[31:0]` in order to get a unique Topology ID for the topology + level. CPUs with the same Topology ID share the resources at that level. + + CPUID leaf 0x80000026 also provides more information regarding the power + and efficiency rankings, and about the core type on AMD processors with + heterogeneous characteristics. + + If CPUID leaf 0x80000026 is supported, further parsing is not required. + + 2) CPUID leaf 0x0000000B [Extended Topology Enumeration] (Core::X86::Cpuid::ExtTopEnum) + + The extended CPUID leaf 0x0000000B is the predecessor of the extended + CPUID leaf 0x80000026 and only describes the core and the socket domains + of the processor topology. + + The support for the leaf is discovered by checking if the maximum supported + CPUID level is >= 0xB and then checking if `EBX[31:0]` at a particular level + (starting from 0) is non-zero. + + The `LevelType` in `ECX[15:8]` at the level provides the topology domain + that the level describes - Thread, or Processor (Socket). + + The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the + number of bits that need to be right-shifted from the `ExtendedLocalApicId` + in `EDX[31:0]` to get a unique Topology ID for that topology level. CPUs + sharing the Topology ID share the resources at that level. + + If CPUID leaf 0xB is supported, further parsing is not required. + + + 3) CPUID leaf 0x80000008 ECX [Size Identifiers] (Core::X86::Cpuid::SizeId) + + If neither the CPUID leaf 0x80000026 nor 0xB is supported, the number of + CPUs on the package is detected using the Size Identifier leaf + 0x80000008 ECX.
+ + The support for the leaf is discovered by checking if the supported + extended CPUID level is >= 0x80000008. + + The shift from the APIC ID for the Socket ID is calculated from the + `ApicIdSize` field in `ECX[15:12]` if it is non-zero. + + If `ApicIdSize` is reported to be zero, the shift is calculated as the + order of the `number of threads` calculated from `NC` field in + `ECX[7:0]` which describes the `number of threads - 1` on the package. + + Unless Extended APIC ID is supported, the APIC ID used to find the + Socket ID is from the `LocalApicId` field of CPUID leaf 0x00000001 + `EBX[31:24]`. + + The topology parsing continues to detect if Extended APIC ID is + supported or not. + + + 4) CPUID leaf 0x8000001E [Extended APIC ID, Core Identifiers, Node Identifiers] + (Core::X86::Cpuid::{ExtApicId,CoreId,NodeId}) + + The support for Extended APIC ID can be detected by checking for the + presence of `TopologyExtensions` in `ECX[22]` of CPUID leaf 0x80000001 + [Feature Identifiers] (Core::X86::Cpuid::FeatureExtIdEcx). + + If Topology Extensions is supported, the APIC ID from `ExtendedApicId` + from CPUID leaf 0x8000001E `EAX[31:0]` should be preferred over that from + the `LocalApicId` field of CPUID leaf 0x00000001 `EBX[31:24]` for topology + enumeration. + + On processors of Family 0x17 and above that do not support CPUID leaf + 0x80000026 or CPUID leaf 0xB, the shift from the APIC ID for the Core + ID is calculated using the order of `number of threads per core` + calculated using the `ThreadsPerCore` field in `EBX[15:8]` which + describes `number of threads per core - 1`. + + On processors of Family 0x15, the Core ID from `EBX[7:0]` is used as the + `cu_id` (Compute Unit ID) to detect CPUs that share the compute units. + + + All AMD processors that support the `TopologyExtensions` feature store the + `NodeId` from the `ECX[7:0]` of CPUID leaf 0x8000001E + (Core::X86::Cpuid::NodeId) as the per-CPU `node_id`. On older processors, + the `node_id` was discovered using the MSR_FAM10H_NODE_ID MSR (MSR + 0xc001_100c). The presence of the NODE_ID MSR was detected by checking + `ECX[19]` of CPUID leaf 0x80000001 [Feature Identifiers] + (Core::X86::Cpuid::FeatureExtIdEcx). + + +2) Intel + + On Intel platforms, the CPUID leaves that enumerate the processor + topology are as follows: + + 1) CPUID leaf 0x1F (V2 Extended Topology Enumeration Leaf) + + The CPUID leaf 0x1F is the extension of the CPUID leaf 0xB and provides + the topology information of Core, Module, Tile, Die, DieGrp, and Socket + in each level. + + The support for the leaf is discovered by checking if the supported + CPUID level is >= 0x1F and then checking if `EBX[31:0]` at a particular level + (starting from 0) is non-zero. + + The `Domain Type` in `ECX[15:8]` of the sub-leaf provides the topology + domain that the level describes - Core, Module, Tile, Die, DieGrp, and + Socket. + + The kernel uses the value from `EAX[4:0]` to discover the number of + bits that need to be right shifted from the `x2APIC ID` in `EDX[31:0]` + to get a unique Topology ID for the topology level. CPUs with the same + Topology ID share the resources at that level. + + If CPUID leaf 0x1F is supported, further parsing is not required. + + + 2) CPUID leaf 0x0000000B (Extended Topology Enumeration Leaf) + + The extended CPUID leaf 0x0000000B is the predecessor of the V2 Extended + Topology Enumeration Leaf 0x1F and only describes the core and the + socket domains of the processor topology.
+ + The support for the leaf is discovered by checking if the supported CPUID + level is >= 0xB and then checking if `EBX[31:0]` at a particular level + (starting from 0) is non-zero. + + CPUID leaf 0x0000000B shares the same layout as CPUID leaf 0x1F and + should be enumerated in a similar manner. + + If CPUID leaf 0xB is supported, further parsing is not required. + + + 3) CPUID leaf 0x00000004 (Deterministic Cache Parameters Leaf) + + On Intel processors that support neither CPUID leaf 0x1F nor CPUID leaf + 0xB, the shifts for the SMT domains are calculated using the number of + CPUs sharing the L1 cache. + + Support for Hyper-Threading is detected using `EDX[28]` of + CPUID leaf 0x1 (Basic CPUID Information). + + The order of `Maximum number of addressable IDs for logical processors + sharing this cache` from `EAX[25:14]` of level-0 of CPUID 0x4 provides + the shifts from the APIC ID required to compute the Core ID. + + The APIC ID and Package information are computed using the data from + CPUID leaf 0x1. + + + 4) CPUID leaf 0x00000001 (Basic CPUID Information) + + The mask and shifts to derive the Physical Package (socket) ID are + computed using the `Maximum number of addressable IDs for logical + processors in this physical package` from `EBX[23:16]` of CPUID leaf + 0x1. + + The APIC ID on the legacy platforms is derived from the `Initial APIC + ID` field from `EBX[31:24]` of CPUID leaf 0x1. + + +3) Centaur and Zhaoxin + + Similar to Intel, Centaur and Zhaoxin use a combination of CPUID leaf + 0x00000004 (Deterministic Cache Parameters Leaf) and CPUID leaf 0x00000001 + (Basic CPUID Information) to derive the topology information. + + + System topology examples ======================== diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml index 5bd517befb68..4accf4cbc6c7 100644 --- a/Documentation/devicetree/bindings/arm/cpus.yaml +++ b/Documentation/devicetree/bindings/arm/cpus.yaml @@ -353,6 +353,12 @@ properties: $ref: /schemas/types.yaml#/definitions/phandle description: Link to Mediatek Cache Coherent Interconnect + edac-enabled: + $ref: /schemas/types.yaml#/definitions/flag + description: + A72 CPUs support Error Detection And Correction (EDAC) on their L1 and + L2 caches. This flag marks this function as usable.
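As a worked illustration of the shift-based Topology ID derivation documented in the topology.rst hunk above, the following minimal user-space sketch (not part of the patch itself; it assumes an x86 host and a GCC/Clang toolchain that provides <cpuid.h>) walks CPUID leaf 0x1F, falling back to 0xB, and prints one Topology ID per enumerated domain level by right-shifting the x2APIC ID by the width reported in EAX[4:0]. AMD's leaf 0x80000026 follows the same shift-and-compare scheme with the register layout described above. ::

    /* cpuid_topo.c - illustrative sketch of CPUID 0x1F/0xB topology parsing */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            /* Prefer the V2 leaf 0x1F when the maximum basic leaf allows it */
            unsigned int leaf = (__get_cpuid_max(0, NULL) >= 0x1f) ? 0x1f : 0x0b;

            for (unsigned int subleaf = 0; ; subleaf++) {
                    if (!__get_cpuid_count(leaf, subleaf, &eax, &ebx, &ecx, &edx))
                            break;                        /* leaf not supported at all */

                    unsigned int shift      = eax & 0x1f;         /* EAX[4:0]  */
                    unsigned int nr_logical = ebx & 0xffff;       /* EBX[15:0] */
                    unsigned int level_type = (ecx >> 8) & 0xff;  /* ECX[15:8] */
                    unsigned int x2apic_id  = edx;                /* EDX[31:0] */

                    if (!level_type)                      /* invalid level ends the walk */
                            break;

                    /* CPUs whose x2APIC ID matches above 'shift' bits share this domain */
                    printf("leaf 0x%x sub %u: type=%u logical=%u shift=%u topo_id=%u\n",
                           leaf, subleaf, level_type, nr_logical, shift,
                           x2apic_id >> shift);
            }
            return 0;
    }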
+ qcom,saw: $ref: /schemas/types.yaml#/definitions/phandle description: @@ -400,6 +406,17 @@ allOf: - $ref: /schemas/cpu.yaml# - $ref: /schemas/opp/opp-v1.yaml# - if: + not: + properties: + compatible: + contains: + const: arm,cortex-a72 + then: + # Allow edac-enabled only for Cortex A72 + properties: + edac-enabled: false + + - if: # If the enable-method property contains one of those values properties: enable-method: diff --git a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml index d5287a2bf866..d998a9d69b91 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml @@ -5,7 +5,7 @@ $id: http://devicetree.org/schemas/interrupt-controller/aspeed,ast2500-scu-ic.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Aspeed AST25XX and AST26XX SCU Interrupt Controller +title: Aspeed AST25XX, AST26XX, AST27XX SCU Interrupt Controller maintainers: - Eddie James <eajames@linux.ibm.com> @@ -16,6 +16,10 @@ properties: - aspeed,ast2500-scu-ic - aspeed,ast2600-scu-ic0 - aspeed,ast2600-scu-ic1 + - aspeed,ast2700-scu-ic0 + - aspeed,ast2700-scu-ic1 + - aspeed,ast2700-scu-ic2 + - aspeed,ast2700-scu-ic3 reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml new file mode 100644 index 000000000000..479288567d0b --- /dev/null +++ b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/memory-controllers/xlnx,versal-net-ddrmc5.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Xilinx Versal NET Memory Controller + +maintainers: + - Shubhrajyoti Datta <shubhrajyoti.datta@amd.com> + +description: + The integrated DDR Memory Controllers (DDRMCs) support both DDR5 and LPDDR5 + compact and extended memory interfaces. The Versal NET DDR memory controller + has optional ECC support which corrects single bit ECC errors and detects + double bit ECC errors. It also has support for reporting other errors like + MMCM (Mixed-Mode Clock Manager) errors and general software errors. + +properties: + compatible: + const: xlnx,versal-net-ddrmc5 + + amd,rproc: + $ref: /schemas/types.yaml#/definitions/phandle + description: + phandle to the remoteproc_r5 rproc node through which the APU interacts + with the remote processor. The APU primarily communicates with the RPU for + accessing the DDRMC address space and getting error notifications.
+ +required: + - compatible + - amd,rproc + +additionalProperties: false + +examples: + - | + memory-controller { + compatible = "xlnx,versal-net-ddrmc5"; + amd,rproc = <&remoteproc_r5>; + }; diff --git a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml index 5eccd10d95ce..67be6d095fe4 100644 --- a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml +++ b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml @@ -75,6 +75,10 @@ patternProperties: - aspeed,ast2500-scu-ic - aspeed,ast2600-scu-ic0 - aspeed,ast2600-scu-ic1 + - aspeed,ast2700-scu-ic0 + - aspeed,ast2700-scu-ic1 + - aspeed,ast2700-scu-ic2 + - aspeed,ast2700-scu-ic3 '^silicon-id@[0-9a-f]+$': description: Unique hardware silicon identifiers within the SoC diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt b/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt deleted file mode 100644 index 3cb2f4c98d64..000000000000 --- a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt +++ /dev/null @@ -1,38 +0,0 @@ -Faraday Technology timer - -This timer is a generic IP block from Faraday Technology, embedded in the -Cortina Systems Gemini SoCs and other designs. - -Required properties: - -- compatible : Must be one of - "faraday,fttmr010" - "cortina,gemini-timer", "faraday,fttmr010" - "moxa,moxart-timer", "faraday,fttmr010" - "aspeed,ast2400-timer" - "aspeed,ast2500-timer" - "aspeed,ast2600-timer" - -- reg : Should contain registers location and length -- interrupts : Should contain the three timer interrupts usually with - flags for falling edge - -Optionally required properties: - -- clocks : a clock to provide the tick rate for "faraday,fttmr010" -- clock-names : should be "EXTCLK" and "PCLK" for the external tick timer - and peripheral clock respectively, for "faraday,fttmr010" -- syscon : a phandle to the global Gemini system controller if the compatible - type is "cortina,gemini-timer" - -Example: - -timer@43000000 { - compatible = "faraday,fttmr010"; - reg = <0x43000000 0x1000>; - interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */ - <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */ - <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */ - clocks = <&extclk>, <&pclk>; - clock-names = "EXTCLK", "PCLK"; -}; diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml b/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml new file mode 100644 index 000000000000..39506323556c --- /dev/null +++ b/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/faraday,fttmr010.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Faraday FTTMR010 timer + +maintainers: + - Joel Stanley <joel@jms.id.au> + - Linus Walleij <linus.walleij@linaro.org> + +description: + This timer is a generic IP block from Faraday Technology, embedded in the + Cortina Systems Gemini SoCs and other designs. 
+ +properties: + compatible: + oneOf: + - items: + - const: moxa,moxart-timer + - const: faraday,fttmr010 + - enum: + - aspeed,ast2400-timer + - aspeed,ast2500-timer + - aspeed,ast2600-timer + - cortina,gemini-timer + - faraday,fttmr010 + + reg: + maxItems: 1 + + interrupts: + minItems: 1 + maxItems: 8 + description: One interrupt per timer + + clocks: + minItems: 1 + items: + - description: Peripheral clock + - description: External tick clock + + clock-names: + minItems: 1 + items: + - const: PCLK + - const: EXTCLK + + resets: + maxItems: 1 + + syscon: + description: System controller phandle for Gemini systems + $ref: /schemas/types.yaml#/definitions/phandle + +required: + - compatible + - reg + - interrupts + +allOf: + - if: + properties: + compatible: + contains: + const: cortina,gemini-timer + then: + required: + - syscon + else: + properties: + syscon: false + +additionalProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/irq.h> + + timer@43000000 { + compatible = "faraday,fttmr010"; + reg = <0x43000000 0x1000>; + interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */ + <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */ + <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */ + clocks = <&pclk>, <&extclk>; + clock-names = "PCLK", "EXTCLK"; + }; diff --git a/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml b/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml index 0e4a8ddc3de3..e3b61b62521e 100644 --- a/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml +++ b/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml @@ -14,7 +14,9 @@ properties: const: fsl,ftm-timer reg: - maxItems: 1 + items: + - description: clock event device + - description: clock source device interrupts: maxItems: 1 @@ -50,7 +52,8 @@ examples: ftm@400b8000 { compatible = "fsl,ftm-timer"; - reg = <0x400b8000 0x1000>; + reg = <0x400b8000 0x1000>, + <0x400b9000 0x1000>; interrupts = <0 44 IRQ_TYPE_LEVEL_HIGH>; clock-names = "ftm-evt", "ftm-src", "ftm-evt-counter-en", "ftm-src-counter-en"; clocks = <&clks VF610_CLK_FTM2>, <&clks VF610_CLK_FTM3>, diff --git a/Documentation/devicetree/bindings/timer/fsl,timrot.yaml b/Documentation/devicetree/bindings/timer/fsl,timrot.yaml new file mode 100644 index 000000000000..d181f274ef9f --- /dev/null +++ b/Documentation/devicetree/bindings/timer/fsl,timrot.yaml @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/fsl,timrot.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Freescale MXS Timer + +maintainers: + - Frank Li <Frank.Li@nxp.com> + +properties: + compatible: + items: + - enum: + - fsl,imx23-timrot + - fsl,imx28-timrot + - const: fsl,timrot + + reg: + maxItems: 1 + + interrupts: + items: + - description: irq for timer0 + - description: irq for timer1 + - description: irq for timer2 + - description: irq for timer3 + + clocks: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + - clocks + +additionalProperties: false + +examples: + - | + timer: timer@80068000 { + compatible = "fsl,imx28-timrot", "fsl,timrot"; + reg = <0x80068000 0x2000>; + interrupts = <48>, <49>, <50>, <51>; + clocks = <&clks 26>; + }; diff --git a/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml b/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml index bee2c35bd0e2..42e130654d58 100644 --- a/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml +++ b/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml @@ -15,8 +15,13 @@ 
description: properties: compatible: - enum: - - fsl,vf610-pit + oneOf: + - enum: + - fsl,vf610-pit + - nxp,s32g2-pit + - items: + - const: nxp,s32g3-pit + - const: nxp,s32g2-pit reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/timer/mediatek,timer.yaml b/Documentation/devicetree/bindings/timer/mediatek,timer.yaml index f68fc7050c56..e3e38066c2cb 100644 --- a/Documentation/devicetree/bindings/timer/mediatek,timer.yaml +++ b/Documentation/devicetree/bindings/timer/mediatek,timer.yaml @@ -26,6 +26,7 @@ properties: - items: - enum: - mediatek,mt2701-timer + - mediatek,mt6572-timer - mediatek,mt6580-timer - mediatek,mt6582-timer - mediatek,mt6589-timer @@ -44,6 +45,7 @@ properties: - mediatek,mt8188-timer - mediatek,mt8192-timer - mediatek,mt8195-timer + - mediatek,mt8196-timer - mediatek,mt8365-systimer - const: mediatek,mt6765-timer diff --git a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml index 10578f544581..a4b229e0e78a 100644 --- a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml +++ b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml @@ -26,6 +26,7 @@ properties: - items: - enum: - axis,artpec8-mct + - axis,artpec9-mct - google,gs101-mct - samsung,exynos2200-mct-peris - samsung,exynos3250-mct @@ -131,6 +132,7 @@ allOf: contains: enum: - axis,artpec8-mct + - axis,artpec9-mct - google,gs101-mct - samsung,exynos2200-mct-peris - samsung,exynos5260-mct diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index c7949dd44f2f..006d23af66e1 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -26,6 +26,7 @@ MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local" MBA (Memory Bandwidth Allocation) "mba" SMBA (Slow Memory Bandwidth Allocation) "" BMEC (Bandwidth Monitoring Event Configuration) "" +ABMC (Assignable Bandwidth Monitoring Counters) "" =============================================== ================================ Historically, new features were made visible by default in /proc/cpuinfo. This @@ -256,6 +257,144 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/mbm_local_bytes_config 0=0x30;1=0x30;3=0x15;4=0x15 +"mbm_assign_mode": + The supported counter assignment modes. The enclosed brackets indicate which mode + is enabled. The MBM events associated with counters may reset when "mbm_assign_mode" + is changed. + :: + + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + [mbm_event] + default + + "mbm_event": + + mbm_event mode allows users to assign a hardware counter to an RMID, event + pair and monitor the bandwidth usage as long as it is assigned. The hardware + continues to track the assigned counter until it is explicitly unassigned by + the user. Each event within a resctrl group can be assigned independently. + + In this mode, a monitoring event can only accumulate data while it is backed + by a hardware counter. Use "mbm_L3_assignments" found in each CTRL_MON and MON + group to specify which of the events should have a counter assigned. The number + of counters available is described in the "num_mbm_cntrs" file. Changing the + mode may cause all counters on the resource to reset. + + Moving to mbm_event counter assignment mode requires users to assign the counters + to the events. Otherwise, the MBM event counters will return 'Unassigned' when read. 
+ + The mode is beneficial for AMD platforms that support more CTRL_MON + and MON groups than available hardware counters. By default, this + feature is enabled on AMD platforms with the ABMC (Assignable Bandwidth + Monitoring Counters) capability, ensuring counters remain assigned even + when the corresponding RMID is not actively used by any processor. + + "default": + + In default mode, resctrl assumes there is a hardware counter for each + event within every CTRL_MON and MON group. On AMD platforms, it is + recommended to use the mbm_event mode, if supported, to prevent reset of MBM + events between reads resulting from hardware re-allocating counters. This can + result in misleading values or display "Unavailable" if no counter is assigned + to the event. + + * To enable "mbm_event" counter assignment mode: + :: + + # echo "mbm_event" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + + * To enable "default" monitoring mode: + :: + + # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + +"num_mbm_cntrs": + The maximum number of counters (total of available and assigned counters) in + each domain when the system supports mbm_event mode. + + For example, on a system with maximum of 32 memory bandwidth monitoring + counters in each of its L3 domains: + :: + + # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs + 0=32;1=32 + +"available_mbm_cntrs": + The number of counters available for assignment in each domain when mbm_event + mode is enabled on the system. + + For example, on a system with 30 available [hardware] assignable counters + in each of its L3 domains: + :: + + # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs + 0=30;1=30 + +"event_configs": + Directory that exists when "mbm_event" counter assignment mode is supported. + Contains a sub-directory for each MBM event that can be assigned to a counter. + + Two MBM events are supported by default: mbm_local_bytes and mbm_total_bytes. + Each MBM event's sub-directory contains a file named "event_filter" that is + used to view and modify which memory transactions the MBM event is configured + with. The file is accessible only when "mbm_event" counter assignment mode is + enabled. + + List of memory transaction types supported: + + ========================== ======================================================== + Name Description + ========================== ======================================================== + dirty_victim_writes_all Dirty Victims from the QOS domain to all types of memory + remote_reads_slow_memory Reads to slow memory in the non-local NUMA domain + local_reads_slow_memory Reads to slow memory in the local NUMA domain + remote_non_temporal_writes Non-temporal writes to non-local NUMA domain + local_non_temporal_writes Non-temporal writes to local NUMA domain + remote_reads Reads to memory in the non-local NUMA domain + local_reads Reads to memory in the local NUMA domain + ========================== ======================================================== + + For example:: + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes, + local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory + + Modify the event configuration by writing to the "event_filter" file within + the "event_configs" directory. 
The read/write "event_filter" file contains the + configuration of the event that reflects which memory transactions are counted by it. + + For example:: + + # echo "local_reads, local_non_temporal_writes" > + /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,local_non_temporal_writes + +"mbm_assign_on_mkdir": + Exists when "mbm_event" counter assignment mode is supported. Accessible + only when "mbm_event" counter assignment mode is enabled. + + Determines if a counter will automatically be assigned to an RMID, MBM event + pair when its associated monitor group is created via mkdir. Enabled by default + on boot, also when switched from "default" mode to "mbm_event" counter assignment + mode. Users can disable this capability by writing to the interface. + + "0": + Auto assignment is disabled. + "1": + Auto assignment is enabled. + + Example:: + + # echo 0 > /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + 0 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy @@ -380,10 +519,77 @@ When monitoring is enabled all MON groups will also contain: for the L3 cache they occupy). These are named "mon_sub_L3_YY" where "YY" is the node number. + When the 'mbm_event' counter assignment mode is enabled, reading + an MBM event of a MON group returns 'Unassigned' if no hardware + counter is assigned to it. For CTRL_MON groups, 'Unassigned' is + returned if the MBM event does not have an assigned counter in the + CTRL_MON group nor in any of its associated MON groups. + "mon_hw_id": Available only with debug option. The identifier used by hardware for the monitor group. On x86 this is the RMID. +When monitoring is enabled all MON groups may also contain: + +"mbm_L3_assignments": + Exists when "mbm_event" counter assignment mode is supported and lists the + counter assignment states of the group. + + The assignment list is displayed in the following format: + + <Event>:<Domain ID>=<Assignment state>;<Domain ID>=<Assignment state> + + Event: A valid MBM event in the + /sys/fs/resctrl/info/L3_MON/event_configs directory. + + Domain ID: A valid domain ID. When writing, '*' applies the changes + to all the domains. + + Assignment states: + + _ : No counter assigned. + + e : Counter assigned exclusively. + + Example: + + To display the counter assignment states for the default group. + :: + + # cd /sys/fs/resctrl + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + + Assignments can be modified by writing to the interface. 
+ + Examples: + + To unassign the counter associated with the mbm_total_bytes event on domain 0: :: + + # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=e + mbm_local_bytes:0=e;1=e + + To unassign the counter associated with the mbm_total_bytes event on all the domains: :: + + # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=_ + mbm_local_bytes:0=e;1=e + + To assign a counter associated with the mbm_total_bytes event on all domains in + exclusive mode: :: + + # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + When the "mba_MBps" mount option is used all CTRL_MON groups will also contain: "mba_MBps_event": @@ -1429,6 +1635,125 @@ View the llc occupancy snapshot:: # cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy 11234000 + +Examples on working with mbm_assign_mode +======================================== + +a. Check if MBM counter assignment mode is supported. +:: + + # mount -t resctrl resctrl /sys/fs/resctrl/ + + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + [mbm_event] + default + +The "mbm_event" mode is detected and enabled. + +b. Check how many assignable counters are supported. +:: + + # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs + 0=32;1=32 + +c. Check how many assignable counters are available for assignment in each domain. +:: + + # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs + 0=30;1=30 + +d. To list the default group's assign states. +:: + + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + +e. To unassign the counter associated with the mbm_total_bytes event on domain 0. +:: + + # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=e + mbm_local_bytes:0=e;1=e + +f. To unassign the counter associated with the mbm_total_bytes event on all domains. +:: + + # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=_ + mbm_local_bytes:0=e;1=e + +g. To assign a counter associated with the mbm_total_bytes event on all domains in +exclusive mode. +:: + + # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + +h. Read the events mbm_total_bytes and mbm_local_bytes of the default group. There is +no change in reading the events with the assignment. +:: + + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_total_bytes + 779247936 + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_total_bytes + 562324232 + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes + 212122123 + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes + 121212144 + +i. Check the event configurations. +:: + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes, + local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory + +j. Change the event configuration for mbm_local_bytes.
+:: + + # echo "local_reads, local_non_temporal_writes, local_reads_slow_memory, remote_reads" > + /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory,remote_reads + +k. Now read the local events again. The first read may come back with "Unavailable" +status. The subsequent read of mbm_local_bytes will display the current value. +:: + + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes + Unavailable + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes + 2252323 + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes + Unavailable + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes + 1566565 + +l. Users have the option to go back to 'default' mbm_assign_mode if required. This can be +done using the following command. Note that switching the mbm_assign_mode may reset all +the MBM counters (and thus all MBM events) of all the resctrl groups. +:: + + # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + mbm_event + [default] + +m. Unmount the resctrl filesystem. +:: + + # umount /sys/fs/resctrl/ + Intel RDT Errata ================ @@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file) $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE $(call filechk,offsets,__ASM_OFFSETS_H__) +# Generate rq-offsets.h + +rq-offsets-file := include/generated/rq-offsets.h + +targets += kernel/sched/rq-offsets.s + +kernel/sched/rq-offsets.s: $(offsets-file) + +$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE + $(call filechk,offsets,__RQ_OFFSETS_H__) + # Check for missing system calls quiet_cmd_syscalls = CALL $< cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags) PHONY += missing-syscalls -missing-syscalls: scripts/checksyscalls.sh $(offsets-file) +missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file) $(call cmd,syscalls) # Check the manual modification of atomic headers diff --git a/MAINTAINERS b/MAINTAINERS index 8f109287c2f3..331728e93f69 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1989,6 +1989,7 @@ S: Maintained F: arch/arm/include/asm/arch_timer.h F: arch/arm64/include/asm/arch_timer.h F: drivers/clocksource/arm_arch_timer.c +F: drivers/clocksource/arm_arch_timer_mmio.c ARM GENERIC INTERRUPT CONTROLLER DRIVERS M: Marc Zyngier <maz@kernel.org> @@ -3988,8 +3989,9 @@ F: drivers/input/touchscreen/atmel_mxt_ts.c ATOMIC INFRASTRUCTURE M: Will Deacon <will@kernel.org> M: Peter Zijlstra <peterz@infradead.org> -R: Boqun Feng <boqun.feng@gmail.com> +M: Boqun Feng <boqun.feng@gmail.com> R: Mark Rutland <mark.rutland@arm.com> +R: Gary Guo <gary@garyguo.net> L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/atomic_*.txt @@ -3997,6 +3999,9 @@ F: arch/*/include/asm/atomic*.h F: include/*/atomic*.h F: include/linux/refcount.h F: scripts/atomic/ +F: rust/kernel/sync/atomic.rs +F: rust/kernel/sync/atomic/ +F: rust/kernel/sync/refcount.rs ATTO EXPRESSSAS SAS/SATA RAID SCSI DRIVER M: Bradley Grove <linuxdrivers@attotech.com> @@ -8741,9 +8746,6 @@ F: drivers/edac/thunderx_edac* EDAC-CORE M: Borislav Petkov <bp@alien8.de> M: Tony Luck <tony.luck@intel.com> -R: James Morse <james.morse@arm.com> -R: Mauro Carvalho Chehab <mchehab@kernel.org> -R: Robert Richter <rric@kernel.org> L: linux-edac@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git 
edac-for-next @@ -8751,6 +8753,13 @@ F: Documentation/driver-api/edac.rst F: drivers/edac/ F: include/linux/edac.h +EDAC-A72 +M: Vijay Balakrishna <vijayb@linux.microsoft.com> +M: Tyler Hicks <code@tyhicks.com> +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/edac/a72_edac.c + EDAC-DMC520 M: Lei Wang <lewan@microsoft.com> L: linux-edac@vger.kernel.org @@ -21178,6 +21187,7 @@ M: Tony Luck <tony.luck@intel.com> M: Reinette Chatre <reinette.chatre@intel.com> R: Dave Martin <Dave.Martin@arm.com> R: James Morse <james.morse@arm.com> +R: Babu Moger <babu.moger@amd.com> L: linux-kernel@vger.kernel.org S: Supported F: Documentation/filesystems/resctrl.rst @@ -27671,6 +27681,13 @@ S: Maintained F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml F: drivers/edac/versal_edac.c +XILINX VERSALNET EDAC DRIVER +M: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com> +S: Maintained +F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml +F: drivers/edac/versalnet_edac.c +F: include/linux/cdx/edac_cdx_pcol.h + XILINX WATCHDOG DRIVER M: Srinivas Neeli <srinivas.neeli@amd.com> R: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com> diff --git a/arch/Kconfig b/arch/Kconfig index 97642c08a124..5440616f0774 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -41,6 +41,44 @@ config HOTPLUG_SMT config SMT_NUM_THREADS_DYNAMIC bool +config ARCH_SUPPORTS_SCHED_SMT + bool + +config ARCH_SUPPORTS_SCHED_CLUSTER + bool + +config ARCH_SUPPORTS_SCHED_MC + bool + +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on ARCH_SUPPORTS_SCHED_SMT + default y + help + Improves the CPU scheduler's decision making when dealing with + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. + +config SCHED_CLUSTER + bool "Cluster scheduler support" + depends on ARCH_SUPPORTS_SCHED_CLUSTER + default y + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + +config SCHED_MC + bool "Multi-Core Cache (MC) scheduler support" + depends on ARCH_SUPPORTS_SCHED_MC + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + # Selected by HOTPLUG_CORE_SYNC_DEAD or HOTPLUG_CORE_SYNC_FULL config HOTPLUG_CORE_SYNC bool @@ -1619,7 +1657,7 @@ config HAVE_SPARSE_SYSCALL_NR related optimizations for a given architecture. config ARCH_HAS_VDSO_ARCH_DATA - depends on GENERIC_VDSO_DATA_STORE + depends on HAVE_GENERIC_VDSO bool config ARCH_HAS_VDSO_TIME_DATA @@ -1740,6 +1778,10 @@ config ARCH_VMLINUX_NEEDS_RELOCS relocations preserved. This is used by some architectures to construct bespoke relocation tables for KASLR. +# Select if architecture uses the common generic TIF bits +config HAVE_GENERIC_TIF_BITS + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index e9dad60b147f..1ebb05890499 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. 
*/ +#define COMPILE_OFFSETS #include <linux/types.h> #include <linux/stddef.h> diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c index f77deb799175..2978da85fcb6 100644 --- a/arch/arc/kernel/asm-offsets.c +++ b/arch/arc/kernel/asm-offsets.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) */ +#define COMPILE_OFFSETS #include <linux/sched.h> #include <linux/mm.h> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 36ab8625be72..358057001859 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -941,28 +941,14 @@ config IRQSTACKS config ARM_CPU_TOPOLOGY bool "Support cpu topology definition" depends on SMP && CPU_V7 + select ARCH_SUPPORTS_SCHED_MC + select ARCH_SUPPORTS_SCHED_SMT default y help Support ARM cpu topology definition. The MPIDR register defines affinity between processors which is then used to describe the cpu topology of an ARM System. -config SCHED_MC - bool "Multi-core scheduler support" - depends on ARM_CPU_TOPOLOGY - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - -config SCHED_SMT - bool "SMT scheduler support" - depends on ARM_CPU_TOPOLOGY - help - Improves the CPU scheduler's decision making when dealing with - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - config HAVE_ARM_SCU bool help diff --git a/arch/arm/include/asm/vdso/vsyscall.h b/arch/arm/include/asm/vdso/vsyscall.h index 4e7226ad02ec..ff1c729af05f 100644 --- a/arch/arm/include/asm/vdso/vsyscall.h +++ b/arch/arm/include/asm/vdso/vsyscall.h @@ -7,8 +7,6 @@ #include <vdso/datapage.h> #include <asm/cacheflush.h> -extern bool cntvct_ok; - static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 123f4a8ef446..2101938d27fc 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -7,6 +7,8 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS + #include <linux/compiler.h> #include <linux/sched.h> #include <linux/mm.h> diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 325448ffbba0..e38a30477f3d 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -54,11 +54,9 @@ struct elfinfo { char *dynstr; /* ptr to .dynstr section */ }; -/* Cached result of boot-time check for whether the arch timer exists, - * and if so, whether the virtual counter is useable. +/* Boot-time check for whether the arch timer exists, and if so, + * whether the virtual counter is usable. */ -bool cntvct_ok __ro_after_init; - static bool __init cntvct_functional(void) { struct device_node *np; @@ -159,7 +157,7 @@ static void __init patch_vdso(void *ehdr) * want programs to incur the slight additional overhead of * dispatching through the VDSO only to fall back to syscalls. 
*/ - if (!cntvct_ok) { + if (!cntvct_functional()) { vdso_nullpatch_one(&einfo, "__vdso_gettimeofday"); vdso_nullpatch_one(&einfo, "__vdso_clock_gettime"); vdso_nullpatch_one(&einfo, "__vdso_clock_gettime64"); @@ -197,8 +195,6 @@ static int __init vdso_init(void) vdso_total_pages = VDSO_NR_PAGES; /* for the data/vvar pages */ vdso_total_pages += text_pages; - cntvct_ok = cntvct_functional(); - patch_vdso(vdso_start); return 0; diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 5c1023a6d78c..7b27ee9482b3 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -926,9 +926,7 @@ config VDSO default y if ARM_ARCH_TIMER select HAVE_GENERIC_VDSO select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_32 select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE help Place in the process address space an ELF shared object providing fast implementations of gettimeofday and diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index 885e0c5e8c20..3d96fb41d624 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), true); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f828781aa840..b3e13f67d598 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -108,6 +108,9 @@ config ARM64 select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_CLUSTER + select ARCH_SUPPORTS_SCHED_MC select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT @@ -163,8 +166,6 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND select HAS_IOPORT select HAVE_MOVE_PMD @@ -1507,29 +1508,6 @@ config CPU_LITTLE_ENDIAN endchoice -config SCHED_MC - bool "Multi-core scheduler support" - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - -config SCHED_CLUSTER - bool "Cluster scheduler support" - help - Cluster scheduler support improves the CPU scheduler's decision - making when dealing with machines that have clusters of CPUs. - Cluster usually means a couple of CPUs which are placed closely - by sharing mid-level caches, last-level cache tags or internal - busses. - -config SCHED_SMT - bool "SMT scheduler support" - help - Improves the CPU scheduler's decision making when dealing with - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. 
- config NR_CPUS int "Maximum number of CPUs (2-4096)" range 2 4096 @@ -1770,7 +1748,6 @@ config COMPAT_VDSO bool "Enable vDSO for 32-bit applications" depends on !CPU_BIG_ENDIAN depends on (CC_IS_CLANG && LD_IS_LLD) || "$(CROSS_COMPILE_COMPAT)" != "" - select GENERIC_COMPAT_VDSO default y help Place in the process address space of 32-bit applications an diff --git a/arch/arm64/include/asm/vdso/compat_barrier.h b/arch/arm64/include/asm/vdso/compat_barrier.h index 3ac35f4a667c..6d75e03d3827 100644 --- a/arch/arm64/include/asm/vdso/compat_barrier.h +++ b/arch/arm64/include/asm/vdso/compat_barrier.h @@ -7,11 +7,10 @@ #ifndef __ASSEMBLY__ /* - * Warning: This code is meant to be used with - * ENABLE_COMPAT_VDSO only. + * Warning: This code is meant to be used from the compat vDSO only. */ -#ifndef ENABLE_COMPAT_VDSO -#error This header is meant to be used with ENABLE_COMPAT_VDSO only +#ifdef __aarch64__ +#error This header is meant to be used from the compat vDSO only #endif #ifdef dmb diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index d60ea7a72a9c..7d1a116549b1 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -2,8 +2,8 @@ /* * Copyright (C) 2018 ARM Limited */ -#ifndef __ASM_VDSO_GETTIMEOFDAY_H -#define __ASM_VDSO_GETTIMEOFDAY_H +#ifndef __ASM_VDSO_COMPAT_GETTIMEOFDAY_H +#define __ASM_VDSO_COMPAT_GETTIMEOFDAY_H #ifndef __ASSEMBLY__ @@ -163,4 +163,4 @@ static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) #endif /* !__ASSEMBLY__ */ -#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ +#endif /* __ASM_VDSO_COMPAT_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index da1ab8759592..c59e84105b43 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -5,6 +5,8 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H +#ifdef __aarch64__ + #ifndef __ASSEMBLY__ #include <asm/alternative.h> @@ -96,4 +98,10 @@ static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data( #endif /* !__ASSEMBLY__ */ +#else /* !__aarch64__ */ + +#include "compat_gettimeofday.h" + +#endif /* __aarch64__ */ + #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 30d4bbe68661..b6367ff3a49c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -6,6 +6,7 @@ * 2001-2002 Keith Owens * Copyright (C) 2012 ARM Ltd. */ +#define COMPILE_OFFSETS #include <linux/arm_sdei.h> #include <linux/sched.h> diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 5de4deaf4299..ffa3536581f6 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -57,7 +57,6 @@ VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING VDSO_CAFLAGS += -march=armv8-a VDSO_CFLAGS := $(VDSO_CAFLAGS) -VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c index d1e903579473..5525c8e7e1d9 100644 --- a/arch/csky/kernel/asm-offsets.c +++ b/arch/csky/kernel/asm-offsets.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
+#define COMPILE_OFFSETS #include <linux/sched.h> #include <linux/kernel_stat.h> diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c index 03a7063f9456..50eea9fa6f13 100644 --- a/arch/hexagon/kernel/asm-offsets.c +++ b/arch/hexagon/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. */ +#define COMPILE_OFFSETS #include <linux/compat.h> #include <linux/types.h> diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 0631a6b11281..e6225539579f 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -70,6 +70,8 @@ config LOONGARCH select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @@ -108,8 +110,6 @@ config LOONGARCH select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select GPIOLIB select HAS_IOPORT select HAVE_ARCH_AUDITSYSCALL @@ -140,6 +140,7 @@ config LOONGARCH select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN select HAVE_EXIT_THREAD + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FTRACE_GRAPH_FUNC select HAVE_FUNCTION_ARG_ACCESS_API @@ -452,23 +453,6 @@ config EFI_STUB This kernel feature allows the kernel to be loaded directly by EFI firmware without the use of a bootloader. -config SCHED_SMT - bool "SMT scheduler support" - depends on SMP - default y - help - Improves scheduler's performance when there are multiple - threads in one physical core. - -config SCHED_MC - bool "Multi-core scheduler support" - depends on SMP - default y - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. 
- config SMP bool "Multi-Processing support" help diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 9dfa2ef00816..4d7117fcdc78 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -65,50 +65,42 @@ register unsigned long current_stack_pointer __asm__("$sp"); * access * - pending work-to-be-done flags are in LSW * - other flags in MSW + * + * Tell the generic TIF infrastructure which special bits loongarch supports */ -#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 1 /* lazy rescheduling necessary */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NOTIFY_RESUME 3 /* callback before returning to user */ -#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ -#define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ -#define TIF_NOHZ 6 /* in adaptive nohz mode */ -#define TIF_UPROBE 7 /* breakpointed or singlestepping */ -#define TIF_USEDFPU 8 /* FPU was used by this task this quantum (SMP) */ -#define TIF_USEDSIMD 9 /* SIMD has been used this quantum */ -#define TIF_MEMDIE 10 /* is terminating due to OOM killer */ -#define TIF_FIXADE 11 /* Fix address errors in software */ -#define TIF_LOGADE 12 /* Log address errors to syslog */ -#define TIF_32BIT_REGS 13 /* 32-bit general purpose registers */ -#define TIF_32BIT_ADDR 14 /* 32-bit address space */ -#define TIF_LOAD_WATCH 15 /* If set, load watch registers */ -#define TIF_SINGLESTEP 16 /* Single Step */ -#define TIF_LSX_CTX_LIVE 17 /* LSX context must be preserved */ -#define TIF_LASX_CTX_LIVE 18 /* LASX context must be preserved */ -#define TIF_USEDLBT 19 /* LBT was used by this task this quantum (SMP) */ -#define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */ -#define TIF_PATCH_PENDING 21 /* pending live patching update */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include <asm-generic/thread_info_tif.h> + +/* Architecture specific bits */ +#define TIF_NOHZ 16 /* in adaptive nohz mode */ +#define TIF_USEDFPU 17 /* FPU was used by this task this quantum (SMP) */ +#define TIF_USEDSIMD 18 /* SIMD has been used this quantum */ +#define TIF_FIXADE 19 /* Fix address errors in software */ +#define TIF_LOGADE 20 /* Log address errors to syslog */ +#define TIF_32BIT_REGS 21 /* 32-bit general purpose registers */ +#define TIF_32BIT_ADDR 22 /* 32-bit address space */ +#define TIF_LOAD_WATCH 23 /* If set, load watch registers */ +#define TIF_SINGLESTEP 24 /* Single Step */ +#define TIF_LSX_CTX_LIVE 25 /* LSX context must be preserved */ +#define TIF_LASX_CTX_LIVE 26 /* LASX context must be preserved */ +#define TIF_USEDLBT 27 /* LBT was used by this task this quantum (SMP) */ +#define TIF_LBT_CTX_LIVE 28 /* LBT context must be preserved */ -#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) -#define _TIF_SIGPENDING (1<<TIF_SIGPENDING) -#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) -#define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL) -#define _TIF_NOHZ (1<<TIF_NOHZ) -#define _TIF_UPROBE (1<<TIF_UPROBE) -#define _TIF_USEDFPU (1<<TIF_USEDFPU) -#define _TIF_USEDSIMD (1<<TIF_USEDSIMD) -#define _TIF_FIXADE (1<<TIF_FIXADE) -#define _TIF_LOGADE (1<<TIF_LOGADE) -#define _TIF_32BIT_REGS (1<<TIF_32BIT_REGS) -#define _TIF_32BIT_ADDR (1<<TIF_32BIT_ADDR) -#define _TIF_LOAD_WATCH (1<<TIF_LOAD_WATCH) -#define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) -#define _TIF_LSX_CTX_LIVE (1<<TIF_LSX_CTX_LIVE) 
-#define _TIF_LASX_CTX_LIVE (1<<TIF_LASX_CTX_LIVE) -#define _TIF_USEDLBT (1<<TIF_USEDLBT) -#define _TIF_LBT_CTX_LIVE (1<<TIF_LBT_CTX_LIVE) -#define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING) +#define _TIF_NOHZ BIT(TIF_NOHZ) +#define _TIF_USEDFPU BIT(TIF_USEDFPU) +#define _TIF_USEDSIMD BIT(TIF_USEDSIMD) +#define _TIF_FIXADE BIT(TIF_FIXADE) +#define _TIF_LOGADE BIT(TIF_LOGADE) +#define _TIF_32BIT_REGS BIT(TIF_32BIT_REGS) +#define _TIF_32BIT_ADDR BIT(TIF_32BIT_ADDR) +#define _TIF_LOAD_WATCH BIT(TIF_LOAD_WATCH) +#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP) +#define _TIF_LSX_CTX_LIVE BIT(TIF_LSX_CTX_LIVE) +#define _TIF_LASX_CTX_LIVE BIT(TIF_LASX_CTX_LIVE) +#define _TIF_USEDLBT BIT(TIF_USEDLBT) +#define _TIF_LBT_CTX_LIVE BIT(TIF_LBT_CTX_LIVE) #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index db1e4bb26b6a..3017c7157600 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -4,6 +4,8 @@ * * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ +#define COMPILE_OFFSETS + #include <linux/types.h> #include <linux/sched.h> #include <linux/mm.h> diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index f3092f2de8b5..6fb92cc1a4c9 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -112,8 +112,6 @@ static int arch_timer_starting(unsigned int cpu) static int arch_timer_dying(unsigned int cpu) { - constant_set_state_shutdown(this_cpu_ptr(&constant_clockevent_device)); - /* Clear Timer Interrupt */ write_csr_tintclear(CSR_TINTCLR_TI); diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c index 906d73230537..67a1990f9d74 100644 --- a/arch/m68k/kernel/asm-offsets.c +++ b/arch/m68k/kernel/asm-offsets.c @@ -9,6 +9,7 @@ * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #define ASM_OFFSETS_C #include <linux/stddef.h> diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c index 104c3ac5f30c..b4b67d58e7f6 100644 --- a/arch/microblaze/kernel/asm-offsets.c +++ b/arch/microblaze/kernel/asm-offsets.c @@ -7,6 +7,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ +#define COMPILE_OFFSETS #include <linux/init.h> #include <linux/stddef.h> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index caf508f6e9ec..608e01ed6cff 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -51,7 +51,6 @@ config MIPS select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HAS_IOPORT if !NO_IOPORT_MAP || ISA select HAVE_ARCH_COMPILER_H @@ -2223,7 +2222,7 @@ config MIPS_MT_SMP select SMP select SMP_UP select SYS_SUPPORTS_SMP - select SYS_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_SMT select MIPS_PERF_SHARED_TC_COUNTERS help This is a kernel model which is known as SMVP. This is supported @@ -2235,18 +2234,6 @@ config MIPS_MT_SMP config MIPS_MT bool -config SCHED_SMT - bool "SMT (multithreading) scheduler support" - depends on SYS_SUPPORTS_SCHED_SMT - default n - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with MIPS MT enabled cores at a cost of slightly - increased overhead in some places. If unsure say N here. 
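The loongarch conversion above (and the riscv, s390 and x86 ones that follow via HAVE_GENERIC_TIF_BITS) moves the common thread-info flags into asm-generic/thread_info_tif.h: an architecture first advertises which optional generic bits it uses, includes the generic header, and then numbers its private bits from 16 upward, defining the masks with BIT(). A minimal sketch of a converted header, with placeholder names for the arch-private flags:

/* <asm/thread_info.h> after conversion -- minimal sketch */

/* Optional generic bits this architecture supports */
#define HAVE_TIF_NEED_RESCHED_LAZY
#define HAVE_TIF_RESTORE_SIGMASK

/* Common bits (TIF_NEED_RESCHED, TIF_SIGPENDING, ...) come from here */
#include <asm-generic/thread_info_tif.h>

/* Architecture-specific bits live above the generic range */
#define TIF_ARCH_FOO		16	/* placeholder arch-private flag */
#define TIF_ARCH_BAR		17	/* placeholder arch-private flag */

#define _TIF_ARCH_FOO		BIT(TIF_ARCH_FOO)
#define _TIF_ARCH_BAR		BIT(TIF_ARCH_BAR)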
- -config SYS_SUPPORTS_SCHED_SMT - bool - config SYS_SUPPORTS_MULTITHREADING bool @@ -2318,7 +2305,7 @@ config MIPS_CPS select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU - select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6 + select ARCH_SUPPORTS_SCHED_SMT if CPU_MIPSR6 select SYS_SUPPORTS_SMP select WEAK_ORDERING select GENERIC_IRQ_MIGRATION if HOTPLUG_CPU diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 1e29efcba46e..5debd9a3854a 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -9,6 +9,8 @@ * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com * Copyright (C) 2000 MIPS Technologies, Inc. */ +#define COMPILE_OFFSETS + #include <linux/compat.h> #include <linux/types.h> #include <linux/sched.h> diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c index e3d9b7b6fb48..88190b503ce5 100644 --- a/arch/nios2/kernel/asm-offsets.c +++ b/arch/nios2/kernel/asm-offsets.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2011 Tobias Klauser <tklauser@distanz.ch> */ +#define COMPILE_OFFSETS #include <linux/stddef.h> #include <linux/sched.h> diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c index 710651d5aaae..3cc826f2216b 100644 --- a/arch/openrisc/kernel/asm-offsets.c +++ b/arch/openrisc/kernel/asm-offsets.c @@ -18,6 +18,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include <linux/signal.h> #include <linux/sched.h> diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 2efa4b08b7b8..0940c162f1f7 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -44,6 +44,7 @@ config PARISC select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP && PA8X00 select GENERIC_CPU_DEVICES if !SMP select GENERIC_LIB_DEVMEM_IS_ALLOWED select SYSCTL_ARCH_UNALIGN_ALLOW @@ -319,14 +320,6 @@ config SMP If you don't know what to do here, say N. -config SCHED_MC - bool "Multi-core scheduler support" - depends on GENERIC_ARCH_TOPOLOGY && PA8X00 - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. 
- config IRQSTACKS bool "Use separate kernel stacks when processing interrupts" default y diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 757816a7bd4b..9abfe65492c6 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -13,6 +13,7 @@ * Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org> * Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org> */ +#define COMPILE_OFFSETS #include <linux/types.h> #include <linux/sched.h> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 325c1171894d..22173a897670 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -170,6 +170,9 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx + select ARCH_SUPPORTS_SCHED_MC if SMP + select ARCH_SUPPORTS_SCHED_SMT if PPC64 && SMP + select SCHED_MC if ARCH_SUPPORTS_SCHED_MC select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @@ -207,8 +210,6 @@ config PPC select GENERIC_PCI_IOMAP if PCI select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select HAS_IOPORT if PCI select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP @@ -965,14 +966,6 @@ config PPC_PROT_SAO_LPAR config PPC_COPRO_BASE bool -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on PPC64 && SMP - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with POWER5 cpus at a cost of slightly increased - overhead in some places. If unsure say N here. - config PPC_DENORMALISATION bool "PowerPC denormalisation exception handling" depends on PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index da15b5efe807..f19ca44512d1 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu) #ifdef CONFIG_SMP #include <asm/cputable.h> +struct cpumask *cpu_coregroup_mask(int cpu); + #ifdef CONFIG_PPC64 #include <asm/smp.h> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b3048f6d3822..a4bc80b30410 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include <linux/compat.h> #include <linux/signal.h> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f59e4b9cc207..68edb66c2964 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1028,19 +1028,19 @@ static int powerpc_shared_proc_flags(void) * We can't just pass cpu_l2_cache_mask() directly because * returns a non-const pointer and the compiler barfs on that. 
*/ -static const struct cpumask *shared_cache_mask(int cpu) +static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu) { return per_cpu(cpu_l2_cache_map, cpu); } #ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) +static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu) { return cpu_smallcore_mask(cpu); } #endif -static struct cpumask *cpu_coregroup_mask(int cpu) +struct cpumask *cpu_coregroup_mask(int cpu) { return per_cpu(cpu_coregroup_map, cpu); } @@ -1054,11 +1054,6 @@ static bool has_coregroup_support(void) return coregroup_enabled; } -static const struct cpumask *cpu_mc_mask(int cpu) -{ - return cpu_coregroup_mask(cpu); -} - static int __init init_big_cores(void) { int cpu; @@ -1448,7 +1443,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) return false; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update l2-cache mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); @@ -1538,7 +1533,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask) return; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update coregroup mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); @@ -1601,7 +1596,7 @@ static void add_cpu_to_masks(int cpu) /* If chip_id is -1; limit the cpu_core_mask to within PKG */ if (chip_id == -1) - cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + cpumask_and(mask, mask, cpu_node_mask(cpu)); for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { @@ -1701,22 +1696,22 @@ static void __init build_sched_topology(void) if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); powerpc_topology[i++] = - SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); + SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); + powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { powerpc_topology[i++] = - SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); + SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE); } if (has_coregroup_support()) { powerpc_topology[i++] = - SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); + SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); + powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. 
*/ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 28d00744cbb5..a9ef4eeb72ab 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,7 +53,7 @@ config RISCV select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN - select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE + select ARCH_HAS_VDSO_ARCH_DATA if HAVE_GENERIC_VDSO select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK if ACPI select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU @@ -74,6 +74,7 @@ config RISCV select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_SUPPORTS_RT select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK + select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS @@ -109,7 +110,7 @@ config RISCV select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_ENTRY - select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO + select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO && 64BIT select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP if MMU select GENERIC_IRQ_IPI if SMP @@ -122,9 +123,7 @@ config RISCV select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD - select GENERIC_TIME_VSYSCALL if MMU && 64BIT - select GENERIC_VDSO_DATA_STORE if MMU - select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO + select GENERIC_TIME_VSYSCALL if GENERIC_GETTIMEOFDAY select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU select HAVE_ALIGNED_STRUCT_PAGE @@ -163,11 +162,12 @@ config RISCV select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE select HAVE_EBPF_JIT if MMU + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST if MMU select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_GCC_PLUGINS - select HAVE_GENERIC_VDSO if MMU && 64BIT + select HAVE_GENERIC_VDSO if MMU select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 if !XIP_KERNEL && !EFI_ZBOOT select HAVE_KERNEL_GZIP if !XIP_KERNEL && !EFI_ZBOOT @@ -223,7 +223,7 @@ config RISCV select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU - select VDSO_GETRANDOM if HAVE_GENERIC_VDSO + select VDSO_GETRANDOM if HAVE_GENERIC_VDSO && 64BIT select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT @@ -455,14 +455,6 @@ config SMP If you don't know what to do here, say N. -config SCHED_MC - bool "Multi-core scheduler support" - depends on SMP - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. 
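With this series the cpumask callbacks handed to SDTL_INIT() take the topology level itself as their first argument (see the powerpc tl_cache_mask()/tl_smallcore_smt_mask() conversions above and the s390 tl_book_mask()/tl_drawer_mask() ones further down), and the common SMT/MC/PKG levels use the shared tl_smt_mask/tl_mc_mask/tl_pkg_mask helpers. A sketch of how an architecture-private level fits the new signature; the per-CPU map name is a made-up placeholder:

/* Sketch of an arch-private topology level under the new callback signature */
static const struct cpumask *
tl_mylevel_mask(struct sched_domain_topology_level *tl, int cpu)
{
	return per_cpu(cpu_mylevel_map, cpu);	/* placeholder per-CPU cpumask */
}

static struct sched_domain_topology_level my_topology[] = {
	SDTL_INIT(tl_smt_mask,     cpu_smt_flags,  SMT),	/* shared helper */
	SDTL_INIT(tl_mylevel_mask, cpu_core_flags, MC),		/* arch-specific */
	SDTL_INIT(tl_pkg_mask,     NULL,           PKG),	/* shared helper */
	{ NULL, },
};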
- config NR_CPUS int "Maximum number of CPUs (2-512)" depends on SMP diff --git a/arch/riscv/boot/dts/sophgo/sg2042.dtsi b/arch/riscv/boot/dts/sophgo/sg2042.dtsi index b3e4d3c18fdc..6430c6e25c00 100644 --- a/arch/riscv/boot/dts/sophgo/sg2042.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2042.dtsi @@ -190,7 +190,7 @@ reg-names = "clr", "doorbell"; msi-controller; #msi-cells = <0>; - msi-ranges = <&intc 64 IRQ_TYPE_LEVEL_HIGH 32>; + msi-ranges = <&intc 64 IRQ_TYPE_EDGE_RISING 32>; }; rpgate: clock-controller@7030010368 { diff --git a/arch/riscv/boot/dts/sophgo/sg2044.dtsi b/arch/riscv/boot/dts/sophgo/sg2044.dtsi index 6ec955744b0c..320c4d1d08e6 100644 --- a/arch/riscv/boot/dts/sophgo/sg2044.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2044.dtsi @@ -214,7 +214,7 @@ reg-names = "clr", "doorbell"; #msi-cells = <0>; msi-controller; - msi-ranges = <&intc 352 IRQ_TYPE_LEVEL_HIGH 512>; + msi-ranges = <&intc 352 IRQ_TYPE_EDGE_RISING 512>; status = "disabled"; }; diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index c33d8b7dd488..836d80dd2921 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -107,23 +107,18 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); * - pending work-to-be-done flags are in lowest half-word * - other flags in upper half-word(s) */ -#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 1 /* Lazy rescheduling needed */ -#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ -#define TIF_SIGPENDING 3 /* signal pending */ -#define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ -#define TIF_MEMDIE 5 /* is terminating due to OOM killer */ -#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ -#define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */ -#define TIF_32BIT 11 /* compat-mode 32bit process */ -#define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ - -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE) + +/* + * Tell the generic TIF infrastructure which bits riscv supports + */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include <asm-generic/thread_info_tif.h> + +#define TIF_32BIT 16 /* compat-mode 32bit process */ +#define TIF_RISCV_V_DEFER_RESTORE 17 /* restore Vector before returing to user */ + +#define _TIF_RISCV_V_DEFER_RESTORE BIT(TIF_RISCV_V_DEFER_RESTORE) #endif /* _ASM_RISCV_THREAD_INFO_H */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 6e8c0d6feae9..7d42d3b8a32a 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -3,6 +3,7 @@ * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2017 SiFive */ +#define COMPILE_OFFSETS #include <linux/kbuild.h> #include <linux/mm.h> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 22862ce7ec68..2414ee3ff002 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -174,8 +174,6 @@ config S390 select GENERIC_GETTIMEOFDAY select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select 
HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL @@ -206,6 +204,7 @@ config S390 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FENTRY select HAVE_FTRACE_GRAPH_FUNC @@ -554,15 +553,11 @@ config NODES_SHIFT depends on NUMA default "1" -config SCHED_SMT - def_bool n - -config SCHED_MC - def_bool n - config SCHED_TOPOLOGY def_bool y prompt "Topology scheduler support" + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_MC select SCHED_SMT select SCHED_MC help diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index f6ed2c8192c8..7878e9bfbf07 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -56,49 +56,31 @@ void arch_setup_new_exec(void); /* * thread information flags bit numbers + * + * Tell the generic TIF infrastructure which special bits s390 supports */ -#define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ -#define TIF_SIGPENDING 1 /* signal pending */ -#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ -#define TIF_UPROBE 4 /* breakpointed or single-stepping */ -#define TIF_PATCH_PENDING 5 /* pending live patching update */ -#define TIF_ASCE_PRIMARY 6 /* primary asce is kernel asce */ -#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ -#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ -#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ -#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ -#define TIF_31BIT 16 /* 32bit process */ -#define TIF_MEMDIE 17 /* is terminating due to OOM killer */ -#define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ -#define TIF_SINGLE_STEP 19 /* This task is single stepped */ -#define TIF_BLOCK_STEP 20 /* This task is block stepped */ -#define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ -#define TIF_SYSCALL_TRACE 24 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ -#define TIF_SECCOMP 26 /* secure computing */ -#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include <asm-generic/thread_info_tif.h> + +/* Architecture specific bits */ +#define TIF_ASCE_PRIMARY 16 /* primary asce is kernel asce */ +#define TIF_GUARDED_STORAGE 17 /* load guarded storage control block */ +#define TIF_ISOLATE_BP_GUEST 18 /* Run KVM guests with isolated BP */ +#define TIF_PER_TRAP 19 /* Need to handle PER trap on exit to usermode */ +#define TIF_31BIT 20 /* 32bit process */ +#define TIF_SINGLE_STEP 21 /* This task is single stepped */ +#define TIF_BLOCK_STEP 22 /* This task is block stepped */ +#define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */ -#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) -#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) -#define _TIF_UPROBE BIT(TIF_UPROBE) -#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY) -#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) #define _TIF_31BIT 
BIT(TIF_31BIT) -#define _TIF_MEMDIE BIT(TIF_MEMDIE) -#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) #define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) #define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) -#define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) -#define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP BIT(TIF_SECCOMP) -#define _TIF_SYSCALL_TRACEPOINT BIT(TIF_SYSCALL_TRACEPOINT) #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 95ecad9c7d7d..a8915663e917 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include <linux/kbuild.h> #include <linux/sched.h> diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 46569b8e47dd..1594c80e9bc4 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -509,33 +509,27 @@ int topology_cpu_init(struct cpu *cpu) return rc; } -static const struct cpumask *cpu_thread_mask(int cpu) -{ - return &cpu_topology[cpu].thread_mask; -} - - const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_mask; } -static const struct cpumask *cpu_book_mask(int cpu) +static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].book_mask; } -static const struct cpumask *cpu_drawer_mask(int cpu) +static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].drawer_mask; } static struct sched_domain_topology_level s390_topology[] = { - SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), - SDTL_INIT(cpu_book_mask, NULL, BOOK), - SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), + SDTL_INIT(tl_book_mask, NULL, BOOK), + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c index a0322e832845..429b6a763146 100644 --- a/arch/sh/kernel/asm-offsets.c +++ b/arch/sh/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include <linux/stddef.h> #include <linux/types.h> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 7b595092cbfb..a630d373e645 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -110,6 +110,8 @@ config SPARC64 select HAVE_SETUP_PER_CPU_AREA select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK + select ARCH_SUPPORTS_SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP config ARCH_PROC_KCORE_TEXT def_bool y @@ -288,24 +290,6 @@ if SPARC64 || COMPILE_TEST source "kernel/power/Kconfig" endif -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on SPARC64 && SMP - default y - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with SPARC cpus at a cost of slightly increased overhead - in some places. If unsure say N here. 
- -config SCHED_MC - bool "Multi-core scheduler support" - depends on SPARC64 && SMP - default y - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config CMDLINE_BOOL bool "Default bootloader kernel arguments" depends on SPARC64 diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c index 3d9b9855dce9..6e660bde48dd 100644 --- a/arch/sparc/kernel/asm-offsets.c +++ b/arch/sparc/kernel/asm-offsets.c @@ -10,6 +10,7 @@ * * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. */ +#define COMPILE_OFFSETS #include <linux/sched.h> #include <linux/mm_types.h> diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c index 1fb12235ab9c..a69873aa697f 100644 --- a/arch/um/kernel/asm-offsets.c +++ b/arch/um/kernel/asm-offsets.c @@ -1 +1,3 @@ +#define COMPILE_OFFSETS + #include <sysdep/kernel-offsets.h> diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index f7fb3d88c57b..36b985d0e7bf 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -3,6 +3,8 @@ # Branch profiling isn't noinstr-safe. Disable it for arch/x86/* subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING +obj-y += boot/startup/ + obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/ obj-y += entry/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 052a5f811203..75f3de70df51 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,7 +14,6 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS - select GENERIC_VDSO_32 select HAVE_DEBUG_STACKOVERFLOW select KMAP_LOCAL select MODULES_USE_ELF_REL @@ -182,8 +181,6 @@ config X86 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select GENERIC_VDSO_OVERFLOW_PROTECT select GUP_GET_PXX_LOW_HIGH if X86_PAE select HARDIRQS_SW_RESEND @@ -239,6 +236,7 @@ config X86 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA if X86_32 select HAVE_EXIT_THREAD + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER @@ -330,6 +328,10 @@ config X86 imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE select ARCH_SUPPORTS_PT_RECLAIM if X86_64 + select ARCH_SUPPORTS_SCHED_SMT if SMP + select SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_CLUSTER if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP config INSTRUCTION_DECODER def_bool y @@ -483,6 +485,19 @@ config X86_X2APIC If in doubt, say Y. +config AMD_SECURE_AVIC + bool "AMD Secure AVIC" + depends on AMD_MEM_ENCRYPT && X86_X2APIC + help + Enable this to get AMD Secure AVIC support on guests that have this feature. + + AMD Secure AVIC provides hardware acceleration for performance sensitive + APIC accesses and support for managing guest owned APIC state for SEV-SNP + guests. Secure AVIC does not support xAPIC mode. It has functional + dependency on x2apic being enabled in the guest. + + If you don't know what to do here, say N. + config X86_POSTED_MSI bool "Enable MSI and MSI-x delivery by posted interrupts" depends on X86_64 && IRQ_REMAP @@ -879,6 +894,15 @@ config ACRN_GUEST IOT with small footprint and real-time features. More details can be found in https://projectacrn.org/. 
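The new AMD_SECURE_AVIC option above is wired into the boot-time SNP feature negotiation: the arch/x86/boot/compressed/sev.c hunk later in this diff only advertises the Secure AVIC bit as implemented by the guest when the option is enabled. Condensed from that hunk, the gating pattern is:

#ifdef CONFIG_AMD_SECURE_AVIC
#define SNP_FEATURE_SECURE_AVIC		MSR_AMD64_SNP_SECURE_AVIC
#else
#define SNP_FEATURE_SECURE_AVIC		0
#endif

/* SNP features the guest kernel implements and may therefore accept */
#define SNP_FEATURES_PRESENT	(MSR_AMD64_SNP_DEBUG_SWAP |	\
				 MSR_AMD64_SNP_SECURE_TSC |	\
				 SNP_FEATURE_SECURE_AVIC)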
+config BHYVE_GUEST + bool "Bhyve (BSD Hypervisor) Guest support" + depends on X86_64 + help + This option allows Linux to recognise when it is running as a + guest in the Bhyve hypervisor, and to support more than 255 vCPUs + when doing so. More details about Bhyve can be found at https://bhyve.org + and https://wiki.freebsd.org/bhyve/. + config INTEL_TDX_GUEST bool "Intel TDX (Trust Domain Extensions) - Guest Support" depends on X86_64 && CPU_SUP_INTEL @@ -1031,29 +1055,6 @@ config NR_CPUS This is purely to save memory: each supported CPU adds about 8KB to the kernel image. -config SCHED_CLUSTER - bool "Cluster scheduler support" - depends on SMP - default y - help - Cluster scheduler support improves the CPU scheduler's decision - making when dealing with machines that have clusters of CPUs. - Cluster usually means a couple of CPUs which are placed closely - by sharing mid-level caches, last-level cache tags or internal - busses. - -config SCHED_SMT - def_bool y if SMP - -config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" - depends on SMP - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config SCHED_MC_PRIO bool "CPU core priorities scheduler support" depends on SCHED_MC @@ -1340,7 +1341,7 @@ config MICROCODE_LATE_LOADING use this at your own risk. Late loading taints the kernel unless the microcode header indicates that it is safe for late loading via the minimal revision check. This minimal revision check can be enforced on - the kernel command line with "microcode.minrev=Y". + the kernel command line with "microcode=force_minrev". config MICROCODE_LATE_FORCE_MINREV bool "Enforce late microcode loading minimal revision check" @@ -1356,10 +1357,22 @@ config MICROCODE_LATE_FORCE_MINREV revision check fails. This minimal revision check can also be controlled via the - "microcode.minrev" parameter on the kernel command line. + "microcode=force_minrev" parameter on the kernel command line. If unsure say Y. +config MICROCODE_DBG + bool "Enable microcode loader debugging" + default n + depends on MICROCODE + help + Enable code which allows for debugging the microcode loader in + a guest. The patch loading is simulated, but everything else + related to patch parsing and handling is done as on baremetal, with + the purpose of debugging solely the software side of things. + + You almost certainly want to say n here.
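The help text above now points at "microcode=force_minrev" rather than the old "microcode.minrev=Y" module-style parameter. As a purely hypothetical illustration of how a comma-separated "microcode=" option of this kind can be consumed — this is not the loader's actual implementation, and only the force_minrev token is taken from the text above:

#include <linux/init.h>
#include <linux/string.h>
#include <linux/types.h>

static bool force_minrev;

static int __init parse_microcode_param(char *str)
{
	char *opt;

	/* Accept a comma-separated list; only force_minrev is shown here */
	while ((opt = strsep(&str, ",")) != NULL) {
		if (!strcmp(opt, "force_minrev"))
			force_minrev = true;
	}
	return 0;
}
early_param("microcode", parse_microcode_param);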
+ config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" help @@ -1753,11 +1766,7 @@ config X86_UMIP config CC_HAS_IBT # GCC >= 9 and binutils >= 2.29 # Retpoline check to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93654 - # Clang/LLVM >= 14 - # https://github.com/llvm/llvm-project/commit/e0b89df2e0f0130881bf6c39bf31d7f6aac00e0f - # https://github.com/llvm/llvm-project/commit/dfcf69770bc522b9e411c66454934a37c1f35332 - def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || \ - (CC_IS_CLANG && CLANG_VERSION >= 140000)) && \ + def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || CC_IS_CLANG) && \ $(as-instr,endbr64) config X86_CET @@ -1769,8 +1778,6 @@ config X86_KERNEL_IBT prompt "Indirect Branch Tracking" def_bool y depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL - # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f - depends on !LD_IS_LLD || LLD_VERSION >= 140000 select OBJTOOL select X86_CET help diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1913d342969b..4db7e4bf69f5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -13,8 +13,8 @@ else endif ifdef CONFIG_CC_IS_GCC -RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) -RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register) +RETPOLINE_CFLAGS := -mindirect-branch=thunk-extern -mindirect-branch-register +RETPOLINE_VDSO_CFLAGS := -mindirect-branch=thunk-inline -mindirect-branch-register endif ifdef CONFIG_CC_IS_CLANG RETPOLINE_CFLAGS := -mretpoline-external-thunk @@ -37,10 +37,11 @@ export RETPOLINE_VDSO_CFLAGS # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. -ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) +ifdef CONFIG_CC_IS_GCC cc_stack_align4 := -mpreferred-stack-boundary=2 cc_stack_align8 := -mpreferred-stack-boundary=3 -else ifneq ($(call cc-option, -mstack-alignment=16),) +endif +ifdef CONFIG_CC_IS_CLANG cc_stack_align4 := -mstack-alignment=4 cc_stack_align8 := -mstack-alignment=8 endif @@ -83,19 +84,7 @@ KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-av # CC_FLAGS_FPU := -msse -msse2 ifdef CONFIG_CC_IS_GCC -# Stack alignment mismatch, proceed with caution. -# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3 -# (8B stack alignment). -# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 -# -# The "-msse" in the first argument is there so that the -# -mpreferred-stack-boundary=3 build error: -# -# -mpreferred-stack-boundary=3 is not between 4 and 12 -# -# can be triggered. Otherwise gcc doesn't complain. CC_FLAGS_FPU += -mhard-float -CC_FLAGS_FPU += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4) endif ifeq ($(CONFIG_X86_KERNEL_IBT),y) @@ -159,7 +148,7 @@ else # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += -mno-80387 - KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) + KBUILD_CFLAGS += -mno-fp-ret-in-387 # By default gcc and clang use a stack alignment of 16 bytes for x86. # However the standard kernel entry on x86-64 leaves the stack on an @@ -171,7 +160,7 @@ else KBUILD_CFLAGS += $(cc_stack_align8) # Use -mskip-rax-setup if supported. 
- KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + KBUILD_CFLAGS += -mskip-rax-setup ifdef CONFIG_X86_NATIVE_CPU KBUILD_CFLAGS += -march=native @@ -286,7 +275,6 @@ archprepare: $(cpufeaturemasks.hdr) ### # Kernel objects -core-y += arch/x86/boot/startup/ libs-y += arch/x86/lib/ # drivers-y are linked after core-y diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a38fdcdb9bd..74657589264d 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 94b5991da001..0f41ca0e52c0 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -332,6 +332,8 @@ static size_t parse_elf(void *output) } const unsigned long kernel_text_size = VO___start_rodata - VO__text; +const unsigned long kernel_inittext_offset = VO__sinittext - VO__text; +const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext; const unsigned long kernel_total_size = VO__end - VO__text; static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c index 89dd02de2a0f..7530ad8b768b 100644 --- a/arch/x86/boot/compressed/sev-handle-vc.c +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include "error.h" #include "sev.h" #include <linux/kernel.h> @@ -14,6 +15,8 @@ #include <asm/fpu/xcr.h> #define __BOOT_COMPRESSED +#undef __init +#define __init /* Basic instruction decoding support needed */ #include "../../lib/inat.c" diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea22..6e5c32a53d03 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -32,102 +32,47 @@ struct ghcb *boot_ghcb; #undef __init #define __init -#undef __head -#define __head - #define __BOOT_COMPRESSED -extern struct svsm_ca *boot_svsm_caa; -extern u64 boot_svsm_caa_pa; - -struct svsm_ca *svsm_get_caa(void) -{ - return boot_svsm_caa; -} - -u64 svsm_get_caa_pa(void) -{ - return boot_svsm_caa_pa; -} - -int svsm_perform_call_protocol(struct svsm_call *call); - u8 snp_vmpl; +u16 ghcb_version; + +u64 boot_svsm_caa_pa; /* Include code for early handlers */ #include "../../boot/startup/sev-shared.c" -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb *ghcb; - int ret; - - if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - return ret; -} - static bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } -static void __page_state_change(unsigned long paddr, enum psc_op op) -{ - u64 val, msr; - - /* - * If private -> shared then invalidate the page before requesting the - * state change in the RMP table. 
- */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(paddr, paddr, false); - - /* Save the current GHCB MSR value */ - msr = sev_es_rd_ghcb_msr(); - - /* Issue VMGEXIT to change the page state in RMP table. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - /* Read the response of the VMGEXIT. */ - val = sev_es_rd_ghcb_msr(); - if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - /* Restore the GHCB MSR value */ - sev_es_wr_ghcb_msr(msr); - - /* - * Now that page state is changed in the RMP table, validate it so that it is - * consistent with the RMP entry. - */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(paddr, paddr, true); -} - void snp_set_page_private(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); + __page_state_change(paddr, paddr, &d); } void snp_set_page_shared(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_SHARED); + __page_state_change(paddr, paddr, &d); } bool early_setup_ghcb(void) @@ -152,8 +97,14 @@ bool early_setup_ghcb(void) void snp_accept_memory(phys_addr_t start, phys_addr_t end) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) - __page_state_change(pa, SNP_PAGE_STATE_PRIVATE); + __page_state_change(pa, pa, &d); } void sev_es_shutdown_ghcb(void) @@ -235,15 +186,23 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) +#ifdef CONFIG_AMD_SECURE_AVIC +#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC +#else +#define SNP_FEATURE_SECURE_AVIC 0 +#endif + /* * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented * by the guest kernel. As and when a new feature is implemented in the * guest kernel, a corresponding bit should be added to the mask. */ #define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \ - MSR_AMD64_SNP_SECURE_TSC) + MSR_AMD64_SNP_SECURE_TSC | \ + SNP_FEATURE_SECURE_AVIC) u64 snp_get_unsupported_features(u64 status) { @@ -347,7 +306,7 @@ static bool early_snp_init(struct boot_params *bp) * running at VMPL0. The CA will be used to communicate with the * SVSM and request its services. */ - svsm_setup_ca(cc_info); + svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page)); /* * Pass run-time kernel a pointer to CC info via boot_params so EFI @@ -391,6 +350,8 @@ static int sev_check_cpu_support(void) if (!(eax & BIT(1))) return -ENODEV; + sev_snp_needs_sfw = !(ebx & BIT(31)); + return ebx & 0x3f; } @@ -453,30 +414,16 @@ void sev_enable(struct boot_params *bp) */ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { u64 hv_features; - int ret; hv_features = get_hv_features(); if (!(hv_features & GHCB_HV_FT_SNP)) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* - * Enforce running at VMPL0 or with an SVSM. - * - * Use RMPADJUST (see the rmpadjust() function for a description of - * what the instruction does) to update the VMPL1 permissions of a - * page. 
If the guest is running at VMPL0, this will succeed. If the - * guest is running at any other VMPL, this will fail. Linux SNP guests - * only ever run at a single VMPL level so permission mask changes of a - * lesser-privileged VMPL are a don't-care. - */ - ret = rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1); - - /* - * Running at VMPL0 is not required if an SVSM is present and the hypervisor - * supports the required SVSM GHCB events. + * Running at VMPL0 is required unless an SVSM is present and + * the hypervisor supports the required SVSM GHCB events. */ - if (ret && - !(snp_vmpl && (hv_features & GHCB_HV_FT_SNP_MULTI_VMPL))) + if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); } @@ -550,7 +497,6 @@ bool early_is_sevsnp_guest(void) /* Obtain the address of the calling area to use */ boot_rdmsr(MSR_SVSM_CAA, &m); - boot_svsm_caa = (void *)m.q; boot_svsm_caa_pa = m.q; /* diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 63e037e94e4c..916bac09b464 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -106,18 +106,5 @@ void get_cpuflags(void) cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6], &cpu.flags[1]); } - - if (max_amd_level >= 0x8000001f) { - u32 ebx; - - /* - * The X86_FEATURE_COHERENCY_SFW_NO feature bit is in - * the virtualization flags entry (word 8) and set by - * scattered.c, so the bit needs to be explicitly set. - */ - cpuid(0x8000001f, &ignored, &ebx, &ignored, &ignored); - if (ebx & BIT(31)) - set_bit(X86_FEATURE_COHERENCY_SFW_NO, cpu.flags); - } } } diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index b514f7e81332..e8fdf020b422 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -4,6 +4,7 @@ KBUILD_AFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \ -Os -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ -fno-stack-protector -D__NO_FORTIFY \ -fno-jump-tables \ -include $(srctree)/include/linux/hidden.h @@ -19,6 +20,7 @@ KCOV_INSTRUMENT := n obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o +pi-objs := $(patsubst %.o,$(obj)/%.o,$(obj-y)) lib-$(CONFIG_X86_64) += la57toggle.o lib-$(CONFIG_EFI_MIXED) += efi-mixed.o @@ -28,3 +30,23 @@ lib-$(CONFIG_EFI_MIXED) += efi-mixed.o # to be linked into the decompressor or the EFI stub but not vmlinux # $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y + +# +# Invoke objtool for each object individually to check for absolute +# relocations, even if other objtool actions are being deferred. +# +$(pi-objs): objtool-enabled = 1 +$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs + +# +# Confine the startup code by prefixing all symbols with __pi_ (for position +# independent). This ensures that startup code can only call other startup +# code, or code that has explicitly been made accessible to it via a symbol +# alias. 
+# +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +targets += $(obj-y) +obj-y := $(patsubst %.o,%.pi.o,$(obj-y)) diff --git a/arch/x86/boot/startup/exports.h b/arch/x86/boot/startup/exports.h new file mode 100644 index 000000000000..01d2363dc445 --- /dev/null +++ b/arch/x86/boot/startup/exports.h @@ -0,0 +1,14 @@ + +/* + * The symbols below are functions that are implemented by the startup code, + * but called at runtime by the SEV code residing in the core kernel. + */ +PROVIDE(early_set_pages_state = __pi_early_set_pages_state); +PROVIDE(early_snp_set_memory_private = __pi_early_snp_set_memory_private); +PROVIDE(early_snp_set_memory_shared = __pi_early_snp_set_memory_shared); +PROVIDE(get_hv_features = __pi_get_hv_features); +PROVIDE(sev_es_terminate = __pi_sev_es_terminate); +PROVIDE(snp_cpuid = __pi_snp_cpuid); +PROVIDE(snp_cpuid_get_table = __pi_snp_cpuid_get_table); +PROVIDE(svsm_issue_call = __pi_svsm_issue_call); +PROVIDE(svsm_process_result_codes = __pi_svsm_process_result_codes); diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c index a3112a69b06a..d16102abdaec 100644 --- a/arch/x86/boot/startup/gdt_idt.c +++ b/arch/x86/boot/startup/gdt_idt.c @@ -24,7 +24,7 @@ static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; /* This may run while still in the direct mapping */ -void __head startup_64_load_idt(void *vc_handler) +void startup_64_load_idt(void *vc_handler) { struct desc_ptr desc = { .address = (unsigned long)rip_rel_ptr(bringup_idt_table), @@ -46,7 +46,7 @@ void __head startup_64_load_idt(void *vc_handler) /* * Setup boot CPU state needed before kernel switches to virtual addresses. */ -void __head startup_64_setup_gdt_idt(void) +void __init startup_64_setup_gdt_idt(void) { struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page); void *handler = NULL; diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c index 332dbe6688c4..83ba98d61572 100644 --- a/arch/x86/boot/startup/map_kernel.c +++ b/arch/x86/boot/startup/map_kernel.c @@ -30,7 +30,7 @@ static inline bool check_la57_support(void) return true; } -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, +static unsigned long __init sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd, unsigned long p2v_offset) { @@ -84,7 +84,7 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, * the 1:1 mapping of memory. Kernel virtual addresses can be determined by * subtracting p2v_offset from the RIP-relative address. */ -unsigned long __head __startup_64(unsigned long p2v_offset, +unsigned long __init __startup_64(unsigned long p2v_offset, struct boot_params *bp) { pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts); diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index a34cd19796f9..4e22ffd73516 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -12,35 +12,12 @@ #include <asm/setup_data.h> #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) -#undef vc_forward_exception -#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n") #endif -/* - * SVSM related information: - * During boot, the page tables are set up as identity mapped and later - * changed to use kernel virtual addresses. 
Maintain separate virtual and - * physical addresses for the CAA to allow SVSM functions to be used during - * early boot, both with identity mapped virtual addresses and proper kernel - * virtual addresses. - */ -struct svsm_ca *boot_svsm_caa __ro_after_init; -u64 boot_svsm_caa_pa __ro_after_init; - -/* - * Since feature negotiation related variables are set early in the boot - * process they must reside in the .data section so as not to be zeroed - * out when the .bss section is later cleared. - * - * GHCB protocol version negotiated with the hypervisor. - */ -static u16 ghcb_version __ro_after_init; - /* Copy of the SNP firmware's CPUID page. */ static struct snp_cpuid_table cpuid_table_copy __ro_after_init; @@ -54,17 +31,9 @@ static u32 cpuid_std_range_max __ro_after_init; static u32 cpuid_hyp_range_max __ro_after_init; static u32 cpuid_ext_range_max __ro_after_init; -bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } +bool sev_snp_needs_sfw; - return true; -} - -void __head __noreturn +void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; @@ -83,7 +52,7 @@ sev_es_terminate(unsigned int set, unsigned int reason) /* * The hypervisor features are available from GHCB version 2 onward. */ -u64 get_hv_features(void) +u64 __init get_hv_features(void) { u64 val; @@ -100,72 +69,7 @@ u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -void snp_register_ghcb_early(unsigned long paddr) -{ - unsigned long pfn = paddr >> PAGE_SHIFT; - u64 val; - - sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - /* If the response GPA is not ours then abort the guest */ - if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || - (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); -} - -bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) - return false; - - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) - return false; - - ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); - - return true; -} - -static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - u32 ret; - - ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); - if (!ret) - return ES_OK; - - if (ret == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. */ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - - return ES_EXCEPTION; - } - } - - return ES_VMM_ERROR; -} - -static inline int svsm_process_result_codes(struct svsm_call *call) +int svsm_process_result_codes(struct svsm_call *call) { switch (call->rax_out) { case SVSM_SUCCESS: @@ -193,7 +97,7 @@ static inline int svsm_process_result_codes(struct svsm_call *call) * - RAX specifies the SVSM protocol/callid as input and the return code * as output. 
*/ -static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) +void svsm_issue_call(struct svsm_call *call, u8 *pending) { register unsigned long rax asm("rax") = call->rax; register unsigned long rcx asm("rcx") = call->rcx; @@ -216,7 +120,7 @@ static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) call->r9_out = r9; } -static int svsm_perform_msr_protocol(struct svsm_call *call) +int svsm_perform_msr_protocol(struct svsm_call *call) { u8 pending = 0; u64 val, resp; @@ -247,63 +151,6 @@ static int svsm_perform_msr_protocol(struct svsm_call *call) return svsm_process_result_codes(call); } -static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) -{ - struct es_em_ctxt ctxt; - u8 pending = 0; - - vc_ghcb_invalidate(ghcb); - - /* - * Fill in protocol and format specifiers. This can be called very early - * in the boot, so use rip-relative references as needed. - */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - - svsm_issue_call(call, &pending); - - if (pending) - return -EINVAL; - - switch (verify_exception_info(ghcb, &ctxt)) { - case ES_OK: - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - fallthrough; - default: - return -EINVAL; - } - - return svsm_process_result_codes(call); -} - -enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - return verify_exception_info(ghcb, ctxt); -} - static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) { u64 val; @@ -342,44 +189,7 @@ static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) return ret; } -static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - u32 cr4 = native_read_cr4(); - int ret; - - ghcb_set_rax(ghcb, leaf->fn); - ghcb_set_rcx(ghcb, leaf->subfn); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #UD - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - leaf->eax = ghcb->save.rax; - leaf->ebx = ghcb->save.rbx; - leaf->ecx = ghcb->save.rcx; - leaf->edx = ghcb->save.rdx; - - return ES_OK; -} - -static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) - : __sev_cpuid_hv_msr(leaf); -} /* * This may be called early while still running on the initial identity @@ -412,7 +222,7 @@ const struct snp_cpuid_table *snp_cpuid_get_table(void) * * Return: XSAVE area size on success, 0 otherwise. 
*/ -static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); u64 xfeatures_found = 0; @@ -448,7 +258,7 @@ static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) return xsave_size; } -static bool __head +static bool snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -484,21 +294,21 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) return false; } -static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf) { - if (sev_cpuid_hv(ghcb, ctxt, leaf)) + if (__sev_cpuid_hv_msr(leaf)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int __head -snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) +static int +snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; switch (leaf->fn) { case 0x1: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* initial APIC ID */ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); @@ -517,7 +327,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, break; case 0xB: leaf_hv.subfn = 0; - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->edx = leaf_hv.edx; @@ -565,7 +375,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, } break; case 0x8000001E: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->eax = leaf_hv.eax; @@ -586,8 +396,8 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -int __head -snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -621,7 +431,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) return 0; } - return snp_cpuid_postprocess(ghcb, ctxt, leaf); + return snp_cpuid_postprocess(cpuid_fn, ctx, leaf); } /* @@ -629,7 +439,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) * page yet, so it only supports the MSR based communication with the * hypervisor and only the CPUID exit-code. */ -void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); @@ -648,13 +458,24 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; - ret = snp_cpuid(NULL, NULL, &leaf); + /* + * If SNP is active, then snp_cpuid() uses the CPUID table to obtain the + * CPUID values (with possible HV interaction during post-processing of + * the values). But if SNP is not active (no CPUID table present), then + * snp_cpuid() returns -EOPNOTSUPP so that an SEV-ES guest can call the + * HV to obtain the CPUID information. 
+ */ + ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf); if (!ret) goto cpuid_done; if (ret != -EOPNOTSUPP) goto fail; + /* + * This is reached by a SEV-ES guest and needs to invoke the HV for + * the CPUID data. + */ if (__sev_cpuid_hv_msr(&leaf)) goto fail; @@ -705,7 +526,7 @@ struct cc_setup_data { * Search for a Confidential Computing blob passed in as a setup_data entry * via the Linux Boot Protocol. */ -static __head +static __init struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) { struct cc_setup_data *sd = NULL; @@ -733,7 +554,7 @@ struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) * mapping needs to be updated in sync with all the changes to virtual memory * layout and related mapping facilities throughout the boot process. */ -static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) { const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; int i; @@ -761,13 +582,24 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) } } -static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) +static int svsm_call_msr_protocol(struct svsm_call *call) +{ + int ret; + + do { + ret = svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + return ret; +} + +static void svsm_pval_4k_page(unsigned long paddr, bool validate, + struct svsm_ca *caa, u64 caa_pa) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; unsigned long flags; u64 pc_pa; - int ret; /* * This can be called very early in the boot, use native functions in @@ -775,10 +607,10 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) */ flags = native_local_irq_save(); - call.caa = svsm_get_caa(); + call.caa = caa; pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; - pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); + pc_pa = caa_pa + offsetof(struct svsm_ca, svsm_buffer); pc->num_entries = 1; pc->cur_index = 0; @@ -792,20 +624,24 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); call.rcx = pc_pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + /* + * Use the MSR protocol exclusively, so that this code is usable in + * startup code where VA/PA translations of the GHCB page's address may + * be problematic. + */ + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); native_local_irq_restore(flags); } -static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, - bool validate) +static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate, struct svsm_ca *caa, u64 caa_pa) { int ret; if (snp_vmpl) { - svsm_pval_4k_page(paddr, validate); + svsm_pval_4k_page(paddr, validate, caa, caa_pa); } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); if (ret) @@ -816,15 +652,51 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, * If validating memory (making it private) and affected by the * cache-coherency vulnerability, perform the cache eviction mitigation. 
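/*
 * Illustrative sketch, not part of the patch: the do_vc_no_ghcb() hunk above
 * passes the MSR-based backend into snp_cpuid(), which now takes a
 * "cpuid_fn(ctx, leaf)" callback plus an opaque ctx instead of a ghcb/ctxt
 * pair. The names below (fake_msr_backend, fake_ghcb_ctx, model_snp_cpuid)
 * are invented for this user-space model and do not exist in the kernel.
 */
#include <stdio.h>
#include <stdint.h>

struct cpuid_leaf { uint32_t fn, subfn, eax, ebx, ecx, edx; };

/* Stand-in for the MSR-protocol backend: needs no context. */
static void fake_msr_backend(void *ctx, struct cpuid_leaf *leaf)
{
	(void)ctx;
	leaf->eax = 0x100 + leaf->fn;	/* pretend the HV answered */
}

/* Stand-in for a GHCB backend: carries per-call state in ctx. */
struct fake_ghcb_ctx { int ghcb_id; };

static void fake_ghcb_backend(void *ctx, struct cpuid_leaf *leaf)
{
	struct fake_ghcb_ctx *g = ctx;
	leaf->eax = 0x200 + leaf->fn + g->ghcb_id;
}

/* Mirrors the shape of snp_cpuid(cpuid_fn, ctx, leaf) after the refactor. */
static int model_snp_cpuid(void (*cpuid_fn)(void *, struct cpuid_leaf *),
			   void *ctx, struct cpuid_leaf *leaf)
{
	cpuid_fn(ctx, leaf);	/* backend chosen by the caller */
	return 0;
}

int main(void)
{
	struct cpuid_leaf leaf = { .fn = 1 };
	struct fake_ghcb_ctx g = { .ghcb_id = 7 };

	model_snp_cpuid(fake_msr_backend, NULL, &leaf);
	printf("msr  backend: eax=0x%x\n", leaf.eax);

	model_snp_cpuid(fake_ghcb_backend, &g, &leaf);
	printf("ghcb backend: eax=0x%x\n", leaf.eax);
	return 0;
}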
*/ - if (validate && !has_cpuflag(X86_FEATURE_COHERENCY_SFW_NO)) + if (validate && sev_snp_needs_sfw) sev_evict_cache((void *)vaddr, 1); } +static void __page_state_change(unsigned long vaddr, unsigned long paddr, + const struct psc_desc *desc) +{ + u64 val, msr; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (desc->op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false, desc->ca, desc->caa_pa); + + /* Save the current GHCB MSR value */ + msr = sev_es_rd_ghcb_msr(); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, desc->op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. */ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* Restore the GHCB MSR value */ + sev_es_wr_ghcb_msr(msr); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. + */ + if (desc->op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true, desc->ca, desc->caa_pa); +} + /* * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. */ -static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) +static bool __init svsm_setup_ca(const struct cc_blob_sev_info *cc_info, + void *page) { struct snp_secrets_page *secrets_page; struct snp_cpuid_table *cpuid_table; @@ -847,7 +719,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * routine is running identity mapped when called, both by the decompressor * code and the early kernel code. */ - if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1)) + if (!rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1)) return false; /* @@ -875,11 +747,6 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) if (caa & (PAGE_SIZE - 1)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA); - /* - * The CA is identity mapped when this routine is called, both by the - * decompressor code and the early kernel code. - */ - boot_svsm_caa = (struct svsm_ca *)caa; boot_svsm_caa_pa = caa; /* Advertise the SVSM presence via CPUID. */ diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 0b7e3b950183..09725428d3e6 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,143 +41,14 @@ #include <asm/cpuid/api.h> #include <asm/cmdline.h> -/* For early boot hypervisor communication in SEV-ES enabled guests */ -struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -struct ghcb *boot_ghcb __section(".data"); - -/* Bitmap of SEV features supported by the hypervisor */ -u64 sev_hv_features __ro_after_init; - -/* Secrets page physical address from the CC blob */ -u64 sev_secrets_pa __ro_after_init; - -/* For early boot SVSM communication */ -struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); - -DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); -DEFINE_PER_CPU(u64, svsm_caa_pa); - -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. - * - * Callers must disable local interrupts around it. 
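/*
 * Illustrative sketch, not part of the patch: __page_state_change() above keeps
 * a strict ordering -- rescind validation before a private->shared conversion,
 * and (re)validate only after a shared->private conversion has succeeded. The
 * enum values and helper names here are local stand-ins for this model only.
 */
#include <stdio.h>
#include <stdbool.h>

enum model_psc_op { MODEL_PSC_SHARED, MODEL_PSC_PRIVATE };

static void model_pvalidate(unsigned long paddr, bool validate)
{
	printf("  pvalidate(paddr=0x%lx, validate=%d)\n", paddr, validate);
}

static void model_rmp_change(unsigned long paddr, enum model_psc_op op)
{
	printf("  RMP state change: paddr=0x%lx -> %s\n", paddr,
	       op == MODEL_PSC_PRIVATE ? "private" : "shared");
}

static void model_page_state_change(unsigned long paddr, enum model_psc_op op)
{
	/* Invalidate first, otherwise the RMP change would leave a stale validation. */
	if (op == MODEL_PSC_SHARED)
		model_pvalidate(paddr, false);

	model_rmp_change(paddr, op);

	/* Validate last, so the page is never left private-but-unvalidated. */
	if (op == MODEL_PSC_PRIVATE)
		model_pvalidate(paddr, true);
}

int main(void)
{
	printf("to shared:\n");
	model_page_state_change(0x1000, MODEL_PSC_SHARED);
	printf("to private:\n");
	model_page_state_change(0x1000, MODEL_PSC_PRIVATE);
	return 0;
}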
- */ -noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. - */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } -} - -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - int ret; - - /* - * This can be called very early in the boot, use native functions in - * order to avoid paravirt issues. - */ - flags = native_local_irq_save(); - - if (sev_cfg.ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - if (sev_cfg.ghcbs_initialized) - __sev_put_ghcb(&state); - - native_local_irq_restore(flags); - - return ret; -} - -void __head +void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) + unsigned long npages, const struct psc_desc *desc) { unsigned long paddr_end; - u64 val; vaddr = vaddr & PAGE_MASK; @@ -185,42 +56,22 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); - - /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. 
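/*
 * Illustrative sketch, not part of the patch: the svsm_perform_call_protocol()
 * removed above (and re-added in core.c later in this patch) picks the best
 * available transport -- per-CPU GHCB, boot GHCB, or the bare MSR protocol --
 * and retries while the SVSM answers -EAGAIN. The backends below are stubs
 * invented for this model.
 */
#include <stdio.h>
#include <errno.h>

struct model_call { int payload; };

static int model_ghcb_protocol(struct model_call *c) { (void)c; return 0; }

static int msr_attempts;
static int model_msr_protocol(struct model_call *c)
{
	(void)c;
	/* Pretend the SVSM asks us to retry once. */
	return ++msr_attempts < 2 ? -EAGAIN : 0;
}

static int model_call_protocol(struct model_call *c, int have_ghcb)
{
	int ret;

	do {
		ret = have_ghcb ? model_ghcb_protocol(c)
				: model_msr_protocol(c);
	} while (ret == -EAGAIN);

	return ret;
}

int main(void)
{
	struct model_call c = { .payload = 1 };
	int ret;

	printf("ghcb path: %d\n", model_call_protocol(&c, 1));

	ret = model_call_protocol(&c, 0);
	printf("msr  path: %d (after %d attempts)\n", ret, msr_attempts);
	return 0;
}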
- */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) - goto e_term; - - if (GHCB_MSR_PSC_RESP_VAL(val)) - goto e_term; - - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + __page_state_change(vaddr, paddr, desc); vaddr += PAGE_SIZE; paddr += PAGE_SIZE; } - - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); } -void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -234,12 +85,18 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd * Ask the hypervisor to mark the memory pages as private in the RMP * table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); + early_set_pages_state(vaddr, paddr, npages, &d); } -void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -250,7 +107,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr return; /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); + early_set_pages_state(vaddr, paddr, npages, &d); } /* @@ -266,7 +123,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr * * Scan for the blob in that order. */ -static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -287,15 +144,15 @@ static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) found_cc_info: if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); return cc_info; } -static __head void svsm_setup(struct cc_blob_sev_info *cc_info) +static void __init svsm_setup(struct cc_blob_sev_info *cc_info) { + struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys; struct svsm_call call = {}; - int ret; u64 pa; /* @@ -303,7 +160,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) * running at VMPL0. The CA will be used to communicate with the * SVSM to perform the SVSM services. */ - if (!svsm_setup_ca(cc_info)) + if (!svsm_setup_ca(cc_info, rip_rel_ptr(&boot_svsm_ca_page))) return; /* @@ -315,25 +172,25 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) pa = (u64)rip_rel_ptr(&boot_svsm_ca_page); /* - * Switch over to the boot SVSM CA while the current CA is still - * addressable. There is no GHCB at this point so use the MSR protocol. 
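/*
 * Illustrative sketch, not part of the patch: early_set_pages_state() above now
 * takes a single descriptor (operation + SVSM CA pointer + CA physical address)
 * and walks the range one 4K page at a time. The struct and helpers below are
 * local stand-ins, with PAGE_SHIFT/PAGE_SIZE spelled out for 4K pages.
 */
#include <stdio.h>
#include <stdint.h>

#define MODEL_PAGE_SHIFT	12
#define MODEL_PAGE_SIZE		(1UL << MODEL_PAGE_SHIFT)
#define MODEL_PAGE_MASK		(~(MODEL_PAGE_SIZE - 1))

struct model_psc_desc {
	int		op;		/* private or shared */
	void		*ca;		/* SVSM calling area, VA */
	uint64_t	caa_pa;		/* SVSM calling area, PA */
};

static void model_one_page(unsigned long vaddr, unsigned long paddr,
			   const struct model_psc_desc *d)
{
	printf("  change va=0x%lx pa=0x%lx op=%d\n", vaddr, paddr, d->op);
}

static void model_set_pages_state(unsigned long vaddr, unsigned long paddr,
				  unsigned long npages,
				  const struct model_psc_desc *d)
{
	unsigned long paddr_end = paddr + (npages << MODEL_PAGE_SHIFT);

	vaddr &= MODEL_PAGE_MASK;
	paddr &= MODEL_PAGE_MASK;

	while (paddr < paddr_end) {
		model_one_page(vaddr, paddr, d);
		vaddr += MODEL_PAGE_SIZE;
		paddr += MODEL_PAGE_SIZE;
	}
}

int main(void)
{
	struct model_psc_desc d = { .op = 1, .ca = NULL, .caa_pa = 0x8000 };

	model_set_pages_state(0x40001234UL, 0x101234UL, 3, &d);
	return 0;
}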
+ * Switch over to the boot SVSM CA while the current CA is still 1:1 + * mapped and thus addressable with VA == PA. There is no GHCB at this + * point so use the MSR protocol. * * SVSM_CORE_REMAP_CA call: * RAX = 0 (Protocol=0, CallID=0) * RCX = New CA GPA */ - call.caa = svsm_get_caa(); + call.caa = (struct svsm_ca *)secrets->svsm_caa; call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); - boot_svsm_caa = (struct svsm_ca *)pa; boot_svsm_caa_pa = pa; } -bool __head snp_init(struct boot_params *bp) +bool __init snp_init(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -361,8 +218,3 @@ bool __head snp_init(struct boot_params *bp) return true; } - -void __head __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 70ea1748c0a7..e7ea65f3f1d6 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -91,7 +91,7 @@ struct sme_populate_pgd_data { */ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); -static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; @@ -106,7 +106,7 @@ static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) memset(pgd_p, 0, pgd_size); } -static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd; p4d_t *p4d; @@ -143,7 +143,7 @@ static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) return pud; } -static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -159,7 +159,7 @@ static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); } -static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -185,7 +185,7 @@ static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); } -static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); @@ -195,7 +195,7 @@ static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd(ppd); @@ -205,7 +205,7 @@ static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, pmdval_t pmd_flags, pteval_t pte_flags) { unsigned long vaddr_end; @@ -229,22 +229,22 @@ static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, __sme_map_range_pte(ppd); } -static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +static void __init 
sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); } -static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); } -static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } -static unsigned long __head sme_pgtable_calc(unsigned long len) +static unsigned long __init sme_pgtable_calc(unsigned long len) { unsigned long entries = 0, tables = 0; @@ -281,7 +281,7 @@ static unsigned long __head sme_pgtable_calc(unsigned long len) return entries + tables; } -void __head sme_encrypt_kernel(struct boot_params *bp) +void __init sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; @@ -485,7 +485,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) native_write_cr3(__native_read_cr3()); } -void __head sme_enable(struct boot_params *bp) +void __init sme_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; @@ -521,6 +521,7 @@ void __head sme_enable(struct boot_params *bp) return; me_mask = 1UL << (ebx & 0x3f); + sev_snp_needs_sfw = !(ebx & BIT(31)); /* Check the SEV MSR whether SEV or SME is enabled */ sev_status = msr = native_rdmsrq(MSR_AMD64_SEV); @@ -531,7 +532,7 @@ void __head sme_enable(struct boot_params *bp) * enablement abort the guest. */ if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED)) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { @@ -567,7 +568,6 @@ void __head sme_enable(struct boot_params *bp) #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* Local version for startup code, which never operates on user page tables */ -__weak pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d4610af68114..989ca9f72ba3 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -104,6 +104,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; + case CC_ATTR_SNP_SECURE_AVIC: + return sev_status & MSR_AMD64_SNP_SECURE_AVIC; + default: return false; } diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 342d79f0ab6a..3b8ae214a6a6 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += core.o sev-nmi.o vc-handle.o +obj-y += core.o noinstr.o vc-handle.o # Clang 14 and older may fail to respect __no_sanitize_undefined when inlining -UBSAN_SANITIZE_sev-nmi.o := n +UBSAN_SANITIZE_noinstr.o := n # GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining -KASAN_SANITIZE_sev-nmi.o := n -KCSAN_SANITIZE_sev-nmi.o := n +KASAN_SANITIZE_noinstr.o := n +KCSAN_SANITIZE_noinstr.o := n diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb27..9ae3b11754e6 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -46,6 +46,48 @@ #include <asm/cmdline.h> #include <asm/msr.h> +/* Bitmap of SEV features supported by the hypervisor */ +u64 sev_hv_features __ro_after_init; 
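/*
 * Illustrative sketch, not part of the patch: the sme_enable() hunk above
 * derives the memory-encryption mask from CPUID 0x8000001F:EBX[5:0] (the C-bit
 * position) and, per this patch, clears sev_snp_needs_sfw when EBX bit 31 is
 * set. The sample EBX value below is made up for the demonstration.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

int main(void)
{
	uint32_t ebx = (1u << 31) | 51;	/* pretend: bit 31 set, C-bit = 51 */
	uint64_t me_mask = 1ULL << (ebx & 0x3f);
	bool needs_sfw = !(ebx & (1u << 31));

	printf("C-bit position  : %u\n", ebx & 0x3f);
	printf("me_mask         : 0x%llx\n", (unsigned long long)me_mask);
	printf("needs SFW evict : %s\n", needs_sfw ? "yes" : "no");
	return 0;
}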
+SYM_PIC_ALIAS(sev_hv_features); + +/* Secrets page physical address from the CC blob */ +u64 sev_secrets_pa __ro_after_init; +SYM_PIC_ALIAS(sev_secrets_pa); + +/* For early boot SVSM communication */ +struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); +SYM_PIC_ALIAS(boot_svsm_ca_page); + +/* + * SVSM related information: + * During boot, the page tables are set up as identity mapped and later + * changed to use kernel virtual addresses. Maintain separate virtual and + * physical addresses for the CAA to allow SVSM functions to be used during + * early boot, both with identity mapped virtual addresses and proper kernel + * virtual addresses. + */ +u64 boot_svsm_caa_pa __ro_after_init; +SYM_PIC_ALIAS(boot_svsm_caa_pa); + +DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); +DEFINE_PER_CPU(u64, svsm_caa_pa); + +static inline struct svsm_ca *svsm_get_caa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa); + else + return rip_rel_ptr(&boot_svsm_ca_page); +} + +static inline u64 svsm_get_caa_pa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa_pa); + else + return boot_svsm_caa_pa; +} + /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ #define AP_INIT_CS_LIMIT 0xffff #define AP_INIT_DS_LIMIT 0xffff @@ -79,6 +121,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", + [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC", }; /* @@ -100,6 +143,26 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); */ u8 snp_vmpl __ro_after_init; EXPORT_SYMBOL_GPL(snp_vmpl); +SYM_PIC_ALIAS(snp_vmpl); + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +u16 ghcb_version __ro_after_init; +SYM_PIC_ALIAS(ghcb_version); + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +struct ghcb *boot_ghcb __section(".data"); static u64 __init get_snp_jump_table_addr(void) { @@ -154,6 +217,73 @@ static u64 __init get_jump_table_addr(void) return ret; } +static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) +{ + struct es_em_ctxt ctxt; + u8 pending = 0; + + vc_ghcb_invalidate(ghcb); + + /* + * Fill in protocol and format specifiers. This can be called very early + * in the boot, so use rip-relative references as needed. 
+ */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + + svsm_issue_call(call, &pending); + + if (pending) + return -EINVAL; + + switch (verify_exception_info(ghcb, &ctxt)) { + case ES_OK: + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + fallthrough; + default: + return -EINVAL; + } + + return svsm_process_result_codes(call); +} + +static int svsm_perform_call_protocol(struct svsm_call *call) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + flags = native_local_irq_save(); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else if (boot_ghcb) + ghcb = boot_ghcb; + else + ghcb = NULL; + + do { + ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) + : __pi_svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + native_local_irq_restore(flags); + + return ret; +} + static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, int ret, u64 svsm_ret) { @@ -531,8 +661,11 @@ static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) unsigned long vaddr_end; /* Use the MSR protocol when a GHCB is not available. */ - if (!boot_ghcb) - return early_set_pages_state(vaddr, __pa(vaddr), npages, op); + if (!boot_ghcb) { + struct psc_desc d = { op, svsm_get_caa(), svsm_get_caa_pa() }; + + return early_set_pages_state(vaddr, __pa(vaddr), npages, &d); + } vaddr = vaddr & PAGE_MASK; vaddr_end = vaddr + (npages << PAGE_SHIFT); @@ -973,6 +1106,9 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + vmsa->vintr_ctrl |= V_GIF_MASK | V_NMI_ENABLE_MASK; + /* SVME must be set. */ vmsa->efer = EFER_SVME; @@ -1107,6 +1243,105 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +u64 savic_ghcb_msr_read(u32 reg) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { .cx = msr }; + struct es_em_ctxt ctxt = { .regs = ®s }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, false); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res); + /* MSR read failures are treated as fatal errors */ + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + } + + __sev_put_ghcb(&state); + + return regs.ax | regs.dx << 32; +} + +void savic_ghcb_msr_write(u32 reg, u64 value) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { + .cx = msr, + .ax = lower_32_bits(value), + .dx = upper_32_bits(value) + }; + struct es_em_ctxt ctxt = { .regs = ®s }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, true); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res); + /* MSR writes should never fail. 
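/*
 * Illustrative sketch, not part of the patch: savic_ghcb_msr_write() above
 * splits the 64-bit value into the guest's AX/DX register pair with
 * lower_32_bits()/upper_32_bits(), and savic_ghcb_msr_read() recombines the
 * result as ax | dx << 32. The helper names below are local stand-ins.
 */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

static uint32_t model_lower_32_bits(uint64_t v) { return (uint32_t)v; }
static uint32_t model_upper_32_bits(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
	uint64_t val = 0x1122334455667788ULL;
	uint64_t ax = model_lower_32_bits(val);	/* what lands in regs.ax */
	uint64_t dx = model_upper_32_bits(val);	/* what lands in regs.dx */
	uint64_t back = ax | (dx << 32);	/* read-path recombination */

	assert(back == val);
	printf("ax=0x%llx dx=0x%llx -> 0x%llx\n",
	       (unsigned long long)ax, (unsigned long long)dx,
	       (unsigned long long)back);
	return 0;
}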
Any failure is fatal error for SNP guest */ + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + } + + __sev_put_ghcb(&state); +} + +enum es_result savic_register_gpa(u64 gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + ghcb_set_rbx(ghcb, gpa); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_REGISTER_GPA, 0); + + __sev_put_ghcb(&state); + + return res; +} + +enum es_result savic_unregister_gpa(u64 *gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_UNREGISTER_GPA, 0); + if (gpa && res == ES_OK) + *gpa = ghcb->save.rbx; + + __sev_put_ghcb(&state); + + return res; +} + static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; @@ -1233,7 +1468,8 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); + caa = cpu ? memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE) + : &boot_svsm_ca_page; per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); @@ -1287,32 +1523,9 @@ void __init sev_es_init_vc_handling(void) init_ghcb(cpu); } - /* If running under an SVSM, switch to the per-cpu CA */ - if (snp_vmpl) { - struct svsm_call call = {}; - unsigned long flags; - int ret; - - local_irq_save(flags); - - /* - * SVSM_CORE_REMAP_CA call: - * RAX = 0 (Protocol=0, CallID=0) - * RCX = New CA GPA - */ - call.caa = svsm_get_caa(); - call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); - call.rcx = this_cpu_read(svsm_caa_pa); - ret = svsm_perform_call_protocol(&call); - if (ret) - panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", - ret, call.rax_out); - + if (snp_vmpl) sev_cfg.use_cas = true; - local_irq_restore(flags); - } - sev_es_setup_play_dead(); /* Secondary CPUs use the runtime #VC handler */ @@ -1590,15 +1803,6 @@ void sev_show_status(void) pr_cont("\n"); } -void __init snp_update_svsm_ca(void) -{ - if (!snp_vmpl) - return; - - /* Update the CAA to a proper kernel address */ - boot_svsm_caa = &boot_svsm_ca_page; -} - #ifdef CONFIG_SYSFS static ssize_t vmpl_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/noinstr.c index d8dfaddfb367..b527eafb6312 100644 --- a/arch/x86/coco/sev/sev-nmi.c +++ b/arch/x86/coco/sev/noinstr.c @@ -106,3 +106,77 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. 
To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c3b4acbde0d8..7fc136a35334 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -351,6 +351,8 @@ fault: } #define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define error(v) +#define has_cpuflag(f) boot_cpu_has(f) #include "vc-shared.c" @@ -402,14 +404,10 @@ static enum es_result __vc_handle_secure_tsc_msrs(struct es_em_ctxt *ctxt, bool return ES_OK; } -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write) { struct pt_regs *regs = ctxt->regs; enum es_result ret; - bool write; - - /* Is it a WRMSR? */ - write = ctxt->insn.opcode.bytes[1] == 0x30; switch (regs->cx) { case MSR_SVSM_CAA: @@ -419,6 +417,15 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) if (sev_status & MSR_AMD64_SNP_SECURE_TSC) return __vc_handle_secure_tsc_msrs(ctxt, write); break; + case MSR_AMD64_SAVIC_CONTROL: + /* + * AMD64_SAVIC_CONTROL should not be intercepted when + * Secure AVIC is enabled. Terminate the Secure AVIC guest + * if the interception is enabled. + */ + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return ES_VMM_ERROR; + break; default: break; } @@ -439,6 +446,11 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_handle_msr(ghcb, ctxt, ctxt->insn.opcode.bytes[1] == 0x30); +} + static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) { int trapnr = ctxt->fi.vector; diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index 2c0ab0fdc060..9b01c9ad81be 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -409,15 +409,109 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + u32 ret; + + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; + + if (ret == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. 
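/*
 * Illustrative sketch, not part of the patch: the vc_handle_msr() hunk above
 * now derives the "write" flag for sev_es_ghcb_handle_msr() from the decoded
 * instruction -- both MSR accesses are 0F-escaped, with second opcode byte
 * 0x30 for WRMSR and 0x32 for RDMSR. The tiny decoder below is a stand-in for
 * the real insn code.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool model_msr_insn_is_write(const uint8_t opcode[2])
{
	/* opcode[0] is expected to be the 0x0F escape byte */
	return opcode[0] == 0x0f && opcode[1] == 0x30;	/* WRMSR */
}

int main(void)
{
	const uint8_t wrmsr[2] = { 0x0f, 0x30 };
	const uint8_t rdmsr[2] = { 0x0f, 0x32 };

	printf("0F 30 -> %s\n", model_msr_insn_is_write(wrmsr) ? "write" : "read");
	printf("0F 32 -> %s\n", model_msr_insn_is_write(rdmsr) ? "write" : "read");
	return 0;
}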
*/ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + + return ES_EXCEPTION; + } + } + + return ES_VMM_ERROR; +} + +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); +} + +static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + u32 cr4 = native_read_cr4(); + int ret; + + ghcb_set_rax(ghcb, leaf->fn); + ghcb_set_rcx(ghcb, leaf->subfn); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #UD - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + leaf->eax = ghcb->save.rax; + leaf->ebx = ghcb->save.rbx; + leaf->ecx = ghcb->save.rcx; + leaf->edx = ghcb->save.rdx; + + return ES_OK; +} + +struct cpuid_ctx { + struct ghcb *ghcb; + struct es_em_ctxt *ctxt; +}; + +static void snp_cpuid_hv_ghcb(void *p, struct cpuid_leaf *leaf) +{ + struct cpuid_ctx *ctx = p; + + if (__sev_cpuid_hv_ghcb(ctx->ghcb, ctx->ctxt, leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { + struct cpuid_ctx ctx = { ghcb, ctxt }; struct pt_regs *regs = ctxt->regs; struct cpuid_leaf leaf; int ret; leaf.fn = regs->ax; leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); + ret = snp_cpuid(snp_cpuid_hv_ghcb, &ctx, &leaf); if (!ret) { regs->ax = leaf.eax; regs->bx = leaf.ebx; @@ -502,3 +596,50 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + +bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) + return false; + + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) + return false; + + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + + return true; +} diff --git 
a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92cf0fe2291e..ced2a1deecd7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7610f26dfbd9..745caa6c15a3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event) void x86_pmu_show_pmu_cap(struct pmu *pmu) { - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); - pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); + pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); + pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); + pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); + pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016llx\n", x86_pmu.max_period); + pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 61da6b8a3d51..cbac54cb3a9e 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -643,4 +643,4 @@ static __init int bts_init(void) return perf_pmu_register(&bts_pmu, "intel_bts", -1); } -arch_initcall(bts_init); +early_initcall(bts_init); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c2fb729c270e..28f5468a6ea3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 mask, bits = 0; int idx = hwc->idx; + u64 bits = 0; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) idx -= INTEL_PMC_IDX_FIXED; bits = intel_fixed_bits_by_idx(idx, bits); - mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); - - if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - } - cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); cpuc->fixed_ctrl_val |= bits; } @@ -2997,7 +2993,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) if (event->group_leader != leader->group_leader) break; for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { - 
if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + if (i + idx >= cpuc->n_events || + !is_acr_event_group(cpuc->event_list[i + idx])) return; __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); } @@ -5318,9 +5315,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); if (pmu->intel_cap.perf_metrics) - pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; else - pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; intel_pmu_check_event_constraints(pmu->event_constraints, pmu->cntr_mask64, @@ -5455,7 +5452,7 @@ static void intel_pmu_cpu_starting(int cpu) rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); if (!perf_cap.perf_metrics) { x86_pmu.intel_cap.perf_metrics = 0; - x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; } } @@ -7789,7 +7786,7 @@ __init int intel_pmu_init(void) } if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) - x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; if (x86_pmu.intel_cap.pebs_timing_info) x86_pmu.flags |= PMU_FL_RETIRE_LATENCY; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 07ba4935e873..a26e66d66444 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -305,6 +305,8 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); + void (*setup)(void); + void (*teardown)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); @@ -317,6 +319,8 @@ struct apic { /* wakeup secondary CPU using 64-bit wakeup point */ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); + void (*update_vector)(unsigned int cpu, unsigned int vector, bool set); + char *name; }; @@ -470,6 +474,12 @@ static __always_inline bool apic_id_valid(u32 apic_id) return apic_id <= apic->max_apic_id; } +static __always_inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + if (apic->update_vector) + apic->update_vector(cpu, vector, set); +} + #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } @@ -481,6 +491,7 @@ static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } static inline void apic_setup_apic_calls(void) { } +static inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { } #define apic_update_callback(_callback, _fn) do { } while (0) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 094106b6a538..be39a543fbe5 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -135,6 +135,8 @@ #define APIC_TDR_DIV_128 0xA #define APIC_EFEAT 0x400 #define APIC_ECTRL 0x410 +#define APIC_SEOI 0x420 +#define APIC_IER 0x480 #define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 02b23aa78955..f7b67cb73915 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -82,6 +82,8 @@ #ifndef __ASSEMBLER__ extern unsigned int output_len; extern const unsigned long kernel_text_size; +extern const unsigned long 
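/*
 * Illustrative sketch, not part of the patch: the intel_ctrl updates above now
 * OR in GLOBAL_CTRL_EN_PERF_METRICS directly because the patch redefines it as
 * a bit mask (BIT_ULL(48)) rather than a bit number (48). The stand-in
 * constants below show that both spellings set the same bit, while keeping the
 * shift in one place avoids mixing up "bit number" and "bit mask" at call sites.
 */
#include <stdio.h>
#include <stdint.h>

#define MODEL_BIT_ULL(n)	(1ULL << (n))
#define MODEL_EN_METRICS_NR	48			/* old: a bit number */
#define MODEL_EN_METRICS	MODEL_BIT_ULL(48)	/* new: a bit mask */

int main(void)
{
	uint64_t old_style = 1ULL << MODEL_EN_METRICS_NR;	/* caller shifts */
	uint64_t new_style = MODEL_EN_METRICS;			/* caller just ORs */

	printf("old=0x%llx new=0x%llx same=%d\n",
	       (unsigned long long)old_style, (unsigned long long)new_style,
	       old_style == new_style);
	return 0;
}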
kernel_inittext_offset; +extern const unsigned long kernel_inittext_size; extern const unsigned long kernel_total_size; unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 751ca35386b0..b2a562217d3f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -496,6 +496,7 @@ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ +#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ /* * BUG word(s) diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index e41cbf2ec41d..9ad86a7d13f6 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -30,6 +30,7 @@ enum x86_hypervisor_type { X86_HYPER_KVM, X86_HYPER_JAILHOUSE, X86_HYPER_ACRN, + X86_HYPER_BHYVE, }; #ifdef CONFIG_HYPERVISOR_GUEST @@ -64,6 +65,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_kvm; extern const struct hypervisor_x86 x86_hyper_jailhouse; extern const struct hypervisor_x86 x86_hyper_acrn; +extern const struct hypervisor_x86 x86_hyper_bhyve; extern struct hypervisor_x86 x86_hyper_xen_hvm; extern bool nopv; diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 97f341777db5..1b3060a3425c 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 /* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr & INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5a68e9db6518..01ccdd168df0 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,12 +2,6 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H -#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 
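/*
 * Illustrative sketch, not part of the patch: the inat.h additions above teach
 * the instruction decoder about AMD's 3-byte XOP prefix. Opcode 0x8F is
 * ambiguous -- it is also the legacy POP r/m -- so a byte following 0x8F is
 * only treated as an XOP payload when its map_select field (low 5 bits) is at
 * least 0x08, mirroring the X86_XOP_M()/X86_XOP_M_MIN helpers this patch adds
 * to insn.h. The macro and function below are local stand-ins.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MODEL_XOP_M(b)		((b) & 0x1f)	/* map_select in the 2nd byte */
#define MODEL_XOP_M_MIN		0x08

static bool model_is_xop(uint8_t opcode, uint8_t next)
{
	return opcode == 0x8f && MODEL_XOP_M(next) >= MODEL_XOP_M_MIN;
}

int main(void)
{
	/* 8F E8: map_select 0x08 -> XOP;  8F C0: map_select 0 -> POP r/m */
	printf("8F E8 -> %s\n", model_is_xop(0x8f, 0xe8) ? "XOP" : "POP r/m");
	printf("8F C0 -> %s\n", model_is_xop(0x8f, 0xc0) ? "XOP" : "POP r/m");
	return 0;
}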
-#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector -#else -#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase -#endif - struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void (*free_pgt_page)(void *, void *); /* free buf for page table */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 7152ea809e6a..091f88c8254d 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if (!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git a/arch/x86/include/asm/intel-family.h 
b/arch/x86/include/asm/intel-family.h index e345dbdf933e..f32a0eca2ae5 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -51,7 +51,7 @@ #define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */ #define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ -/* Family 6 */ +/* Family 6, 18, 19 */ #define INTEL_PENTIUM_PRO IFM(6, 0x01) #define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03) #define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05) @@ -126,6 +126,8 @@ #define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) +#define INTEL_DIAMONDRAPIDS_X IFM(19, 0x01) /* Panther Cove */ + #define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */ /* "Hybrid" Processors (P-Core/E-Core) */ @@ -203,9 +205,6 @@ #define INTEL_P4_PRESCOTT_2M IFM(15, 0x04) #define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */ -/* Family 19 */ -#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ - /* * Intel CPU core types * diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6c77c03139f7..31e3cb550fb3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -241,12 +241,14 @@ struct cper_ia_proc_ctx; #ifdef CONFIG_X86_MCE int mcheck_init(void); +void mca_bsp_init(struct cpuinfo_x86 *c); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } +static inline void mca_bsp_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, @@ -290,8 +292,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); enum mcp_flags { MCP_TIMESTAMP = BIT(0), /* log time stamp */ MCP_UC = BIT(1), /* log uncorrected errors */ - MCP_DONTLOG = BIT(2), /* only clear, don't log */ - MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */ + MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */ }; void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); @@ -371,15 +372,9 @@ enum smca_bank_types { extern bool amd_mce_is_memory_error(struct mce *m); -extern int mce_threshold_create_device(unsigned int cpu); -extern int mce_threshold_remove_device(unsigned int cpu); - void mce_amd_feature_init(struct cpuinfo_x86 *c); enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank); #else - -static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; -static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..718a55d82fe4 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define 
PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -631,6 +633,11 @@ #define MSR_AMD_PPIN 0xc00102f1 #define MSR_AMD64_CPUID_FN_7 0xc0011002 #define MSR_AMD64_CPUID_FN_1 0xc0011004 + +#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) + #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_TW_CFG 0xc0011023 @@ -699,8 +706,15 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) +#define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) +#define MSR_AMD64_SAVIC_CONTROL 0xc0010138 +#define MSR_AMD64_SAVIC_EN_BIT 0 +#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT) +#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1 +#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 #define MSR_AMD64_RMP_CFG 0xc0010136 @@ -1223,6 +1237,8 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd +#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e29f82466f43..08ed5a2e46a5 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -514,6 +514,7 @@ enum spectre_v2_user_mitigation { /* The Speculative Store Bypass disable variants */ enum ssb_mitigation { SPEC_STORE_BYPASS_NONE, + SPEC_STORE_BYPASS_AUTO, SPEC_STORE_BYPASS_DISABLE, SPEC_STORE_BYPASS_PRCTL, SPEC_STORE_BYPASS_SECCOMP, diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 70d1d94aca7e..49a4d442f3fc 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -35,7 +35,6 @@ #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) -#define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 #define INTEL_FIXED_0_KERNEL (1ULL << 0) #define INTEL_FIXED_0_USER (1ULL << 1) @@ -48,6 +47,11 @@ #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) +#define INTEL_FIXED_BITS_MASK \ + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ + INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ + ICL_FIXED_0_ADAPTIVE) + #define intel_fixed_bits_by_idx(_idx, _bits) \ ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) @@ -430,7 +434,7 @@ static inline bool is_topdown_idx(int idx) #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 -#define GLOBAL_CTRL_EN_PERF_METRICS 48 +#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) /* * We model guest LBR event tracing as another fixed-mode PMC like BTS. 
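/*
 * Illustrative sketch, not part of the patch: the INTEL_FIXED_BITS_MASK
 * redefinition above builds the per-counter field mask from the named bits
 * (now including the ICL adaptive bit) instead of a bare 0xF, and
 * intel_pmu_enable_fixed() earlier in this patch clears that field shifted by
 * idx * 4 before OR-ing in the new bits. The constants below copy the values
 * visible in the hunk into a standalone model.
 */
#include <stdio.h>
#include <stdint.h>

#define FIXED_0_KERNEL		(1ULL << 0)
#define FIXED_0_USER		(1ULL << 1)
#define FIXED_0_ANYTHREAD	(1ULL << 2)
#define FIXED_0_ENABLE_PMI	(1ULL << 3)
#define FIXED_0_ADAPTIVE	(1ULL << 32)
#define FIXED_BITS_MASK		(FIXED_0_KERNEL | FIXED_0_USER | \
				 FIXED_0_ANYTHREAD | FIXED_0_ENABLE_PMI | \
				 FIXED_0_ADAPTIVE)
#define fixed_bits_by_idx(idx, bits)	((bits) << ((idx) * 4))

int main(void)
{
	int idx = 1;
	uint64_t ctrl = fixed_bits_by_idx(idx, FIXED_0_KERNEL | FIXED_0_USER);
	uint64_t bits = FIXED_0_USER | FIXED_0_ENABLE_PMI | FIXED_0_ADAPTIVE;

	ctrl &= ~fixed_bits_by_idx(idx, FIXED_BITS_MASK);	/* clear old state */
	ctrl |= fixed_bits_by_idx(idx, bits);			/* program new state */

	printf("fixed ctrl = 0x%llx\n", (unsigned long long)ctrl);
	return 0;
}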
* diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index feb93b50e990..575f8408a9e7 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -44,7 +44,6 @@ DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); extern bool rdt_alloc_capable; extern bool rdt_mon_capable; -extern unsigned int rdt_mon_features; DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); @@ -84,21 +83,6 @@ static inline void resctrl_arch_disable_mon(void) static_branch_dec_cpuslocked(&rdt_enable_key); } -static inline bool resctrl_arch_is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 77d8f49b92bd..f59ae7186940 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -244,7 +244,7 @@ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { - unsigned int p; + unsigned long p; /* * Load CPU and node number from the GDT. LSL is faster than RDTSCP @@ -254,10 +254,10 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) * * If RDPID is available, use it. */ - alternative_io ("lsl %[seg],%[p]", - ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + alternative_io ("lsl %[seg],%k[p]", + "rdpid %[p]", X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); + [p] "=r" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) *cpu = (p & VDSO_CPUNODE_MASK); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 692af46603a1..914eb32581c7 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -53,6 +53,7 @@ extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); extern void startup_64_load_idt(void *vc_handler); +extern void __pi_startup_64_load_idt(void *vc_handler); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0020d77a0800..01a6e4dbe423 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -208,6 +208,7 @@ struct snp_psc_desc { #define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ #define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */ #define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */ +#define GHCB_TERM_SAVIC_FAIL 12 /* Secure AVIC-specific failure */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 3dfd306d1c9e..c58c47c68ab6 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -2,7 +2,6 @@ #define DR7_RESET_VALUE 0x400 -extern struct ghcb boot_ghcb_page; extern u64 sev_hv_features; extern u64 sev_secrets_pa; @@ -56,31 +55,15 @@ DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data); 
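/*
 * [Editor's aside -- illustrative sketch, not part of this patch.]
 *
 * The sev-internal.h/sev.h hunks below switch early_set_pages_state()
 * from a bare enum psc_op argument to a const struct psc_desc * (the
 * struct -- op/ca/caa_pa -- is added in the asm/sev.h hunk further down).
 * A hypothetical caller could bundle the page-state operation with the
 * SVSM calling-area pointers like this; the wrapper and its name are
 * invented for illustration, while the types, per-CPU variables and
 * constants come from the surrounding hunks or existing SEV headers:
 */
static void example_set_pages_private(unsigned long vaddr, unsigned long paddr,
				      unsigned long npages)
{
	struct psc_desc d = {
		.op     = SNP_PAGE_STATE_PRIVATE,	/* from asm/sev-common.h */
		.ca     = this_cpu_read(svsm_caa),	/* only meaningful under an SVSM */
		.caa_pa = this_cpu_read(svsm_caa_pa),
	};

	early_set_pages_state(vaddr, paddr, npages, &d);
}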
DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa); void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op); + unsigned long npages, const struct psc_desc *desc); DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); DECLARE_PER_CPU(u64, svsm_caa_pa); -extern struct svsm_ca *boot_svsm_caa; extern u64 boot_svsm_caa_pa; -static __always_inline struct svsm_ca *svsm_get_caa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa); - else - return boot_svsm_caa; -} - -static __always_inline u64 svsm_get_caa_pa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa_pa); - else - return boot_svsm_caa_pa; -} - -int svsm_perform_call_protocol(struct svsm_call *call); +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt); +void vc_forward_exception(struct es_em_ctxt *ctxt); static inline u64 sev_es_rd_ghcb_msr(void) { @@ -97,9 +80,8 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val) native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); } -void snp_register_ghcb_early(unsigned long paddr); -bool sev_es_negotiate_protocol(void); -bool sev_es_check_cpu_features(void); +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write); + u64 get_hv_features(void); const struct snp_cpuid_table *snp_cpuid_get_table(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 465b19fd1a2d..f9046c4b9a2b 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -503,6 +503,7 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) } void setup_ghcb(void); +void snp_register_ghcb_early(unsigned long paddr); void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages); void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, @@ -511,14 +512,12 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void __noreturn snp_abort(void); void snp_dmi_setup(void); int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); void sev_show_status(void); -void snp_update_svsm_ca(void); int prepare_pte_enc(struct pte_enc_desc *d); void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot); void snp_kexec_finish(void); @@ -533,6 +532,10 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); +enum es_result savic_register_gpa(u64 gpa); +enum es_result savic_unregister_gpa(u64 *gpa); +u64 savic_ghcb_msr_read(u32 reg); +void savic_ghcb_msr_write(u32 reg, u64 value); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { @@ -540,8 +543,6 @@ static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -void vc_forward_exception(struct es_em_ctxt *ctxt); - /* I/O parameters for CPUID-related helpers */ struct cpuid_leaf { u32 fn; @@ -552,7 +553,13 @@ struct cpuid_leaf { u32 edx; }; -int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf); +int svsm_perform_msr_protocol(struct svsm_call *call); +int __pi_svsm_perform_msr_protocol(struct svsm_call 
*call); +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf); + +void svsm_issue_call(struct svsm_call *call, u8 *pending); +int svsm_process_result_codes(struct svsm_call *call); void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, @@ -560,7 +567,18 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +bool sev_es_negotiate_protocol(void); +bool sev_es_check_cpu_features(void); + +extern u16 ghcb_version; extern struct ghcb *boot_ghcb; +extern bool sev_snp_needs_sfw; + +struct psc_desc { + enum psc_op op; + struct svsm_ca *ca; + u64 caa_pa; +}; static inline void sev_evict_cache(void *va, int npages) { @@ -600,7 +618,6 @@ static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npag static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } -static inline void snp_abort(void) { } static inline void snp_dmi_setup(void) { } static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input) { @@ -610,7 +627,6 @@ static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } static inline void sev_show_status(void) { } -static inline void snp_update_svsm_ca(void) { } static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; } static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { } static inline void snp_kexec_finish(void) { } @@ -624,6 +640,10 @@ static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } static inline void sev_evict_cache(void *va, int npages) {} +static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } +static inline enum es_result savic_unregister_gpa(u64 *gpa) { return ES_UNSUPPORTED; } +static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } +static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -635,9 +655,13 @@ void snp_dump_hva_rmpentry(unsigned long address); int psmash(u64 pfn); int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); int rmp_make_shared(u64 pfn, enum pg_level level); -void snp_leak_pages(u64 pfn, unsigned int npages); +void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp); void kdump_sev_callback(void); void snp_fixup_e820_tables(void); +static inline void snp_leak_pages(u64 pfn, unsigned int pages) +{ + __snp_leak_pages(pfn, pages, true); +} #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_rmptable_init(void) { return -ENOSYS; } @@ -650,6 +674,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as return -ENODEV; } static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } +static inline void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) {} static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline void kdump_sev_callback(void) { } static inline void snp_fixup_e820_tables(void) 
{} diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index 0f50e0125943..fc7dcec58fd4 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksignal *ksig); int restore_signal_shadow_stack(void); int shstk_update_last_frame(unsigned long val); bool shstk_is_enabled(void); +int shstk_pop(u64 *val); +int shstk_push(u64 val); #else static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } @@ -35,6 +37,8 @@ static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } static inline int restore_signal_shadow_stack(void) { return 0; } static inline int shstk_update_last_frame(unsigned long val) { return 0; } static inline bool shstk_is_enabled(void) { return false; } +static inline int shstk_pop(u64 *val) { return -ENOTSUPP; } +static inline int shstk_push(u64 val) { return -ENOTSUPP; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #endif /* __ASSEMBLER__ */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 9282465eea21..e71e0e8362ed 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -80,56 +80,42 @@ struct thread_info { #endif /* - * thread information flags - * - these are process state flags that various assembly files - * may need to access + * Tell the generic TIF infrastructure which bits x86 supports */ -#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ -#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ -#define TIF_SSBD 6 /* Speculative store bypass disable */ -#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ -#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ -#define TIF_UPROBE 12 /* breakpointed or singlestepping */ -#define TIF_PATCH_PENDING 13 /* pending live patching update */ -#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ -#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ -#define TIF_NOTSC 16 /* TSC is not accessible in userland */ -#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ -#define TIF_MEMDIE 20 /* is terminating due to OOM killer */ -#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_POLLING_NRFLAG +#define HAVE_TIF_SINGLESTEP + +#include <asm-generic/thread_info_tif.h> + +/* Architecture specific TIF space starts at 16 */ +#define TIF_SSBD 16 /* Speculative store bypass disable */ +#define TIF_SPEC_IB 17 /* Indirect branch speculation mitigation */ +#define TIF_SPEC_L1D_FLUSH 18 /* Flush L1D on mm switches (processes) */ +#define TIF_NEED_FPU_LOAD 19 /* load FPU on return to userspace */ +#define TIF_NOCPUID 20 /* CPUID is not accessible in userland */ +#define TIF_NOTSC 21 /* TSC is not accessible in userland */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ -#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ +#define TIF_SINGLESTEP 25 /* reenable singlestep on user return*/ +#define TIF_BLOCKSTEP 26 /* set when we want 
DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ - -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_SSBD (1 << TIF_SSBD) -#define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) -#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) -#define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) -#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) -#define _TIF_NOCPUID (1 << TIF_NOCPUID) -#define _TIF_NOTSC (1 << TIF_NOTSC) -#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) -#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) -#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) -#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_ADDR32 (1 << TIF_ADDR32) +#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */ + +#define _TIF_SSBD BIT(TIF_SSBD) +#define _TIF_SPEC_IB BIT(TIF_SPEC_IB) +#define _TIF_SPEC_L1D_FLUSH BIT(TIF_SPEC_L1D_FLUSH) +#define _TIF_NEED_FPU_LOAD BIT(TIF_NEED_FPU_LOAD) +#define _TIF_NOCPUID BIT(TIF_NOCPUID) +#define _TIF_NOTSC BIT(TIF_NOTSC) +#define _TIF_IO_BITMAP BIT(TIF_IO_BITMAP) +#define _TIF_SPEC_FORCE_UPDATE BIT(TIF_SPEC_FORCE_UPDATE) +#define _TIF_FORCED_TF BIT(TIF_FORCED_TF) +#define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP) +#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP) +#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES) +#define _TIF_ADDR32 BIT(TIF_ADDR32) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a7..1ee2e5115955 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 9c640a521a67..650e3256ea7d 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -118,6 +118,10 @@ #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 +#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 +#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 +#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52d1808ee360..581db89477f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o 
obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d73ba5a7b623..680d305589a3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -592,6 +592,8 @@ static void setup_APIC_timer(void) 0xF, ~0UL); } else clockevents_register_device(levt); + + apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true); } /* @@ -1168,6 +1170,9 @@ void disable_local_APIC(void) if (!apic_accessible()) return; + if (apic->teardown) + apic->teardown(); + apic_soft_disable(); #ifdef CONFIG_X86_32 @@ -1428,63 +1433,61 @@ union apic_ir { u32 regs[APIC_IR_REGS]; }; -static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) +static bool apic_check_and_eoi_isr(union apic_ir *isr) { int i, bit; - /* Read the IRRs */ - for (i = 0; i < APIC_IR_REGS; i++) - irr->regs[i] = apic_read(APIC_IRR + i * 0x10); - /* Read the ISRs */ for (i = 0; i < APIC_IR_REGS; i++) isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + /* If the ISR map empty, nothing to do here. */ + if (bitmap_empty(isr->map, APIC_IR_BITS)) + return true; + /* - * If the ISR map is not empty. ACK the APIC and run another round - * to verify whether a pending IRR has been unblocked and turned - * into a ISR. + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an EOI for each + * set bit. The priority traversal order does not matter as there + * can't be new ISR bits raised at this point. What matters is that + * an EOI is issued for each ISR bit. */ - if (!bitmap_empty(isr->map, APIC_IR_BITS)) { - /* - * There can be multiple ISR bits set when a high priority - * interrupt preempted a lower priority one. Issue an ACK - * per set bit. - */ - for_each_set_bit(bit, isr->map, APIC_IR_BITS) - apic_eoi(); - return true; - } + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + apic_eoi(); - return !bitmap_empty(irr->map, APIC_IR_BITS); + /* Reread the ISRs, they should be empty now */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + + return bitmap_empty(isr->map, APIC_IR_BITS); } /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. + * If a CPU services an interrupt and crashes before issuing EOI to the + * local APIC, the corresponding ISR bit is still set when the crashing CPU + * jumps into a crash kernel. Read the ISR and issue an EOI for each set + * bit to acknowledge it as otherwise these slots would be locked forever + * waiting for an EOI. * - * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the apic_eoi() because it thought, interrupt - * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serviced the interrupt. Hence - * a vector might get locked. It was noticed for timer irq (vector - * 0x31). Issue an extra EOI to clear ISR. + * If there are pending bits in the IRR, then they won't be converted into + * ISR bits as the CPU has interrupts disabled. They will be delivered once + * the CPU enables interrupts and there is nothing which can prevent that. * - * If there are pending IRR bits they turn into ISR bits after a higher - * priority ISR bit has been acked. + * In the worst case this results in spurious interrupt warnings. 
*/ -static void apic_pending_intr_clear(void) +static void apic_clear_isr(void) { - union apic_ir irr, isr; + union apic_ir ir; unsigned int i; - /* 512 loops are way oversized and give the APIC a chance to obey. */ - for (i = 0; i < 512; i++) { - if (!apic_check_and_ack(&irr, &isr)) - return; - } - /* Dump the IRR/ISR content if that failed */ - pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); + if (!apic_check_and_eoi_isr(&ir)) + pr_warn("APIC: Stale ISR: %256pb\n", ir.map); + + for (i = 0; i < APIC_IR_REGS; i++) + ir.regs[i] = apic_read(APIC_IRR + i * 0x10); + + if (!bitmap_empty(ir.map, APIC_IR_BITS)) + pr_warn("APIC: Stale IRR: %256pb\n", ir.map); } /** @@ -1503,6 +1506,9 @@ static void setup_local_APIC(void) return; } + if (apic->setup) + apic->setup(); + /* * If this comes from kexec/kcrash the APIC might be enabled in * SPIV. Soft disable it before doing further initialization. @@ -1541,8 +1547,7 @@ static void setup_local_APIC(void) value |= 0x10; apic_write(APIC_TASKPRI, value); - /* Clear eventually stale ISR/IRR bits */ - apic_pending_intr_clear(); + apic_clear_isr(); /* * Now that we are all set up, enable the APIC diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a947b46a8b64..bddc54465399 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,13 +134,20 @@ static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, apicd->hw_irq_cfg.vector = vector; apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + + apic_update_vector(cpu, vector, true); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); - trace_vector_config(irqd->irq, vector, cpu, - apicd->hw_irq_cfg.dest_apicid); + trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid); } -static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, - unsigned int newcpu) +static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed) +{ + apic_update_vector(cpu, vector, false); + irq_matrix_free(vector_matrix, cpu, vector, managed); +} + +static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu) { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); @@ -174,8 +181,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, apicd->prev_cpu = apicd->cpu; WARN_ON_ONCE(apicd->cpu == newcpu); } else { - irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, - managed); + apic_free_vector(apicd->cpu, apicd->vector, managed); } setnew: @@ -261,7 +267,7 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc(irqd->irq, vector, resvd, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -337,7 +343,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc_managed(irqd->irq, vector, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -357,7 +363,7 @@ static void clear_irq_vector(struct irq_data *irqd) apicd->prev_cpu); per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apic_free_vector(apicd->cpu, vector, managed); apicd->vector = 0; /* Clean up move in progress */ @@ -366,7 +372,7 @@ static void clear_irq_vector(struct irq_data *irqd) return; 
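/*
 * [Editor's aside -- illustrative sketch, not part of this patch.]
 *
 * The vector.c hunks above route every vector claim and release through
 * apic_update_vector(): apic_update_irq_cfg() marks the vector in use on
 * its target CPU, and the new apic_free_vector() helper clears that state
 * before returning the slot to the irq matrix.  This appears to exist so
 * that an APIC driver's ->update_vector() callback (hooked up for Secure
 * AVIC below as savic_update_vector()) only has to keep a per-CPU bitmap
 * of expected vectors in sync.  A hypothetical minimal callback along
 * those lines -- "example_allowed" and the function name are invented,
 * the helpers are standard kernel APIs:
 */
struct example_allowed_page {
	unsigned long allowed[BITS_TO_LONGS(NR_VECTORS)];
};

static struct example_allowed_page __percpu *example_allowed;	/* alloc_percpu() at probe time */

static void example_update_vector(unsigned int cpu, unsigned int vector, bool set)
{
	unsigned long *allowed = per_cpu_ptr(example_allowed, cpu)->allowed;

	if (set)
		set_bit(vector, allowed);
	else
		clear_bit(vector, allowed);
}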
per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apic_free_vector(apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); @@ -905,7 +911,7 @@ static void free_moved_vector(struct apic_chip_data *apicd) * affinity mask comes online. */ trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); + apic_free_vector(cpu, vector, managed); per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; hlist_del_init(&apicd->clist); apicd->prev_vector = 0; diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c new file mode 100644 index 000000000000..dbc5678bc3b6 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure AVIC Support (SEV-SNP Guests) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. + * + * Author: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com> + */ + +#include <linux/cc_platform.h> +#include <linux/cpumask.h> +#include <linux/percpu-defs.h> +#include <linux/align.h> + +#include <asm/apic.h> +#include <asm/sev.h> + +#include "local.h" + +struct secure_avic_page { + u8 regs[PAGE_SIZE]; +} __aligned(PAGE_SIZE); + +static struct secure_avic_page __percpu *savic_page __ro_after_init; + +static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); +} + +static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset) +{ + return &per_cpu_ptr(savic_page, cpu)->regs[offset]; +} + +static inline void update_vector(unsigned int cpu, unsigned int offset, + unsigned int vector, bool set) +{ + void *bitmap = get_reg_bitmap(cpu, offset); + + if (set) + apic_set_vector(vector, bitmap); + else + apic_clear_vector(vector, bitmap); +} + +#define SAVIC_ALLOWED_IRR 0x204 + +/* + * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers + * result in #VC exception (for non-accelerated register accesses) + * with VMEXIT_AVIC_NOACCEL error code. The #VC exception handler + * can read/write the x2APIC register in the guest APIC backing page. + * + * Since doing this would increase the latency of accessing x2APIC + * registers, instead of doing RDMSR/WRMSR based accesses and + * handling the APIC register reads/writes in the #VC exception handler, + * the read() and write() callbacks directly read/write the APIC register + * from/to the vCPU's APIC backing page. + */ +static u32 savic_read(u32 reg) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TMCCT: + case APIC_TDCR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + return savic_ghcb_msr_read(reg); + case APIC_ID: + case APIC_LVR: + case APIC_TASKPRI: + case APIC_ARBPRI: + case APIC_PROCPRI: + case APIC_LDR: + case APIC_SPIV: + case APIC_ESR: + case APIC_EFEAT: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + return apic_get_reg(ap, reg); + case APIC_ICR: + return (u32)apic_get_reg64(ap, reg); + case APIC_ISR ... APIC_ISR + 0x70: + case APIC_TMR ... APIC_TMR + 0x70: + if (WARN_ONCE(!IS_ALIGNED(reg, 16), + "APIC register read offset 0x%x not aligned at 16 bytes", reg)) + return 0; + return apic_get_reg(ap, reg); + /* IRR and ALLOWED_IRR offset range */ + case APIC_IRR ... 
APIC_IRR + 0x74: + /* + * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from + * their respective base offset. APIC_IRRs are in the range + * + * (0x200, 0x210, ..., 0x270) + * + * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range + * + * (0x204, 0x214, ..., 0x274). + * + * Filter out everything else. + */ + if (WARN_ONCE(!(IS_ALIGNED(reg, 16) || + IS_ALIGNED(reg - 4, 16)), + "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg)) + return 0; + return apic_get_reg(ap, reg); + default: + pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg); + return 0; + } +} + +#define SAVIC_NMI_REQ 0x278 + +/* + * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware + * updates the APIC_IRR in the APIC backing page of the vCPU. In addition, + * hardware evaluates the new APIC_IRR update for interrupt injection to + * the vCPU. So, self IPIs are hardware-accelerated. + */ +static inline void self_ipi_reg_write(unsigned int vector) +{ + native_apic_msr_write(APIC_SELF_IPI, vector); +} + +static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi) +{ + if (nmi) + apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1); + else + update_vector(cpu, APIC_IRR, vector, true); +} + +static void send_ipi_allbut(unsigned int vector, bool nmi) +{ + unsigned int cpu, src_cpu; + + guard(irqsave)(); + + src_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, cpu_online_mask) { + if (cpu == src_cpu) + continue; + send_ipi_dest(cpu, vector, nmi); + } +} + +static inline void self_ipi(unsigned int vector, bool nmi) +{ + u32 icr_low = APIC_SELF_IPI | vector; + + if (nmi) + icr_low |= APIC_DM_NMI; + + native_x2apic_icr_write(icr_low, 0); +} + +static void savic_icr_write(u32 icr_low, u32 icr_high) +{ + unsigned int dsh, vector; + u64 icr_data; + bool nmi; + + dsh = icr_low & APIC_DEST_ALLBUT; + vector = icr_low & APIC_VECTOR_MASK; + nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI); + + switch (dsh) { + case APIC_DEST_SELF: + self_ipi(vector, nmi); + break; + case APIC_DEST_ALLINC: + self_ipi(vector, nmi); + fallthrough; + case APIC_DEST_ALLBUT: + send_ipi_allbut(vector, nmi); + break; + default: + send_ipi_dest(icr_high, vector, nmi); + break; + } + + icr_data = ((u64)icr_high) << 32 | icr_low; + if (dsh != APIC_DEST_SELF) + savic_ghcb_msr_write(APIC_ICR, icr_data); + apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data); +} + +static void savic_write(u32 reg, u32 data) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TDCR: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVTERR: + savic_ghcb_msr_write(reg, data); + break; + case APIC_TASKPRI: + case APIC_EOI: + case APIC_SPIV: + case SAVIC_NMI_REQ: + case APIC_ESR: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + apic_set_reg(ap, reg, data); + break; + case APIC_ICR: + savic_icr_write(data, 0); + break; + case APIC_SELF_IPI: + self_ipi_reg_write(data); + break; + /* ALLOWED_IRR offsets are writable */ + case SAVIC_ALLOWED_IRR ... 
SAVIC_ALLOWED_IRR + 0x70: + if (IS_ALIGNED(reg - 4, 16)) { + apic_set_reg(ap, reg, data); + break; + } + fallthrough; + default: + pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg); + } +} + +static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh) +{ + unsigned int icr_low; + + icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL); + savic_icr_write(icr_low, dest); +} + +static void savic_send_ipi(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + send_ipi(dest, vector, 0); +} + +static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self) +{ + unsigned int cpu, this_cpu; + + guard(irqsave)(); + + this_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, mask) { + if (excl_self && cpu == this_cpu) + continue; + send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0); + } +} + +static void savic_send_ipi_mask(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, false); +} + +static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, true); +} + +static void savic_send_ipi_allbutself(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLBUT); +} + +static void savic_send_ipi_all(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLINC); +} + +static void savic_send_ipi_self(int vector) +{ + self_ipi_reg_write(vector); +} + +static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); +} + +static void savic_eoi(void) +{ + unsigned int cpu; + int vec; + + cpu = raw_smp_processor_id(); + vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR)); + if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR")) + return; + + /* Is level-triggered interrupt? */ + if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) { + update_vector(cpu, APIC_ISR, vec, false); + /* + * Propagate the EOI write to the hypervisor for level-triggered + * interrupts. Return to the guest from GHCB protocol event takes + * care of re-evaluating interrupt state. + */ + savic_ghcb_msr_write(APIC_EOI, 0); + } else { + /* + * Hardware clears APIC_ISR and re-evaluates the interrupt state + * to determine if there is any pending interrupt which can be + * delivered to CPU. + */ + native_apic_msr_eoi(); + } +} + +static void savic_teardown(void) +{ + /* Disable Secure AVIC */ + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0); + savic_unregister_gpa(NULL); +} + +static void savic_setup(void) +{ + void *ap = this_cpu_ptr(savic_page); + enum es_result res; + unsigned long gpa; + + /* + * Before Secure AVIC is enabled, APIC MSR reads are intercepted. + * APIC_ID MSR read returns the value from the hypervisor. + */ + apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID)); + + gpa = __pa(ap); + + /* + * The NPT entry for a vCPU's APIC backing page must always be + * present when the vCPU is running in order for Secure AVIC to + * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot + * be resumed if the NPT entry for the APIC backing page is not + * present. Notify GPA of the vCPU's APIC backing page to the + * hypervisor by calling savic_register_gpa(). Before executing + * VMRUN, the hypervisor makes use of this information to make sure + * the APIC backing page is mapped in NPT. 
+ */ + res = savic_register_gpa(gpa); + if (res != ES_OK) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, + gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI); +} + +static int savic_probe(void) +{ + if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return 0; + + if (!x2apic_mode) { + pr_err("Secure AVIC enabled in non x2APIC mode\n"); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + /* unreachable */ + } + + savic_page = alloc_percpu(struct secure_avic_page); + if (!savic_page) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + + return 1; +} + +static struct apic apic_x2apic_savic __ro_after_init = { + + .name = "secure avic x2apic", + .probe = savic_probe, + .acpi_madt_oem_check = savic_acpi_madt_oem_check, + .setup = savic_setup, + .teardown = savic_teardown, + + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = savic_send_ipi, + .send_IPI_mask = savic_send_ipi_mask, + .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself, + .send_IPI_allbutself = savic_send_ipi_allbutself, + .send_IPI_all = savic_send_ipi_all, + .send_IPI_self = savic_send_ipi_self, + + .nmi_to_offline_cpu = true, + + .read = savic_read, + .write = savic_write, + .eoi = savic_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = savic_icr_write, + + .update_vector = savic_update_vector, +}; + +apic_driver(apic_x2apic_savic); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1e26179ff18c..2f8a58ef690e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_BHYVE_GUEST) += bhyve.o obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c new file mode 100644 index 000000000000..f1a8ca3dd1ed --- /dev/null +++ b/arch/x86/kernel/cpu/bhyve.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FreeBSD Bhyve guest enlightenments + * + * Copyright © 2025 Amazon.com, Inc. or its affiliates. 
+ * + * Author: David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/init.h> +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/hypervisor.h> + +static uint32_t bhyve_cpuid_base; +static uint32_t bhyve_cpuid_max; + +#define BHYVE_SIGNATURE "bhyve bhyve " + +#define CPUID_BHYVE_FEATURES 0x40000001 + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ + +/* MSI Extended Dest ID */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) + +static uint32_t __init bhyve_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return 0; + + bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0); + if (!bhyve_cpuid_base) + return 0; + + bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base); + return bhyve_cpuid_max; +} + +static uint32_t bhyve_features(void) +{ + unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES; + + if (bhyve_cpuid_max < cpuid_leaf) + return 0; + + return cpuid_eax(cpuid_leaf); +} + +static bool __init bhyve_ext_dest_id(void) +{ + return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID); +} + +static bool __init bhyve_x2apic_available(void) +{ + return true; +} + +const struct hypervisor_x86 x86_hyper_bhyve __refconst = { + .name = "Bhyve", + .detect = bhyve_detect, + .init.init_platform = x86_init_noop, + .init.x2apic_available = bhyve_x2apic_available, + .init.msi_ext_dest_id = bhyve_ext_dest_id, +}; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 36dcfc5105be..6a526ae1fe99 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -434,6 +434,9 @@ static bool __init should_mitigate_vuln(unsigned int bug) case X86_BUG_SPEC_STORE_BYPASS: return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER); + case X86_BUG_VMSCAPE: + return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); + default: WARN(1, "Unknown bug %x\n", bug); return false; @@ -684,8 +687,7 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } @@ -1460,8 +1462,10 @@ static void __init retbleed_update_mitigation(void) retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { pr_err(RETBLEED_INTEL_MSG); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } } } @@ -1842,9 +1846,10 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; -static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; -enum spectre_v2_user_cmd { +enum spectre_v2_user_mitigation_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, @@ -1854,6 +1859,9 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; +static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? 
SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", @@ -1862,50 +1870,31 @@ static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; -static const struct { - const char *option; - enum spectre_v2_user_cmd cmd; - bool secure; -} v2_user_options[] __initconst = { - { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, - { "off", SPECTRE_V2_USER_CMD_NONE, false }, - { "on", SPECTRE_V2_USER_CMD_FORCE, true }, - { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, - { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, - { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, - { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, -}; - -static void __init spec_v2_user_print_cond(const char *reason, bool secure) -{ - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("spectre_v2_user=%s forced on command line.\n", reason); -} - -static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) +static int __init spectre_v2_user_parse_cmdline(char *str) { - char arg[20]; - int ret, i; - - if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) - return SPECTRE_V2_USER_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2_user", - arg, sizeof(arg)); - if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + if (!str) + return -EINVAL; - for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { - if (match_option(arg, ret, v2_user_options[i].option)) { - spec_v2_user_print_cond(v2_user_options[i].option, - v2_user_options[i].secure); - return v2_user_options[i].cmd; - } - } + if (!strcmp(str, "auto")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO; + else if (!strcmp(str, "off")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE; + else if (!strcmp(str, "on")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE; + else if (!strcmp(str, "prctl")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL; + else if (!strcmp(str, "prctl,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB; + else if (!strcmp(str, "seccomp")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP; + else if (!strcmp(str, "seccomp,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB; + else + pr_err("Ignoring unknown spectre_v2_user option (%s).", str); - pr_err("Unknown user space protection option (%s). 
Switching to default\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + return 0; } +early_param("spectre_v2_user", spectre_v2_user_parse_cmdline); static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { @@ -1917,7 +1906,7 @@ static void __init spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - switch (spectre_v2_parse_user_cmdline()) { + switch (spectre_v2_user_cmd) { case SPECTRE_V2_USER_CMD_NONE: return; case SPECTRE_V2_USER_CMD_FORCE: @@ -2045,119 +2034,61 @@ static void __init spectre_v2_user_apply_mitigation(void) static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", - [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; -static const struct { - const char *option; - enum spectre_v2_mitigation_cmd cmd; - bool secure; -} mitigation_options[] __initconst = { - { "off", SPECTRE_V2_CMD_NONE, false }, - { "on", SPECTRE_V2_CMD_FORCE, true }, - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, - { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, - { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, - { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, - { "auto", SPECTRE_V2_CMD_AUTO, false }, - { "ibrs", SPECTRE_V2_CMD_IBRS, false }, -}; +static bool nospectre_v2 __ro_after_init; -static void __init spec_v2_print_cond(const char *reason, bool secure) +static int __init nospectre_v2_parse_cmdline(char *str) { - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("%s selected on command line.\n", reason); + nospectre_v2 = true; + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + return 0; } +early_param("nospectre_v2", nospectre_v2_parse_cmdline); -static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +static int __init spectre_v2_parse_cmdline(char *str) { - enum spectre_v2_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) - return SPECTRE_V2_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); - if (ret < 0) - return cmd; - - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { - if (!match_option(arg, ret, mitigation_options[i].option)) - continue; - cmd = mitigation_options[i].cmd; - break; - } - - if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { - pr_err("%s selected but not compiled in. 
Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_EIBRS || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && - !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (!str) + return -EINVAL; - if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { - pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (nospectre_v2) + return 0; - if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { - pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; + if (!strcmp(str, "off")) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + } else if (!strcmp(str, "on")) { + spectre_v2_cmd = SPECTRE_V2_CMD_FORCE; + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } else if (!strcmp(str, "retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE; + } else if (!strcmp(str, "retpoline,amd") || + !strcmp(str, "retpoline,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE; + } else if (!strcmp(str, "retpoline,generic")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (!strcmp(str, "eibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS; + } else if (!strcmp(str, "eibrs,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE; + } else if (!strcmp(str, "eibrs,retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE; + } else if (!strcmp(str, "auto")) { + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } else if (!strcmp(str, "ibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_IBRS; + } else { + pr_err("Ignoring unknown spectre_v2 option (%s).", str); } - spec_v2_print_cond(mitigation_options[i].option, - mitigation_options[i].secure); - return cmd; + return 0; } +early_param("spectre_v2", spectre_v2_parse_cmdline); static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { @@ -2306,10 +2237,6 @@ static void __init bhi_update_mitigation(void) { if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) bhi_mitigation = BHI_MITIGATION_OFF; - - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) - bhi_mitigation = BHI_MITIGATION_OFF; } static void __init bhi_apply_mitigation(void) @@ -2345,11 +2272,55 @@ static void __init bhi_apply_mitigation(void) static void __init spectre_v2_select_mitigation(void) { - spectre_v2_cmd = spectre_v2_parse_cmdline(); + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE || + spectre_v2_cmd == 
SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { + pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { + pr_err("IBRS selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { + pr_err("IBRS selected but running as XenPV guest. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; return; + } switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: @@ -2555,16 +2526,8 @@ static void update_mds_branch_idle(void) #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt -static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; - -/* The kernel command line selection */ -enum ssb_mitigation_cmd { - SPEC_STORE_BYPASS_CMD_NONE, - SPEC_STORE_BYPASS_CMD_AUTO, - SPEC_STORE_BYPASS_CMD_ON, - SPEC_STORE_BYPASS_CMD_PRCTL, - SPEC_STORE_BYPASS_CMD_SECCOMP, -}; +static enum ssb_mitigation ssb_mode __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SSB) ? 
SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE; static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", @@ -2573,94 +2536,61 @@ static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", }; -static const struct { - const char *option; - enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initconst = { - { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ - { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ - { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ - { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ - { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ -}; +static bool nossb __ro_after_init; -static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +static int __init nossb_parse_cmdline(char *str) { - enum ssb_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? - SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || - cpu_mitigations_off()) { - return SPEC_STORE_BYPASS_CMD_NONE; - } else { - ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", - arg, sizeof(arg)); - if (ret < 0) - return cmd; + nossb = true; + ssb_mode = SPEC_STORE_BYPASS_NONE; + return 0; +} +early_param("nospec_store_bypass_disable", nossb_parse_cmdline); - for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { - if (!match_option(arg, ret, ssb_mitigation_options[i].option)) - continue; +static int __init ssb_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; - cmd = ssb_mitigation_options[i].cmd; - break; - } + if (nossb) + return 0; - if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - } + if (!strcmp(str, "auto")) + ssb_mode = SPEC_STORE_BYPASS_AUTO; + else if (!strcmp(str, "on")) + ssb_mode = SPEC_STORE_BYPASS_DISABLE; + else if (!strcmp(str, "off")) + ssb_mode = SPEC_STORE_BYPASS_NONE; + else if (!strcmp(str, "prctl")) + ssb_mode = SPEC_STORE_BYPASS_PRCTL; + else if (!strcmp(str, "seccomp")) + ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ? + SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL; + else + pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n", + str); - return cmd; + return 0; } +early_param("spec_store_bypass_disable", ssb_parse_cmdline); static void __init ssb_select_mitigation(void) { - enum ssb_mitigation_cmd cmd; - - if (!boot_cpu_has(X86_FEATURE_SSBD)) - goto out; - - cmd = ssb_parse_cmdline(); - if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && - (cmd == SPEC_STORE_BYPASS_CMD_NONE || - cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) { + ssb_mode = SPEC_STORE_BYPASS_NONE; return; + } - switch (cmd) { - case SPEC_STORE_BYPASS_CMD_SECCOMP: - /* - * Choose prctl+seccomp as the default mode if seccomp is - * enabled. 
- */ - if (IS_ENABLED(CONFIG_SECCOMP)) - ssb_mode = SPEC_STORE_BYPASS_SECCOMP; - else - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_ON: - ssb_mode = SPEC_STORE_BYPASS_DISABLE; - break; - case SPEC_STORE_BYPASS_CMD_AUTO: + if (ssb_mode == SPEC_STORE_BYPASS_AUTO) { if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS)) ssb_mode = SPEC_STORE_BYPASS_PRCTL; else ssb_mode = SPEC_STORE_BYPASS_NONE; - break; - case SPEC_STORE_BYPASS_CMD_PRCTL: - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_NONE: - break; } -out: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); + if (!boot_cpu_has(X86_FEATURE_SSBD)) + ssb_mode = SPEC_STORE_BYPASS_NONE; + + pr_info("%s\n", ssb_strings[ssb_mode]); } static void __init ssb_apply_mitigation(void) @@ -2876,6 +2806,7 @@ static int ssb_prctl_get(struct task_struct *task) return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: case SPEC_STORE_BYPASS_PRCTL: + case SPEC_STORE_BYPASS_AUTO: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ssb_noexec(task)) @@ -3195,14 +3126,15 @@ ibpb_on_vmexit: static void __init srso_update_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_SRSO)) + return; + /* If retbleed is using IBPB, that works for SRSO as well */ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) srso_mitigation = SRSO_MITIGATION_IBPB; - if (boot_cpu_has_bug(X86_BUG_SRSO) && - !cpu_mitigations_off()) - pr_info("%s\n", srso_strings[srso_mitigation]); + pr_info("%s\n", srso_strings[srso_mitigation]); } static void __init srso_apply_mitigation(void) @@ -3304,15 +3236,18 @@ early_param("vmscape", vmscape_parse_cmdline); static void __init vmscape_select_mitigation(void) { - if (cpu_mitigations_off() || - !boot_cpu_has_bug(X86_BUG_VMSCAPE) || + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || !boot_cpu_has(X86_FEATURE_IBPB)) { vmscape_mitigation = VMSCAPE_MITIGATION_NONE; return; } - if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) - vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_VMSCAPE)) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + else + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } } static void __init vmscape_update_mitigation(void) @@ -3626,9 +3561,6 @@ static const char *spectre_bhi_state(void) static ssize_t spectre_v2_show_state(char *buf) { - if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sysfs_emit(buf, "Vulnerable: LFENCE\n"); - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index adfa7e8bb865..51a95b07831f 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -290,6 +290,22 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) } /* + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. 
+ */ +static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4) +{ + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + + return apicid >> index_msb; +} + +/* * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. */ @@ -312,18 +328,11 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) * Newer families: LLC ID is calculated from the number * of threads sharing the L3 cache. */ - u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; + struct _cpuid4_info id4 = {}; - cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); - if (eax) - num_sharing_cache = ((eax >> 14) & 0xfff) + 1; - - if (num_sharing_cache) { - int index_msb = get_count_order(num_sharing_cache); - - c->topo.llc_id = c->topo.apicid >> index_msb; - } + if (!amd_fill_cpuid4_info(llc_index, &id4)) + c->topo.llc_id = get_cache_id(c->topo.apicid, &id4); } } @@ -598,27 +607,12 @@ int init_cache_level(unsigned int cpu) return 0; } -/* - * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input - * ECX as cache index. Then right shift apicid by the number's order to get - * cache id for this cache node. - */ -static void get_cache_id(int cpu, struct _cpuid4_info *id4) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned long num_threads_sharing; - int index_msb; - - num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - id4->id = c->topo.apicid >> index_msb; -} - int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *ci = this_cpu_ci->info_list; u8 cpu_vendor = boot_cpu_data.x86_vendor; + u32 apicid = cpu_data(cpu).topo.apicid; struct amd_northbridge *nb = NULL; struct _cpuid4_info id4 = {}; int idx, ret; @@ -628,7 +622,7 @@ int populate_cache_leaves(unsigned int cpu) if (ret) return ret; - get_cache_id(cpu, &id4); + id4.id = get_cache_id(apicid, &id4); if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) nb = amd_init_l3_cache(idx); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f98ec9c7fc07..c7d3512914ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1808,6 +1808,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_LA57); detect_nopl(); + mca_bsp_init(c); } void __init init_cpu_devs(void) diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 553bfbfc3a1b..f3e9219845e8 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_ACRN_GUEST &x86_hyper_acrn, #endif +#ifdef CONFIG_BHYVE_GUEST + &x86_hyper_bhyve, +#endif }; enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 5c4eb28c3ac9..d6906442f49b 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -241,7 +241,8 @@ struct threshold_block { struct threshold_bank { struct kobject *kobj; - struct threshold_block *blocks; + /* List of threshold blocks within this MCA bank. 
*/ + struct list_head miscj; }; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); @@ -252,9 +253,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); */ static DEFINE_PER_CPU(u64, bank_map); -/* Map of banks that have more than MCA_MISC0 available. */ -static DEFINE_PER_CPU(u64, smca_misc_banks_map); - static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -264,28 +262,6 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) -{ - u32 low, high; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). - */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return; - - if (!(low & MCI_CONFIG_MCAX)) - return; - - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) - return; - - if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); - -} - static void smca_configure(unsigned int bank, unsigned int cpu) { u8 *bank_counts = this_cpu_ptr(smca_bank_counts); @@ -326,8 +302,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - smca_set_misc_banks_map(bank, cpu); - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -419,8 +393,8 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return true; }; -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) +/* Reprogram MCx_MISC MSR behind this threshold block. 
*/ +static void threshold_restart_block(void *_tr) { struct thresh_restart *tr = _tr; u32 hi, lo; @@ -478,7 +452,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) }; b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); }; static int setup_APIC_mce_threshold(int reserved, int new) @@ -525,18 +499,6 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block, - unsigned int cpu) -{ - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) - return 0; - - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -546,8 +508,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; - if (mce_flags.smca) - return smca_get_block_address(bank, block, cpu); + if (mce_flags.smca) { + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(low & MASK_BLKPTR_LO)) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } /* Fall back to method we used for older processors: */ switch (block) { @@ -677,6 +646,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) wrmsrq(MSR_K7_HWCR, hwcr); } +static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -684,6 +675,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + amd_apply_cpu_quirks(c); + + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) @@ -714,6 +708,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +void smca_bsp_init(void) +{ + mce_threshold_vector = amd_threshold_interrupt; + deferred_error_int_vector = amd_deferred_error_interrupt; +} + /* * DRAM ECC errors are reported in the Northbridge (bank 4) with * Extended Error Code 8. @@ -921,7 +921,7 @@ static void log_and_reset_block(struct threshold_block *block) /* Reset threshold block after logging error. 
*/ memset(&tr, 0, sizeof(tr)); tr.b = block; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); } /* @@ -930,9 +930,9 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - struct threshold_bank **bp = this_cpu_read(threshold_banks); + struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; unsigned int bank, cpu = smp_processor_id(); + struct threshold_block *block, *tmp; /* * Validate that the threshold bank has been initialized already. The @@ -946,20 +946,20 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; - first_block = bp[bank]->blocks; - if (!first_block) + thr_bank = bp[bank]; + if (!thr_bank) continue; - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. - */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) log_and_reset_block(block); } } +void amd_clear_bank(struct mce *m) +{ + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Sysfs Interface */ @@ -995,7 +995,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1020,7 +1020,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1181,13 +1181,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb default_attrs[2] = NULL; } - INIT_LIST_HEAD(&b->miscj); - - /* This is safe as @tb is not visible yet */ - if (tb->blocks) - list_add(&b->miscj, &tb->blocks->miscj); - else - tb->blocks = b; + list_add(&b->miscj, &tb->miscj); err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) @@ -1238,6 +1232,8 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } + INIT_LIST_HEAD(&b->miscj); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1258,26 +1254,15 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_blocks(struct threshold_bank *bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&bank->blocks->kobj); -} - -static void threshold_remove_bank(struct threshold_bank *bank) -{ - if (!bank->blocks) - goto out_free; - - deallocate_threshold_blocks(bank); - -out_free: kobject_put(bank->kobj); kfree(bank); } @@ -1296,12 +1281,12 @@ static void __threshold_remove_device(struct threshold_bank **bp) kfree(bp); } -int mce_threshold_remove_device(unsigned int cpu) +void mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); if (!bp) - return 0; + 
return; /* * Clear the pointer before cleaning up, so that the interrupt won't @@ -1310,7 +1295,7 @@ int mce_threshold_remove_device(unsigned int cpu) this_cpu_write(threshold_banks, NULL); __threshold_remove_device(bp); - return 0; + return; } /** @@ -1324,36 +1309,34 @@ int mce_threshold_remove_device(unsigned int cpu) * thread running on @cpu. The callback is invoked on all CPUs which are * online when the callback is installed or during a real hotplug event. */ -int mce_threshold_create_device(unsigned int cpu) +void mce_threshold_create_device(unsigned int cpu) { unsigned int numbanks, bank; struct threshold_bank **bp; - int err; if (!mce_flags.amd_threshold) - return 0; + return; bp = this_cpu_read(threshold_banks); if (bp) - return 0; + return; numbanks = this_cpu_read(mce_num_banks); bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) - return -ENOMEM; + return; for (bank = 0; bank < numbanks; ++bank) { if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; - err = threshold_create_bank(bp, cpu, bank); - if (err) { + if (threshold_create_bank(bp, cpu, bank)) { __threshold_remove_device(bp); - return err; + return; } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; + return; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4da4eab56c81..460e90a1a0b1 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrq(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrq(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -715,6 +715,60 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + +/* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. * @@ -765,51 +819,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(m); - /* If this entry is not valid, ignore it */ - if (!(m->status & MCI_STATUS_VAL)) + /* Verify that the error should be logged based on hardware conditions. 
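The check above folds what used to be a chain of goto labels into should_log_poll_error(). A standalone sketch of the same decision, with locally defined MCi_STATUS bits (the bit positions below follow the SDM's UCR classification and are an assumption of this sketch, not something spelled out in the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* MCi_STATUS bits, assumed here for illustration. */
#define ST_VAL  (1ULL << 63)
#define ST_UC   (1ULL << 61)
#define ST_EN   (1ULL << 60)
#define ST_PCC  (1ULL << 57)
#define ST_S    (1ULL << 56)

#define FLAG_MCP_UC 0x1	/* stand-in for the MCP_UC poll flag */

/* Same shape as should_log_poll_error() plus its SER helper. */
static bool should_log(unsigned int flags, uint64_t status, bool ser)
{
	if (!(status & ST_VAL))
		return false;			/* nothing latched in the bank */

	if ((flags & FLAG_MCP_UC) || !(status & ST_UC))
		return true;			/* boot scan, or a corrected error */

	if (!ser)
		return false;			/* UC without recovery support: skip */

	if (!(status & ST_EN))
		return true;			/* "not enabled" (speculative) error */

	/* UCNA: UC == 1 && PCC == 0 && S == 0 */
	return !(status & ST_PCC) && !(status & ST_S);
}

int main(void)
{
	uint64_t ucna = ST_VAL | ST_UC | ST_EN;	/* UC set, PCC/S clear */

	printf("UCNA logged on SER parts: %d\n", should_log(0, ucna, true));
	printf("UCNA logged w/o SER:      %d\n", should_log(0, ucna, false));
	return 0;
}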
*/ + if (!should_log_poll_error(flags, &err)) continue; - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. - */ - if (!mca_cfg.ser) { - if (m->status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m->status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) - goto log_it; - - /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. - */ - continue; - -log_it: - if (flags & MCP_DONTLOG) - goto clear_it; - mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* @@ -826,10 +839,7 @@ log_it: mce_log(&err); clear_it: - /* - * Clear state for this bank. - */ - mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* @@ -1810,9 +1820,10 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * Init them all by default. + * + * The required vendor quirks will be applied before + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1840,69 +1851,34 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - - cr4_set_bits(X86_CR4_MCE); - rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } -static void __mcheck_cpu_init_clear_banks(void) +static void __mcheck_cpu_init_prepare_banks(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; int i; - for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - struct mce_bank *b = &mce_banks[i]; + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. 
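With the separate __mcheck_cpu_check_banks() gone, the unused/RAZ probe now lives in __mcheck_cpu_init_prepare_banks(): write the desired MCi_CTL value, read it back, and mark the bank unusable if it reads as zero. A toy model of that write/read-back probe (the simulated msr_ctl[]/bank_raz[] arrays are pure illustration, not kernel interfaces):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_BANKS 4

/* Simulated MCi_CTL registers; bank 2 is modeled as read-as-zero. */
static uint64_t msr_ctl[NUM_BANKS];
static const bool bank_raz[NUM_BANKS] = { false, false, true, false };

static void wrmsr_ctl(int bank, uint64_t v)
{
	msr_ctl[bank] = bank_raz[bank] ? 0 : v;
}

static uint64_t rdmsr_ctl(int bank)
{
	return msr_ctl[bank];
}

struct bank { uint64_t ctl; bool init; };

int main(void)
{
	struct bank banks[NUM_BANKS];

	for (int i = 0; i < NUM_BANKS; i++) {
		banks[i].ctl  = ~0ULL;		/* enable everything by default */
		banks[i].init = true;

		wrmsr_ctl(i, banks[i].ctl);
		banks[i].init = !!rdmsr_ctl(i);	/* RAZ bank -> not usable */

		printf("bank %d usable: %d\n", i, banks[i].init);
	}
	return 0;
}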
- * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. - * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. - */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1910,25 +1886,16 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -static void apply_quirks_amd(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* * Lots of broken BIOS around that don't clear them @@ -1938,13 +1905,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) } /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks)) - mce_banks[0].ctl = 0; - - /* * overflow_recov is supported for F15h Models 00h-0fh * even though we don't have a CPUID bit for it. */ @@ -1955,26 +1915,13 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mce_flags.zen_ifu_quirk = 1; } -static void apply_quirks_intel(struct cpuinfo_x86 *c) +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* Older CPUs (prior to family 6) don't need quirks. */ if (c->x86_vfm < INTEL_PENTIUM_PRO) return; /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) - mce_banks[0].init = false; - - /* * All newer Intel systems support MCE broadcasting. Enable * synchronization with a one second timeout. */ @@ -1999,7 +1946,7 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) mce_flags.skx_repmov_quirk = 1; } -static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) { /* * All newer Zhaoxin CPUs support MCE broadcasting. 
Enable @@ -2011,34 +1958,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) } } -/* Add per CPU specific workarounds here */ -static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) -{ - struct mca_config *cfg = &mca_cfg; - - switch (c->x86_vendor) { - case X86_VENDOR_UNKNOWN: - pr_info("unknown CPU type - not enabling MCE support\n"); - return false; - case X86_VENDOR_AMD: - apply_quirks_amd(c); - break; - case X86_VENDOR_INTEL: - apply_quirks_intel(c); - break; - case X86_VENDOR_ZHAOXIN: - apply_quirks_zhaoxin(c); - break; - } - - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; - - return true; -} - static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) @@ -2060,19 +1979,6 @@ static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return false; } -/* - * Init basic CPU features needed for early decoding of MCEs. - */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } -} - static void mce_centaur_feature_init(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; @@ -2281,6 +2187,53 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + if (mce_flags.smca) + smca_bsp_init(); + + rdmsrq(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; +} + /* * Called for each booted CPU to set up machine checks. 
* Must be called with preempt off: @@ -2298,11 +2251,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (!__mcheck_cpu_apply_quirks(c)) { - mca_cfg.disabled = 1; - return; - } - if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); @@ -2311,12 +2259,11 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2483,7 +2430,8 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); + cr4_set_bits(X86_CR4_MCE); } static struct syscore_ops mce_syscore_ops = { @@ -2501,8 +2449,9 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 9b149b9c4109..4655223ba560 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -468,8 +468,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c) } } +static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + * + * Older CPUs (prior to family 6) can't reach this point and already + * return early due to the check of __mcheck_cpu_ancient_init(). 
+ */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + this_cpu_ptr(mce_banks_array)[0].init = false; +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { + intel_apply_cpu_quirks(c); intel_init_cmci(); intel_init_lmce(); intel_imc_init(c); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b5ba598e54cb..b0e00ec5cc8c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -265,8 +265,11 @@ void mce_prep_record_common(struct mce *m); void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD +void mce_threshold_create_device(unsigned int cpu); +void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); +void amd_clear_bank(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -292,10 +295,15 @@ static __always_inline void smca_extract_err_addr(struct mce *m) m->addr &= GENMASK_ULL(55, lsb); } +void smca_bsp_init(void); #else +static inline void mce_threshold_create_device(unsigned int cpu) { } +static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } +static inline void amd_clear_bank(struct mce *m) { } static inline void smca_extract_err_addr(struct mce *m) { } +static inline void smca_bsp_init(void) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -313,6 +321,7 @@ static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrq(u32 msr); +noinstr void mce_wrmsrq(u32 msr, u64 v); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 514f63340880..cdce885e2fd5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -269,15 +269,6 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi return true; } -static u32 get_patch_level(void) -{ - u32 rev, dummy __always_unused; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - return rev; -} - static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) { union zen_patch_rev p; @@ -295,6 +286,30 @@ static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) return c; } +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + int cpu = smp_processor_id(); + + if (!microcode_rev[cpu]) { + if (!base_rev) + base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + + microcode_rev[cpu] = base_rev; + + ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev); + } + + return microcode_rev[cpu]; + } + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; @@ -324,13 +339,13 @@ static bool verify_container(const u8 *buf, size_t buf_size) u32 cont_magic; if (buf_size <= CONTAINER_HDR_SZ) { - pr_debug("Truncated microcode container header.\n"); + ucode_dbg("Truncated microcode container header.\n"); return false; } cont_magic = *(const u32 *)buf; if (cont_magic != UCODE_MAGIC) { - pr_debug("Invalid magic value (0x%08x).\n", cont_magic); + ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic); return false; } @@ -355,8 +370,8 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) 
cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { - pr_debug("Wrong microcode container equivalence table type: %u.\n", - cont_type); + ucode_dbg("Wrong microcode container equivalence table type: %u.\n", + cont_type); return false; } @@ -365,7 +380,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) equiv_tbl_len = hdr[2]; if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || buf_size < equiv_tbl_len) { - pr_debug("Truncated equivalence table.\n"); + ucode_dbg("Truncated equivalence table.\n"); return false; } @@ -385,7 +400,7 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize const u32 *hdr; if (buf_size < SECTION_HDR_SIZE) { - pr_debug("Truncated patch section.\n"); + ucode_dbg("Truncated patch section.\n"); return false; } @@ -394,13 +409,13 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize p_size = hdr[1]; if (p_type != UCODE_UCODE_TYPE) { - pr_debug("Invalid type field (0x%x) in container file section header.\n", - p_type); + ucode_dbg("Invalid type field (0x%x) in container file section header.\n", + p_type); return false; } if (p_size < sizeof(struct microcode_header_amd)) { - pr_debug("Patch of size %u too short.\n", p_size); + ucode_dbg("Patch of size %u too short.\n", p_size); return false; } @@ -477,12 +492,12 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) * size sh_psize, as the section claims. */ if (buf_size < sh_psize) { - pr_debug("Patch of size %u truncated.\n", sh_psize); + ucode_dbg("Patch of size %u truncated.\n", sh_psize); return -1; } if (!__verify_patch_size(sh_psize, buf_size)) { - pr_debug("Per-family patch size mismatch.\n"); + ucode_dbg("Per-family patch size mismatch.\n"); return -1; } @@ -496,6 +511,9 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) proc_id = mc_hdr->processor_rev_id; patch_fam = 0xf + (proc_id >> 12); + + ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); + if (patch_fam != family) return 1; @@ -566,9 +584,14 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + + ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id); + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; + + ucode_dbg(" match: size: %d\n", patch_size); } skip: @@ -639,8 +662,14 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, invlpg(p_addr_end); } + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) + microcode_rev[smp_processor_id()] = mc->hdr.patch_id; + /* verify patch application was successful */ *cur_rev = get_patch_level(); + + ucode_dbg("updated rev: 0x%x\n", *cur_rev); + if (*cur_rev != mc->hdr.patch_id) return false; @@ -1026,7 +1055,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", + ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. 
*/ @@ -1169,7 +1198,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); if (request_firmware_direct(&fw, (const char *)fw_name, device)) { - pr_debug("failed to load file %s\n", fw_name); + ucode_dbg("failed to load file %s\n", fw_name); goto out; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b92e09a87c69..f75c140906d0 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,10 +43,19 @@ #include "internal.h" static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr = false; +static bool dis_ucode_ldr; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); -module_param(force_minrev, bool, S_IRUSR | S_IWUSR); + +/* + * Those below should be behind CONFIG_MICROCODE_DBG ifdeffery but in + * order to not uglify the code with ifdeffery and use IS_ENABLED() + * instead, leave them in. When microcode debugging is not enabled, + * those are meaningless anyway. + */ +/* base microcode revision for debugging */ +u32 base_rev; +u32 microcode_rev[NR_CPUS] = {}; /* * Synchronization. @@ -119,20 +128,48 @@ bool __init microcode_loader_disabled(void) * overwritten. */ if (!cpuid_feature() || - native_cpuid_ecx(1) & BIT(31) || + ((native_cpuid_ecx(1) & BIT(31)) && + !IS_ENABLED(CONFIG_MICROCODE_DBG)) || amd_check_current_patch_level()) dis_ucode_ldr = true; return dis_ucode_ldr; } +static void early_parse_cmdline(void) +{ + char cmd_buf[64] = {}; + char *s, *p = cmd_buf; + + if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) { + while ((s = strsep(&p, ","))) { + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + if (strstr(s, "base_rev=")) { + /* advance to the option arg */ + strsep(&s, "="); + if (kstrtouint(s, 16, &base_rev)) { ; } + } + } + + if (!strcmp("force_minrev", s)) + force_minrev = true; + + if (!strcmp(s, "dis_ucode_ldr")) + dis_ucode_ldr = true; + } + } + + /* old, compat option */ + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; +} + void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; bool intel = true; - if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) - dis_ucode_ldr = true; + early_parse_cmdline(); if (microcode_loader_disabled()) return; diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h index cb6e601701ab..2d48e6593540 100644 --- a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -67,9 +67,8 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 
0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, @@ -81,51 +80,62 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 }, { .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, 
.family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 }, +{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, 
.steppings = 0x0002, .driver_data = 0x210002a9 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 50a9702ae4e2..ae8dbc2b908d 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -44,6 +44,9 @@ struct early_load_data { extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; +extern u32 microcode_rev[NR_CPUS]; +extern u32 base_rev; + struct cpio_data find_microcode_in_initrd(const char *path); #define MAX_UCODE_COUNT 128 @@ -122,4 +125,10 @@ static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } #endif /* !CONFIG_CPU_SUP_INTEL */ +#define ucode_dbg(fmt, ...) \ +({ \ + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \ + pr_info(fmt, ##__VA_ARGS__); \ +}) + #endif /* _X86_MICROCODE_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 187d527ef73b..06ca5a30140c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86. 
num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) @@ -516,6 +520,9 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = container_of(hdr, struct rdt_mon_domain, hdr); cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); return; } @@ -535,9 +542,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } @@ -707,6 +718,7 @@ enum { RDT_FLAG_MBA, RDT_FLAG_SMBA, RDT_FLAG_BMEC, + RDT_FLAG_ABMC, }; #define RDT_OPT(idx, n, f) \ @@ -732,6 +744,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), + RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -863,15 +876,24 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool ret = false; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) - rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + 
resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_ABMC)) + ret = true; - if (!rdt_mon_features) + if (!ret) return false; return !rdt_get_mon_l3_config(r); @@ -965,7 +987,7 @@ static enum cpuhp_state rdt_online; /* Runs once on the BSP during boot. */ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -977,7 +999,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5e3c41b36437..9f4c2f0aaf5c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,15 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + +/* + * Qos Event Identifiers. + */ +#define ABMC_EXTENDED_EVT_ID BIT(31) +#define ABMC_EVT_ID BIT(0) + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -54,15 +63,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. */ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) @@ -102,6 +111,7 @@ struct msr_param { * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -115,6 +125,7 @@ struct rdt_hw_resource { unsigned int mon_scale; unsigned int mbm_width; bool cdp_enabled; + bool mbm_cntr_assign_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) @@ -159,6 +170,42 @@ union cpuid_0x10_x_edx { unsigned int full; }; +/* + * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG. + * + * @bw_type : Event configuration that represents the memory + * transactions being tracked by the @cntr_id. 
+ * @bw_src : Bandwidth source (RMID or CLOSID). + * @reserved1 : Reserved. + * @is_clos : @bw_src field is a CLOSID (not an RMID). + * @cntr_id : Counter identifier. + * @reserved : Reserved. + * @cntr_en : Counting enable bit. + * @cfg_en : Configuration enable bit. + * + * Configuration and counting: + * Counter can be configured across multiple writes to MSR. Configuration + * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the + * configuration is applied. + * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not + * count events. + * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start + * counting events. + */ +union l3_qos_abmc_cfg { + struct { + unsigned long bw_type :32, + bw_src :12, + reserved1: 3, + is_clos : 1, + cntr_id : 5, + reserved : 9, + cntr_en : 1, + cfg_en : 1; + } split; + unsigned long full; +}; + void rdt_ctrl_update(void *arg); int rdt_get_mon_l3_config(struct rdt_resource *r); @@ -168,5 +215,6 @@ bool rdt_cpu_has(int flag); void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c261558276cd..c8945610d455 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -31,11 +31,6 @@ */ bool rdt_mon_capable; -/* - * Global to indicate which monitoring events are enabled. - */ -unsigned int rdt_mon_features; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; @@ -135,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -166,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: - return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - default: - /* Never expect to get here */ - WARN_ON_ONCE(1); + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - } + + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; + + return state ? 
&state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -206,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - - if (resctrl_arch_is_mbm_total_enabled()) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (resctrl_arch_is_mbm_local_enabled()) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) @@ -224,15 +217,33 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 msr_val) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct arch_mbm_state *am; + u64 chunks; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } + + return chunks * hw_res->mon_scale; +} + int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int cpu = cpumask_any(&d->hdr.cpu_mask); - struct arch_mbm_state *am; - u64 msr_val, chunks; + u64 msr_val; u32 prmid; int ret; @@ -243,17 +254,76 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, if (ret) return ret; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); + + return 0; +} + +static int __cntr_id_read(u32 cntr_id, u64 *val) +{ + u64 msr_val; + + /* + * QM_EVTSEL Register definition: + * ======================================================= + * Bits Mnemonic Description + * ======================================================= + * 63:44 -- Reserved + * 43:32 RMID RMID or counter ID in ABMC mode + * when reading an MBM event + * 31 ExtendedEvtID Extended Event Identifier + * 30:8 -- Reserved + * 7:0 EvtID Event Identifier + * ======================================================= + * The contents of a specific counter can be read by setting the + * following fields in QM_EVTSEL.ExtendedEvtID(=1) and + * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID + * to the desired counter ID. Reading the QM_CTR then returns the + * contents of the specified counter. The RMID_VAL_ERROR bit is set + * if the counter configuration is invalid, or if an invalid counter + * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit + * is set if the counter data is unavailable. 
+ */ + wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + *val = msr_val; + return 0; +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct arch_mbm_state *am; + am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { - am->chunks += mbm_overflow_count(am->prev_msr, msr_val, - hw_res->mbm_width); - chunks = get_corrected_mbm_count(rmid, am->chunks); - am->prev_msr = msr_val; - } else { - chunks = msr_val; + memset(am, 0, sizeof(*am)); + + /* Record any initial, non-zero count value. */ + __cntr_id_read(cntr_id, &am->prev_msr); } +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + u64 msr_val; + int ret; + + ret = __cntr_id_read(cntr_id, &msr_val); + if (ret) + return ret; - *val = chunks * hw_res->mon_scale; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); return 0; } @@ -346,12 +416,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -366,7 +437,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -375,12 +446,17 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + } + + if (rdt_cpu_has(X86_FEATURE_ABMC)) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + hw_res->mbm_cntr_assign_enabled = true; } r->mon_capable = true; @@ -401,3 +477,91 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. 
+ */ +static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, + &enable, 1); + resctrl_arch_reset_rmid_all(r, d); + } +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (r->mon.mbm_cntr_assignable && + hw_res->mbm_cntr_assign_enabled != enable) { + _resctrl_abmc_enable(r, enable); + hw_res->mbm_cntr_assign_enabled = enable; + } + + return 0; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; +} + +static void resctrl_abmc_config_one_amd(void *info) +{ + union l3_qos_abmc_cfg *abmc_cfg = info; + + wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); +} + +/* + * Send an IPI to the domain to assign the counter to RMID, event pair. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + union l3_qos_abmc_cfg abmc_cfg = { 0 }; + struct arch_mbm_state *am; + + abmc_cfg.split.cfg_en = 1; + abmc_cfg.split.cntr_en = assign ? 1 : 0; + abmc_cfg.split.cntr_id = cntr_id; + abmc_cfg.split.bw_src = rmid; + if (assign) + abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); + + smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); + + /* + * The hardware counter is reset (because cfg_en == 1) so there is no + * need to record initial non-zero counts. + */ + am = get_arch_mbm_state(hw_dom, rmid, evtid); + if (am) + memset(am, 0, sizeof(*am)); +} + +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled); +} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 6b868afb26c3..4cee6213d667 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -51,6 +51,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index c79ebbb639cb..6ac097e13106 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -59,7 +59,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) +static bool parse_8000_001e(struct topo_scan *tscan) { struct { // eax @@ -85,7 +85,7 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) * If leaf 0xb/0x26 is available, then the APIC ID and the domain * shifts are set already. 
*/ - if (!has_topoext) { + if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) { tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* @@ -163,11 +163,12 @@ static void topoext_fixup(struct topo_scan *tscan) c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) return; - if (msr_set_bit(0xc0011005, 54) <= 0) + if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT, + MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0) return; - rdmsrq(0xc0011005, msrval); - if (msrval & BIT_64(54)) { + rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval); + if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } @@ -175,30 +176,27 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); + /* * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to * get SMT and CORE shift from leaf 0xb. If either leaf is * available, cpu_parse_topology_ext() will return true. - */ - bool has_xtopology = cpu_parse_topology_ext(tscan); - - if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) - tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - - /* + * * If XTOPOLOGY leaves (0x26/0xb) are not available, try to * get the CORE shift from leaf 0x8000_0008 first. */ - if (!has_xtopology && !parse_8000_0008(tscan)) + if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan)) return; /* * Prefer leaf 0x8000001e if available to get the SMT shift and * the initial APIC ID if XTOPOLOGY leaves are not available. */ - if (parse_8000_001e(tscan, has_xtopology)) + if (parse_8000_001e(tscan)) return; /* Try the NODEID MSR */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 533fcf5636fc..fd28b53dbac5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -52,10 +52,13 @@ SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); unsigned int __pgtable_l5_enabled __ro_after_init; +SYM_PIC_ALIAS(__pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); +SYM_PIC_ALIAS(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); +SYM_PIC_ALIAS(ptrs_per_p4d); unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); @@ -316,5 +319,5 @@ void early_setup_idt(void) handler = vc_boot_ghcb; } - startup_64_load_idt(handler); + __pi_startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 76743dfad6ab..80ef5d386b03 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -61,7 +61,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. 
*/ -__HEAD + __INIT SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx @@ -136,6 +136,9 @@ SYM_CODE_END(startup_32) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ +#ifdef CONFIG_HOTPLUG_CPU + .text +#endif SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3e9b3a3bd039..21816b48537c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -33,7 +33,7 @@ * because we need identity-mapped pages. */ - __HEAD + __INIT .code64 SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_END_OF_STACK @@ -71,7 +71,7 @@ SYM_CODE_START_NOALIGN(startup_64) xorl %edx, %edx wrmsr - call startup_64_setup_gdt_idt + call __pi_startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(startup_64) * subsequent code. Pass the boot_params pointer as the first argument. */ movq %r15, %rdi - call sme_enable + call __pi_sme_enable #endif /* Sanitize CPU configuration */ @@ -111,7 +111,7 @@ SYM_CODE_START_NOALIGN(startup_64) * programmed into CR3. */ movq %r15, %rsi - call __startup_64 + call __pi___startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ leaq early_top_pgt(%rip), %rcx @@ -562,7 +562,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Call C handler */ movq %rsp, %rdi movq ORIG_RAX(%rsp), %rsi - call do_vc_no_ghcb + call __pi_do_vc_no_ghcb /* Unwind pt_regs */ POP_REGS diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 5eba6c5a6775..978232b6d48d 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void) return ssp; } +int shstk_pop(u64 *val) +{ + int ret = 0; + u64 ssp; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + if (val && get_user(*val, (__user u64 *)ssp)) + ret = -EFAULT; + else + wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE); + fpregs_unlock(); + + return ret; +} + +int shstk_push(u64 val) +{ + u64 ssp; + int ret; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + ssp -= SS_FRAME_SIZE; + ret = write_user_shstk_64((__user void *)ssp, val); + if (!ret) + wrmsrq(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return ret; +} + #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33e166f6ab12..eb289abece23 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -479,14 +479,14 @@ static int x86_cluster_flags(void) static bool x86_has_numa_in_package; static struct sched_domain_topology_level x86_topology[] = { - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), + SDTL_INIT(tl_mc_mask, x86_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), { NULL }, }; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5a4b21389b1d..d432f3824f0c 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -156,15 +156,26 @@ static int 
identify_insn(struct insn *insn) if (!insn->modrm.nbytes) return -EINVAL; - /* All the instructions of interest start with 0x0f. */ - if (insn->opcode.bytes[0] != 0xf) + /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */ + if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf) return -EINVAL; if (insn->opcode.bytes[1] == 0x1) { switch (X86_MODRM_REG(insn->modrm.value)) { case 0: + /* The reg form of 0F 01 /0 encodes VMX instructions. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SGDT; case 1: + /* + * The reg form of 0F 01 /1 encodes MONITOR/MWAIT, + * STAC/CLAC, and ENCLS. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SIDT; case 4: return UMIP_INST_SMSW; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6d383839e839..845aeaf36b8d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include <asm/processor.h> #include <asm/insn.h> #include <asm/mmu_context.h> +#include <asm/nops.h> /* Post-execution fixups. */ @@ -310,25 +311,32 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool #ifdef CONFIG_X86_64 +struct uretprobe_syscall_args { + unsigned long r11; + unsigned long cx; + unsigned long ax; +}; + asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" "uretprobe_trampoline_entry:\n" - "pushq %rax\n" - "pushq %rcx\n" - "pushq %r11\n" - "movq $" __stringify(__NR_uretprobe) ", %rax\n" + "push %rax\n" + "push %rcx\n" + "push %r11\n" + "mov $" __stringify(__NR_uretprobe) ", %rax\n" "syscall\n" ".global uretprobe_syscall_check\n" "uretprobe_syscall_check:\n" - "popq %r11\n" - "popq %rcx\n" - - /* The uretprobe syscall replaces stored %rax value with final + "pop %r11\n" + "pop %rcx\n" + /* + * The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. */ - "retq\n" + "ret\n" + "int3\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" @@ -338,7 +346,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); @@ -365,7 +373,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp) SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); - unsigned long err, ip, sp, r11_cx_ax[3], tramp; + struct uretprobe_syscall_args args; + unsigned long err, ip, sp, tramp; /* If there's no trampoline, we are called from wrong place. 
*/ tramp = uprobe_get_trampoline_vaddr(); @@ -376,15 +385,15 @@ SYSCALL_DEFINE0(uretprobe) if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; - err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); if (err) goto sigill; /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ - regs->r11 = r11_cx_ax[0]; - regs->cx = r11_cx_ax[1]; - regs->ax = r11_cx_ax[2]; - regs->sp += sizeof(r11_cx_ax); + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ax = args.ax; + regs->sp += sizeof(args); regs->orig_ax = -1; ip = regs->ip; @@ -400,21 +409,21 @@ SYSCALL_DEFINE0(uretprobe) */ if (regs->sp != sp || shstk_is_enabled()) return regs->ax; - regs->sp -= sizeof(r11_cx_ax); + regs->sp -= sizeof(args); /* for the case uprobe_consumer has changed r11/cx */ - r11_cx_ax[0] = regs->r11; - r11_cx_ax[1] = regs->cx; + args.r11 = regs->r11; + args.cx = regs->cx; /* * ax register is passed through as return value, so we can use * its space on stack for ip value and jump to it through the * trampoline's ret instruction */ - r11_cx_ax[2] = regs->ip; + args.ax = regs->ip; regs->ip = ip; - err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); if (err) goto sigill; @@ -608,6 +617,581 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_trampoline(unsigned long vaddr) +{ + struct vm_unmapped_area_info info = { + .length = PAGE_SIZE, + .align_mask = ~PAGE_MASK, + }; + unsigned long low_limit, high_limit; + unsigned long low_tramp, high_tramp; + unsigned long call_end = vaddr + 5; + + if (check_add_overflow(call_end, INT_MIN, &low_limit)) + low_limit = PAGE_SIZE; + + high_limit = call_end + INT_MAX; + + /* Search up from the caller address. */ + info.low_limit = call_end; + info.high_limit = min(high_limit, TASK_SIZE); + high_tramp = vm_unmapped_area(&info); + + /* Search down from the caller address. */ + info.low_limit = max(low_limit, PAGE_SIZE); + info.high_limit = call_end; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + low_tramp = vm_unmapped_area(&info); + + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) + return -ENOMEM; + if (IS_ERR_VALUE(high_tramp)) + return low_tramp; + if (IS_ERR_VALUE(low_tramp)) + return high_tramp; + + /* Return address that's closest to the caller address. 
*/ + if (call_end - low_tramp < high_tramp - call_end) + return low_tramp; + return high_tramp; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + + if (!user_64bit_mode(regs)) + return NULL; + + vaddr = find_nearest_trampoline(vaddr); + if (IS_ERR_VALUE(vaddr)) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + tramp->vaddr = vaddr; + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + &tramp_mapping); + if (IS_ERR(vma)) { + kfree(tramp); + return NULL; + } + return tramp; +} + +static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) +{ + struct uprobes_state *state = ¤t->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE) + return NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) { + *new = false; + return tramp; + } + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + *new = true; + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + /* + * We do not unmap and release uprobe trampoline page itself, + * because there's no easy way to make sure none of the threads + * is still inside the trampoline. + */ + hlist_del(&tramp->node); + kfree(tramp); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +/* + * See uprobe syscall trampoline; the call to the trampoline will push + * the return address on the stack, the trampoline itself then pushes + * cx, r11 and ax. + */ +struct uprobe_syscall_args { + unsigned long ax; + unsigned long r11; + unsigned long cx; + unsigned long retaddr; +}; + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + struct uprobe_syscall_args args; + unsigned long ip, sp, sret; + int err; + + /* Allow execution only from uprobe trampolines. 
*/ + if (!in_uprobe_trampoline(regs->ip)) + return -ENXIO; + + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address, call saved next instruction address + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = args.ax; + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ip = args.retaddr - 5; + regs->sp += sizeof(args); + regs->orig_ax = -1; + + sp = regs->sp; + + err = shstk_pop((u64 *)&sret); + if (err == -EFAULT || (!err && sret != args.retaddr)) + goto sigill; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * Some of the uprobe consumers have changed sp; we can do nothing, + * just return via iret. + */ + if (regs->sp != sp) { + /* skip the trampoline call */ + if (args.retaddr - 5 == regs->ip) + regs->ip += 5; + return regs->ax; + } + + regs->sp -= sizeof(args); + + /* for the case uprobe_consumer has changed ax/r11/cx */ + args.ax = regs->ax; + args.r11 = regs->r11; + args.cx = regs->cx; + + /* keep return address unless we are instructed otherwise */ + if (args.retaddr - 5 != regs->ip) + args.retaddr = regs->ip; + + if (shstk_push(args.retaddr) == -EFAULT) + goto sigill; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "mov $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + "int3\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static int __init arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return 0; +} + +late_initcall(arch_uprobes_init); + +enum { + EXPECT_SWBP, + EXPECT_CALL, +}; + +struct write_opcode_ctx { + unsigned long base; + int expect; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +/* + * Verification callback used by int3_update uprobe_write calls to make sure + * the underlying instruction is as expected - either int3 or call. + */ +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->expect) { + case EXPECT_SWBP: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case EXPECT_CALL: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +/* + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish) + * + * The way it is done: + * - Add an INT3 trap to the address that will be patched + * - SMP sync all CPUs + * - Update all but the first byte of the patched range + * - SMP sync all CPUs + * - Replace the first byte (INT3) by the first byte of the replacing opcode + * - SMP sync all CPUs + */ +static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, char *insn, bool optimize) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * Write int3 trap. + * + * The swbp_optimize path comes with breakpoint already installed, + * so we can skip this step for optimize == true. + */ + if (!optimize) { + ctx.expect = EXPECT_CALL; + err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + } + + smp_text_poke_sync_each_cpu(); + + /* Write all but the first byte of the patched range. */ + ctx.expect = EXPECT_SWBP; + err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + /* + * Write first byte. + * + * The swbp_unoptimize needs to finish uprobe removal together + * with ref_ctr update, using uprobe_write with proper flags. + */ + err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, + optimize /* is_register */, !optimize /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + return 0; +} + +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, unsigned long tramp) +{ + u8 call[5]; + + __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, + (const void *) tramp, CALL_INSN_SIZE); + return int3_update(auprobe, vma, vaddr, call, true /* optimize */); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + return __is_optimized((uprobe_opcode_t *)&insn, vaddr); +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + /* + * We could race with another thread that already optimized the probe, + * so let's not overwrite it 
with int3 again in this case. + */ + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) + return 0; + } + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, + true /* is_register */); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) { + ret = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(ret); + return ret; + } + } + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, + false /* is_register */); +} + +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + bool new = false; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -EINVAL; + tramp = get_uprobe_trampoline(vaddr, &new); + if (!tramp) + return -EINVAL; + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err) && new) + destroy_uprobe_trampoline(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us, + * if it's the case just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe we set the fail bit so the + * above should_optimize will fail from now on. + */ + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool insn_is_nop(struct insn *insn) +{ + return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; +} + +static bool insn_is_nopl(struct insn *insn) +{ + if (insn->opcode.nbytes != 2) + return false; + + if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) + return false; + + if (!insn->modrm.nbytes) + return false; + + if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) + return false; + + /* 0f 1f /0 - NOPL */ + return true; +} + +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + if (!insn->x86_64 || insn->length != 5) + return false; + + if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + return false; + + /* We can't do cross page atomic writes yet. 
*/ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -621,6 +1205,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -979,14 +1567,17 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - struct insn insn; u8 fix_ip_or_call = UPROBE_FIX_IP; + struct insn insn; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; + if (can_optimize(&insn, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4fa0be732af1..d7af4a64c211 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -160,11 +160,6 @@ SECTIONS } :text = 0xcccccccc - /* bootstrapping code */ - .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { - HEAD_TEXT - } :text = 0xcccccccc - /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); @@ -227,6 +222,8 @@ SECTIONS */ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { *(.altinstr_aux) + . = ALIGN(PAGE_SIZE); + __inittext_end = .; } INIT_DATA_SECTION(16) @@ -535,3 +532,5 @@ xen_elfnote_entry_value = xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); #endif + +#include "../boot/startup/exports.h" diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ad89d0bd6005..103604c4b33b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -13,7 +13,7 @@ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) -/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) \ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index b0f3b2a62ae2..a5cafd402cfd 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 149a57e334ab..225af1399c9d 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ found: } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always 
POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is @@ -226,13 +229,13 @@ found: if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index d78d769a02bd..f513d33b6d37 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -15,7 +15,6 @@ .section .text..__x86.indirect_thunk - .macro POLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ @@ -73,6 +72,7 @@ SYM_CODE_END(__x86_indirect_thunk_array) #undef GEN #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING + .macro CALL_THUNK reg .align RETPOLINE_THUNK_SIZE @@ -126,7 +126,45 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) #define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg) #include <asm/GEN-for-each-reg.h> #undef GEN -#endif + +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ + +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. 
+ */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 29, 0xcc + +#define GEN(reg) ITS_THUNK reg +#include <asm/GEN-for-each-reg.h> +#undef GEN + + .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) +SYM_CODE_END(__x86_indirect_its_thunk_array) + +#endif /* CONFIG_MITIGATION_ITS */ #ifdef CONFIG_MITIGATION_RETHUNK @@ -370,39 +408,6 @@ SYM_FUNC_END(call_depth_return_thunk) #ifdef CONFIG_MITIGATION_ITS -.macro ITS_THUNK reg - -/* - * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) - * that complete the fineibt_paranoid caller sequence. - */ -1: .byte 0xea -SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_UNDEFINED - ANNOTATE_NOENDBR - jne 1b -SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_UNDEFINED - ANNOTATE_NOENDBR - ANNOTATE_RETPOLINE_SAFE - jmp *%\reg - int3 - .align 32, 0xcc /* fill to the end of the line */ - .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ -.endm - -/* ITS mitigation requires thunks be aligned to upper half of cacheline */ -.align 64, 0xcc -.skip 29, 0xcc - -#define GEN(reg) ITS_THUNK reg -#include <asm/GEN-for-each-reg.h> -#undef GEN - - .align 64, 0xcc -SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) -SYM_CODE_END(__x86_indirect_its_thunk_array) - .align 64, 0xcc .skip 32, 0xcc SYM_CODE_START(its_return_thunk) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb95..2a4e69ecc2de 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. +# (xop): this opcode accepts XOP prefix. 
+# +# XOP Superscripts +# (W=0): this opcode requires XOP.W == 0 +# (W=1): this opcode requires XOP.W == 1 # # Last Prefix Superscripts # - (66): the last prefix is 0x66 @@ -194,7 +199,7 @@ AVXcode: 8c: MOV Ev,Sw 8d: LEA Gv,M 8e: MOV Sw,Ew -8f: Grp1A (1A) | POP Ev (d64) +8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix) # 0x90 - 0x9f 90: NOP | PAUSE (F3) | XCHG r8,rAX 91: XCHG rCX/r9,rAX @@ -1106,6 +1111,84 @@ AVXcode: 7 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) EndTable +# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5 +Table: XOP map 8h +Referrer: +XOPcode: 0 +85: VPMACSSWW Vo,Ho,Wo,Lo +86: VPMACSSWD Vo,Ho,Wo,Lo +87: VPMACSSDQL Vo,Ho,Wo,Lo +8e: VPMACSSDD Vo,Ho,Wo,Lo +8f: VPMACSSDQH Vo,Ho,Wo,Lo +95: VPMACSWW Vo,Ho,Wo,Lo +96: VPMACSWD Vo,Ho,Wo,Lo +97: VPMACSDQL Vo,Ho,Wo,Lo +9e: VPMACSDD Vo,Ho,Wo,Lo +9f: VPMACSDQH Vo,Ho,Wo,Lo +a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1) +a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1) +a6: VPMADCSSWD Vo,Ho,Wo,Lo +b6: VPMADCSWD Vo,Ho,Wo,Lo +c0: VPROTB Vo,Wo,Ib +c1: VPROTW Vo,Wo,Ib +c2: VPROTD Vo,Wo,Ib +c3: VPROTQ Vo,Wo,Ib +cc: VPCOMccB Vo,Ho,Wo,Ib +cd: VPCOMccW Vo,Ho,Wo,Ib +ce: VPCOMccD Vo,Ho,Wo,Ib +cf: VPCOMccQ Vo,Ho,Wo,Ib +ec: VPCOMccUB Vo,Ho,Wo,Ib +ed: VPCOMccUW Vo,Ho,Wo,Ib +ee: VPCOMccUD Vo,Ho,Wo,Ib +ef: VPCOMccUQ Vo,Ho,Wo,Ib +EndTable + +Table: XOP map 9h +Referrer: +XOPcode: 1 +01: GrpXOP1 +02: GrpXOP2 +12: GrpXOP3 +80: VFRCZPS Vx,Wx +81: VFRCZPD Vx,Wx +82: VFRCZSS Vq,Wss +83: VFRCZSD Vq,Wsd +90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +91: VPROTW Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +92: VPROTD Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +93: VPROTQ Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1) +95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1) +96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1) +97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1) +98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1) +99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1) +9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1) +9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1) +c1: VPHADDBW Vo,Wo +c2: VPHADDBD Vo,Wo +c3: VPHADDBQ Vo,Wo +c6: VPHADDWD Vo,Wo +c7: VPHADDWQ Vo,Wo +cb: VPHADDDQ Vo,Wo +d1: VPHADDUBWD Vo,Wo +d2: VPHADDUBD Vo,Wo +d3: VPHADDUBQ Vo,Wo +d6: VPHADDUWD Vo,Wo +d7: VPHADDUWQ Vo,Wo +db: VPHADDUDQ Vo,Wo +e1: VPHSUBBW Vo,Wo +e2: VPHSUBWD Vo,Wo +e3: VPHSUBDQ Vo,Wo +EndTable + +Table: XOP map Ah +Referrer: +XOPcode: 2 +10: BEXTR Gy,Ey,Id +12: GrpXOP4 +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1320,3 +1403,29 @@ GrpTable: GrpRNG 4: xcrypt-cfb 5: xcrypt-ofb EndTable + +# GrpXOP1-4 is shown in AMD APM Vol.3 Appendix A as XOP group #1-4 +GrpTable: GrpXOP1 +1: BLCFILL By,Ey (xop) +2: BLSFILL By,Ey (xop) +3: BLCS By,Ey (xop) +4: TZMSK By,Ey (xop) +5: BLCIC By,Ey (xop) +6: BLSIC By,Ey (xop) +7: T1MSKC By,Ey (xop) +EndTable + +GrpTable: GrpXOP2 +1: BLCMSK By,Ey (xop) +6: BLCI By,Ey (xop) +EndTable + +GrpTable: GrpXOP3 +0: LLWPCB Ry (xop) +1: SLWPCB Ry (xop) +EndTable + +GrpTable: GrpXOP4 +0: LWPINS By,Ed,Id (xop) +1: LWPVAL By,Ed,Id (xop) +EndTable diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index faf3a13fb6ba..2f8c32173972 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -536,12 +536,6 @@ void __init sme_early_init(void) x86_init.resources.dmi_setup = snp_dmi_setup; } - /* - * Switch the SVSM CA mapping (if active) from identity mapped to - * kernel mapped. 
- */ - snp_update_svsm_ca(); - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index f8a33b25ae86..edbf9c998848 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -16,7 +16,7 @@ .text .code64 -SYM_FUNC_START(sme_encrypt_execute) +SYM_FUNC_START(__pi_sme_encrypt_execute) /* * Entry parameters: @@ -69,9 +69,9 @@ SYM_FUNC_START(sme_encrypt_execute) ANNOTATE_UNRET_SAFE ret int3 -SYM_FUNC_END(sme_encrypt_execute) +SYM_FUNC_END(__pi_sme_encrypt_execute) -SYM_FUNC_START(__enc_copy) +SYM_FUNC_START_LOCAL(__enc_copy) ANNOTATE_NOENDBR /* * Routine used to encrypt memory in place. diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 1d78e5631bb8..344030c1a81d 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -24,7 +24,7 @@ #include <asm/nospec-branch.h> #include <xen/interface/elfnote.h> - __HEAD + __INIT /* * Entry point for PVH guests. diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 2c19d7fc8a85..7ea1b75e59b7 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -21,6 +21,7 @@ function clear_vars() { eid = -1 # escape id gid = -1 # group id aid = -1 # AVX id + xopid = -1 # XOP id tname = "" } @@ -39,9 +40,11 @@ BEGIN { ggid = 1 geid = 1 gaid = 0 + gxopid = 0 delete etable delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is the same as (ev) but also "SCALABLE" i.e. 
W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5778bc498415..e5a2b9a912d1 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -740,10 +740,10 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { - int headtext = !strcmp(sec_name(sec->shdr.sh_info), ".head.text"); unsigned r_type = ELF64_R_TYPE(rel->r_info); ElfW(Addr) offset = rel->r_offset; int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname); + if (sym->st_shndx == SHN_UNDEF) return 0; @@ -783,12 +783,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; } - if (headtext) { - die("Absolute reference to symbol '%s' not permitted in .head.text\n", - symname); - break; - } - /* * Relocation offsets for 64 bit kernels are output * as 32 bits and sign extended back to 64 
bits when diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 942372e69b4d..ee643a6cd691 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -1029,7 +1029,7 @@ int rmp_make_shared(u64 pfn, enum pg_level level) } EXPORT_SYMBOL_GPL(rmp_make_shared); -void snp_leak_pages(u64 pfn, unsigned int npages) +void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) { struct page *page = pfn_to_page(pfn); @@ -1052,14 +1052,15 @@ void snp_leak_pages(u64 pfn, unsigned int npages) (PageHead(page) && compound_nr(page) <= npages)) list_add_tail(&page->buddy_list, &snp_leaked_pages_list); - dump_rmpentry(pfn); + if (dump_rmp) + dump_rmpentry(pfn); snp_nr_leaked_pages++; pfn++; page++; } spin_unlock(&snp_leaked_pages_list_lock); } -EXPORT_SYMBOL_GPL(snp_leak_pages); +EXPORT_SYMBOL_GPL(__snp_leak_pages); void kdump_sev_callback(void) { diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index da38de20ae59..cfbced95e944 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -11,6 +11,7 @@ * * Chris Zankel <chris@zankel.net> */ +#define COMPILE_OFFSETS #include <asm/processor.h> #include <asm/coprocessor.h> diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 70f8290b659d..fd995a1d3d24 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -388,11 +388,11 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, return 0; } -static int __init gtdt_sbsa_gwdt_init(void) +static int __init gtdt_platform_timer_init(void) { void *platform_timer; struct acpi_table_header *table; - int ret, timer_count, gwdt_count = 0; + int ret, timer_count, gwdt_count = 0, mmio_timer_count = 0; if (acpi_disabled) return 0; @@ -414,20 +414,41 @@ static int __init gtdt_sbsa_gwdt_init(void) goto out_put_gtdt; for_each_platform_timer(platform_timer) { + ret = 0; + if (is_non_secure_watchdog(platform_timer)) { ret = gtdt_import_sbsa_gwdt(platform_timer, gwdt_count); if (ret) - break; + continue; gwdt_count++; + } else if (is_timer_block(platform_timer)) { + struct arch_timer_mem atm = {}; + struct platform_device *pdev; + + ret = gtdt_parse_timer_block(platform_timer, &atm); + if (ret) + continue; + + pdev = platform_device_register_data(NULL, "gtdt-arm-mmio-timer", + gwdt_count, &atm, + sizeof(atm)); + if (IS_ERR(pdev)) { + pr_err("Can't register timer %d\n", gwdt_count); + continue; + } + + mmio_timer_count++; } } if (gwdt_count) pr_info("found %d SBSA generic Watchdog(s).\n", gwdt_count); + if (mmio_timer_count) + pr_info("found %d Generic MMIO timer(s).\n", mmio_timer_count); out_put_gtdt: acpi_put_table(table); return ret; } -device_initcall(gtdt_sbsa_gwdt_init); +device_initcall(gtdt_platform_timer_init); diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c index fca83141e3e6..3f8b9041babf 100644 --- a/drivers/cdx/controller/cdx_controller.c +++ b/drivers/cdx/controller/cdx_controller.c @@ -14,7 +14,7 @@ #include "cdx_controller.h" #include "../cdx.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static unsigned int cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) { diff --git a/drivers/cdx/controller/cdx_rpmsg.c b/drivers/cdx/controller/cdx_rpmsg.c index 61f1a290ff08..59aabd99fa8f 100644 --- a/drivers/cdx/controller/cdx_rpmsg.c +++ b/drivers/cdx/controller/cdx_rpmsg.c @@ -15,7 +15,7 @@ #include "../cdx.h" #include "cdx_controller.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" 
static struct rpmsg_device_id cdx_rpmsg_id_table[] = { { .name = "mcdi_ipc" }, diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c index e760f8d347cc..2e82ffc18d89 100644 --- a/drivers/cdx/controller/mcdi.c +++ b/drivers/cdx/controller/mcdi.c @@ -23,9 +23,10 @@ #include <linux/log2.h> #include <linux/net_tstamp.h> #include <linux/wait.h> +#include <linux/cdx/bitfield.h> -#include "bitfield.h" -#include "mcdi.h" +#include <linux/cdx/mcdi.h> +#include "mcdid.h" static void cdx_mcdi_cancel_cmd(struct cdx_mcdi *cdx, struct cdx_mcdi_cmd *cmd); static void cdx_mcdi_wait_for_cleanup(struct cdx_mcdi *cdx); @@ -99,6 +100,19 @@ static unsigned long cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd return cdx->mcdi_ops->mcdi_rpc_timeout(cdx, cmd); } +/** + * cdx_mcdi_init - Initialize MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function allocates and initializes internal MCDI structures and resources + * for the CDX device, including the workqueue, locking primitives, and command + * tracking mechanisms. It sets the initial operating mode and prepares the device + * for MCDI operations. + * + * Return: + * * 0 - on success + * * -ENOMEM - if memory allocation or workqueue creation fails + */ int cdx_mcdi_init(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -128,7 +142,16 @@ fail2: fail: return rc; } +EXPORT_SYMBOL_GPL(cdx_mcdi_init); +/** + * cdx_mcdi_finish - Cleanup MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function is responsible for cleaning up the MCDI (Management Controller Driver Interface) + * resources associated with a cdx_mcdi structure. Also destroys the mcdi workqueue. + * + */ void cdx_mcdi_finish(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -143,6 +166,7 @@ void cdx_mcdi_finish(struct cdx_mcdi *cdx) kfree(cdx->mcdi); cdx->mcdi = NULL; } +EXPORT_SYMBOL_GPL(cdx_mcdi_finish); static bool cdx_mcdi_flushed(struct cdx_mcdi_iface *mcdi, bool ignore_cleanups) { @@ -553,6 +577,19 @@ static void cdx_mcdi_start_or_queue(struct cdx_mcdi_iface *mcdi, cdx_mcdi_cmd_start_or_queue(mcdi, cmd); } +/** + * cdx_mcdi_process_cmd - Process an incoming MCDI response + * @cdx: Handle to the CDX MCDI structure + * @outbuf: Pointer to the response buffer received from the management controller + * @len: Length of the response buffer in bytes + * + * This function handles a response from the management controller. It locates the + * corresponding command using the sequence number embedded in the header, + * completes the command if it is still pending, and initiates any necessary cleanup. + * + * The function assumes that the response buffer is well-formed and at least one + * dword in size. 
+ */ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len) { struct cdx_mcdi_iface *mcdi; @@ -590,6 +627,7 @@ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int le cdx_mcdi_process_cleanup_list(mcdi->cdx, &cleanup_list); } +EXPORT_SYMBOL_GPL(cdx_mcdi_process_cmd); static void cdx_mcdi_cmd_work(struct work_struct *context) { @@ -757,6 +795,7 @@ int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, return cdx_mcdi_rpc_sync(cdx, cmd, inbuf, inlen, outbuf, outlen, outlen_actual, false); } +EXPORT_SYMBOL_GPL(cdx_mcdi_rpc); /** * cdx_mcdi_rpc_async - Schedule an MCDI command to run asynchronously diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c index 885c69e6ebe5..8ae2d99be81e 100644 --- a/drivers/cdx/controller/mcdi_functions.c +++ b/drivers/cdx/controller/mcdi_functions.c @@ -5,7 +5,6 @@ #include <linux/module.h> -#include "mcdi.h" #include "mcdi_functions.h" int cdx_mcdi_get_num_buses(struct cdx_mcdi *cdx) diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h index b9942affdc6b..57fd1bae706b 100644 --- a/drivers/cdx/controller/mcdi_functions.h +++ b/drivers/cdx/controller/mcdi_functions.h @@ -8,7 +8,8 @@ #ifndef CDX_MCDI_FUNCTIONS_H #define CDX_MCDI_FUNCTIONS_H -#include "mcdi.h" +#include <linux/cdx/mcdi.h> +#include "mcdid.h" #include "../cdx.h" /** diff --git a/drivers/cdx/controller/mcdid.h b/drivers/cdx/controller/mcdid.h new file mode 100644 index 000000000000..7fc29f099265 --- /dev/null +++ b/drivers/cdx/controller/mcdid.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2008-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2025, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_MCDID_H +#define CDX_MCDID_H + +#include <linux/mutex.h> +#include <linux/kref.h> +#include <linux/rpmsg.h> + +#include "mc_cdx_pcol.h" + +#ifdef DEBUG +#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) +#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) +#else +#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) +#define CDX_WARN_ON_PARANOID(x) do {} while (0) +#endif + +#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) + +static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) +{ + return cdx->mcdi ? &cdx->mcdi->iface : NULL; +} + +int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + cdx_mcdi_async_completer *complete, + unsigned long cookie); +int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, + unsigned int timeout_jiffies); + +/* + * We expect that 16- and 32-bit fields in MCDI requests and responses + * are appropriately aligned, but 64-bit fields are only + * 32-bit-aligned. 
+ */ +#define MCDI_BYTE(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ + *MCDI_PTR(_buf, _field)) +#define MCDI_WORD(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ + le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) +#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ + CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ + MC_CMD_ ## _name1, _value1) +#define MCDI_SET_QWORD(_buf, _field, _value) \ + do { \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ + CDX_DWORD, (u32)(_value)); \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ + CDX_DWORD, (u64)(_value) >> 32); \ + } while (0) +#define MCDI_QWORD(_buf, _field) \ + (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ + (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) + +#endif /* CDX_MCDID_H */ diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 645f517a1ac2..ffcd23668763 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -395,8 +395,7 @@ config ARM_GLOBAL_TIMER config ARM_GT_INITIAL_PRESCALER_VAL int "ARM global timer initial prescaler value" - default 2 if ARCH_ZYNQ - default 1 + default 0 depends on ARM_GLOBAL_TIMER help When the ARM global timer initializes, its current rate is declared @@ -406,6 +405,7 @@ config ARM_GT_INITIAL_PRESCALER_VAL bounds about how much the parent clock is allowed to decrease or increase wrt the initial clock value. This affects CPU_FREQ max delta from the initial frequency. + Use 0 to use auto-detection in the driver. config ARM_TIMER_SP804 bool "Support for Dual Timer SP804 module" @@ -474,11 +474,14 @@ config FSL_FTM_TIMER help Support for Freescale FlexTimer Module (FTM) timer. -config VF_PIT_TIMER - bool +config NXP_PIT_TIMER + bool "NXP Periodic Interrupt Timer" if COMPILE_TEST select CLKSRC_MMIO help - Support for Periodic Interrupt Timer on Freescale Vybrid Family SoCs. + Support for Periodic Interrupt Timer on Freescale / NXP + SoCs. This periodic timer is found on the Vybrid Family and + the Automotive S32G2/3 platforms. It contains 4 channels + where two can be coupled to form a 64 bits channel. 
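The alignment comment and the MCDI_SET_QWORD()/MCDI_QWORD() macros in the new mcdid.h added above store a 64-bit field as two consecutive 32-bit dwords, because such fields are only 32-bit aligned in MCDI buffers. As an editor's illustration (not part of the series), this is roughly what those two macros reduce to for a buffer of little-endian dwords:

#include <linux/kernel.h>

/* Low half goes in dword 0, high half in dword 1, mirroring
 * MCDI_SET_QWORD()/MCDI_QWORD() above. */
static inline void example_set_qword(__le32 *dw, u64 val)
{
	dw[0] = cpu_to_le32(lower_32_bits(val));
	dw[1] = cpu_to_le32(upper_32_bits(val));
}

static inline u64 example_get_qword(const __le32 *dw)
{
	return le32_to_cpu(dw[0]) | ((u64)le32_to_cpu(dw[1]) << 32);
}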
config SYS_SUPPORTS_SH_CMT bool diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 205bf3b0a8f3..ec4452ee958f 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -49,7 +49,7 @@ obj-$(CONFIG_CLKSRC_LPC32XX) += timer-lpc32xx.o obj-$(CONFIG_CLKSRC_MPS2) += mps2-timer.o obj-$(CONFIG_CLKSRC_SAMSUNG_PWM) += samsung_pwm_timer.o obj-$(CONFIG_FSL_FTM_TIMER) += timer-fsl-ftm.o -obj-$(CONFIG_VF_PIT_TIMER) += timer-vf-pit.o +obj-$(CONFIG_NXP_PIT_TIMER) += timer-nxp-pit.o obj-$(CONFIG_CLKSRC_QCOM) += timer-qcom.o obj-$(CONFIG_MTK_TIMER) += timer-mediatek.o obj-$(CONFIG_MTK_CPUX_TIMER) += timer-mediatek-cpux.o @@ -64,6 +64,7 @@ obj-$(CONFIG_REALTEK_OTTO_TIMER) += timer-rtl-otto.o obj-$(CONFIG_ARC_TIMERS) += arc_timer.o obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o +obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer_mmio.o obj-$(CONFIG_ARM_GLOBAL_TIMER) += arm_global_timer.o obj-$(CONFIG_ARMV7M_SYSTICK) += armv7m_systick.o obj-$(CONFIG_ARM_TIMER_SP804) += timer-sp804.o diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 80ba6a54248c..90aeff44a276 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -34,42 +34,12 @@ #include <clocksource/arm_arch_timer.h> -#define CNTTIDR 0x08 -#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) - -#define CNTACR(n) (0x40 + ((n) * 4)) -#define CNTACR_RPCT BIT(0) -#define CNTACR_RVCT BIT(1) -#define CNTACR_RFRQ BIT(2) -#define CNTACR_RVOFF BIT(3) -#define CNTACR_RWVT BIT(4) -#define CNTACR_RWPT BIT(5) - -#define CNTPCT_LO 0x00 -#define CNTVCT_LO 0x08 -#define CNTFRQ 0x10 -#define CNTP_CVAL_LO 0x20 -#define CNTP_CTL 0x2c -#define CNTV_CVAL_LO 0x30 -#define CNTV_CTL 0x3c - /* * The minimum amount of time a generic counter is guaranteed to not roll over * (40 years) */ #define MIN_ROLLOVER_SECS (40ULL * 365 * 24 * 3600) -static unsigned arch_timers_present __initdata; - -struct arch_timer { - void __iomem *base; - struct clock_event_device evt; -}; - -static struct arch_timer *arch_timer_mem __ro_after_init; - -#define to_arch_timer(e) container_of(e, struct arch_timer, evt) - static u32 arch_timer_rate __ro_after_init; static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI] __ro_after_init; @@ -85,7 +55,6 @@ static struct clock_event_device __percpu *arch_timer_evt; static enum arch_timer_ppi_nr arch_timer_uses_ppi __ro_after_init = ARCH_TIMER_VIRT_PPI; static bool arch_timer_c3stop __ro_after_init; -static bool arch_timer_mem_use_virtual __ro_after_init; static bool arch_counter_suspend_stop __ro_after_init; #ifdef CONFIG_GENERIC_GETTIMEOFDAY static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_ARCHTIMER; @@ -121,76 +90,6 @@ static int arch_counter_get_width(void) /* * Architected system timer support. */ - -static __always_inline -void arch_timer_reg_write(int access, enum arch_timer_reg reg, u64 val, - struct clock_event_device *clk) -{ - if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - writel_relaxed((u32)val, timer->base + CNTP_CTL); - break; - case ARCH_TIMER_REG_CVAL: - /* - * Not guaranteed to be atomic, so the timer - * must be disabled at this point. 
- */ - writeq_relaxed(val, timer->base + CNTP_CVAL_LO); - break; - default: - BUILD_BUG(); - } - } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - writel_relaxed((u32)val, timer->base + CNTV_CTL); - break; - case ARCH_TIMER_REG_CVAL: - /* Same restriction as above */ - writeq_relaxed(val, timer->base + CNTV_CVAL_LO); - break; - default: - BUILD_BUG(); - } - } else { - arch_timer_reg_write_cp15(access, reg, val); - } -} - -static __always_inline -u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, - struct clock_event_device *clk) -{ - u32 val; - - if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - val = readl_relaxed(timer->base + CNTP_CTL); - break; - default: - BUILD_BUG(); - } - } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - val = readl_relaxed(timer->base + CNTV_CTL); - break; - default: - BUILD_BUG(); - } - } else { - val = arch_timer_reg_read_cp15(access, reg); - } - - return val; -} - static noinstr u64 raw_counter_get_cntpct_stable(void) { return __arch_counter_get_cntpct_stable(); @@ -424,7 +323,7 @@ void erratum_set_next_event_generic(const int access, unsigned long evt, unsigned long ctrl; u64 cval; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; @@ -436,7 +335,7 @@ void erratum_set_next_event_generic(const int access, unsigned long evt, write_sysreg(cval, cntv_cval_el0); } - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); } static __maybe_unused int erratum_set_next_event_virt(unsigned long evt, @@ -667,10 +566,10 @@ static __always_inline irqreturn_t timer_handler(const int access, { unsigned long ctrl; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, evt); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { ctrl |= ARCH_TIMER_CTRL_IT_MASK; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, evt); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); evt->event_handler(evt); return IRQ_HANDLED; } @@ -692,28 +591,14 @@ static irqreturn_t arch_timer_handler_phys(int irq, void *dev_id) return timer_handler(ARCH_TIMER_PHYS_ACCESS, evt); } -static irqreturn_t arch_timer_handler_phys_mem(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - return timer_handler(ARCH_TIMER_MEM_PHYS_ACCESS, evt); -} - -static irqreturn_t arch_timer_handler_virt_mem(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - return timer_handler(ARCH_TIMER_MEM_VIRT_ACCESS, evt); -} - static __always_inline int arch_timer_shutdown(const int access, struct clock_event_device *clk) { unsigned long ctrl; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl &= ~ARCH_TIMER_CTRL_ENABLE; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); return 0; } @@ -728,23 +613,13 @@ static int arch_timer_shutdown_phys(struct clock_event_device *clk) return arch_timer_shutdown(ARCH_TIMER_PHYS_ACCESS, clk); } -static int arch_timer_shutdown_virt_mem(struct 
clock_event_device *clk) -{ - return arch_timer_shutdown(ARCH_TIMER_MEM_VIRT_ACCESS, clk); -} - -static int arch_timer_shutdown_phys_mem(struct clock_event_device *clk) -{ - return arch_timer_shutdown(ARCH_TIMER_MEM_PHYS_ACCESS, clk); -} - static __always_inline void set_next_event(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; u64 cnt; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; @@ -753,8 +628,8 @@ static __always_inline void set_next_event(const int access, unsigned long evt, else cnt = __arch_counter_get_cntvct(); - arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CVAL, evt + cnt); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); } static int arch_timer_set_next_event_virt(unsigned long evt, @@ -771,60 +646,6 @@ static int arch_timer_set_next_event_phys(unsigned long evt, return 0; } -static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) -{ - u32 cnt_lo, cnt_hi, tmp_hi; - - do { - cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); - cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); - tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); - } while (cnt_hi != tmp_hi); - - return ((u64) cnt_hi << 32) | cnt_lo; -} - -static __always_inline void set_next_event_mem(const int access, unsigned long evt, - struct clock_event_device *clk) -{ - struct arch_timer *timer = to_arch_timer(clk); - unsigned long ctrl; - u64 cnt; - - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); - - /* Timer must be disabled before programming CVAL */ - if (ctrl & ARCH_TIMER_CTRL_ENABLE) { - ctrl &= ~ARCH_TIMER_CTRL_ENABLE; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); - } - - ctrl |= ARCH_TIMER_CTRL_ENABLE; - ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; - - if (access == ARCH_TIMER_MEM_VIRT_ACCESS) - cnt = arch_counter_get_cnt_mem(timer, CNTVCT_LO); - else - cnt = arch_counter_get_cnt_mem(timer, CNTPCT_LO); - - arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); -} - -static int arch_timer_set_next_event_virt_mem(unsigned long evt, - struct clock_event_device *clk) -{ - set_next_event_mem(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk); - return 0; -} - -static int arch_timer_set_next_event_phys_mem(unsigned long evt, - struct clock_event_device *clk) -{ - set_next_event_mem(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk); - return 0; -} - static u64 __arch_timer_check_delta(void) { #ifdef CONFIG_ARM64 @@ -850,63 +671,41 @@ static u64 __arch_timer_check_delta(void) return CLOCKSOURCE_MASK(arch_counter_get_width()); } -static void __arch_timer_setup(unsigned type, - struct clock_event_device *clk) +static void __arch_timer_setup(struct clock_event_device *clk) { + typeof(clk->set_next_event) sne; u64 max_delta; clk->features = CLOCK_EVT_FEAT_ONESHOT; - if (type == ARCH_TIMER_TYPE_CP15) { - typeof(clk->set_next_event) sne; - - arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); - - if (arch_timer_c3stop) - clk->features |= CLOCK_EVT_FEAT_C3STOP; - clk->name = "arch_sys_timer"; - clk->rating = 450; - clk->cpumask = cpumask_of(smp_processor_id()); - clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; 
- switch (arch_timer_uses_ppi) { - case ARCH_TIMER_VIRT_PPI: - clk->set_state_shutdown = arch_timer_shutdown_virt; - clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; - sne = erratum_handler(set_next_event_virt); - break; - case ARCH_TIMER_PHYS_SECURE_PPI: - case ARCH_TIMER_PHYS_NONSECURE_PPI: - case ARCH_TIMER_HYP_PPI: - clk->set_state_shutdown = arch_timer_shutdown_phys; - clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; - sne = erratum_handler(set_next_event_phys); - break; - default: - BUG(); - } + arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); - clk->set_next_event = sne; - max_delta = __arch_timer_check_delta(); - } else { - clk->features |= CLOCK_EVT_FEAT_DYNIRQ; - clk->name = "arch_mem_timer"; - clk->rating = 400; - clk->cpumask = cpu_possible_mask; - if (arch_timer_mem_use_virtual) { - clk->set_state_shutdown = arch_timer_shutdown_virt_mem; - clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; - clk->set_next_event = - arch_timer_set_next_event_virt_mem; - } else { - clk->set_state_shutdown = arch_timer_shutdown_phys_mem; - clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; - clk->set_next_event = - arch_timer_set_next_event_phys_mem; - } - - max_delta = CLOCKSOURCE_MASK(56); + if (arch_timer_c3stop) + clk->features |= CLOCK_EVT_FEAT_C3STOP; + clk->name = "arch_sys_timer"; + clk->rating = 450; + clk->cpumask = cpumask_of(smp_processor_id()); + clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; + switch (arch_timer_uses_ppi) { + case ARCH_TIMER_VIRT_PPI: + clk->set_state_shutdown = arch_timer_shutdown_virt; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; + sne = erratum_handler(set_next_event_virt); + break; + case ARCH_TIMER_PHYS_SECURE_PPI: + case ARCH_TIMER_PHYS_NONSECURE_PPI: + case ARCH_TIMER_HYP_PPI: + clk->set_state_shutdown = arch_timer_shutdown_phys; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; + sne = erratum_handler(set_next_event_phys); + break; + default: + BUG(); } + clk->set_next_event = sne; + max_delta = __arch_timer_check_delta(); + clk->set_state_shutdown(clk); clockevents_config_and_register(clk, arch_timer_rate, 0xf, max_delta); @@ -1029,7 +828,7 @@ static int arch_timer_starting_cpu(unsigned int cpu) struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt); u32 flags; - __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk); + __arch_timer_setup(clk); flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]); enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags); @@ -1075,22 +874,12 @@ static void __init arch_timer_of_configure_rate(u32 rate, struct device_node *np pr_warn("frequency not available\n"); } -static void __init arch_timer_banner(unsigned type) +static void __init arch_timer_banner(void) { - pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n", - type & ARCH_TIMER_TYPE_CP15 ? "cp15" : "", - type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? - " and " : "", - type & ARCH_TIMER_TYPE_MEM ? "mmio" : "", + pr_info("cp15 timer running at %lu.%02luMHz (%s).\n", (unsigned long)arch_timer_rate / 1000000, (unsigned long)(arch_timer_rate / 10000) % 100, - type & ARCH_TIMER_TYPE_CP15 ? - (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys" : - "", - type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? "/" : "", - type & ARCH_TIMER_TYPE_MEM ? - arch_timer_mem_use_virtual ? "virt" : "phys" : - ""); + (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? 
"virt" : "phys"); } u32 arch_timer_get_rate(void) @@ -1108,11 +897,6 @@ bool arch_timer_evtstrm_available(void) return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available); } -static noinstr u64 arch_counter_get_cntvct_mem(void) -{ - return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); -} - static struct arch_timer_kvm_info arch_timer_kvm_info; struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) @@ -1120,42 +904,35 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) return &arch_timer_kvm_info; } -static void __init arch_counter_register(unsigned type) +static void __init arch_counter_register(void) { u64 (*scr)(void); + u64 (*rd)(void); u64 start_count; int width; - /* Register the CP15 based counter if we have one */ - if (type & ARCH_TIMER_TYPE_CP15) { - u64 (*rd)(void); - - if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || - arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { - if (arch_timer_counter_has_wa()) { - rd = arch_counter_get_cntvct_stable; - scr = raw_counter_get_cntvct_stable; - } else { - rd = arch_counter_get_cntvct; - scr = arch_counter_get_cntvct; - } + if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || + arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { + if (arch_timer_counter_has_wa()) { + rd = arch_counter_get_cntvct_stable; + scr = raw_counter_get_cntvct_stable; } else { - if (arch_timer_counter_has_wa()) { - rd = arch_counter_get_cntpct_stable; - scr = raw_counter_get_cntpct_stable; - } else { - rd = arch_counter_get_cntpct; - scr = arch_counter_get_cntpct; - } + rd = arch_counter_get_cntvct; + scr = arch_counter_get_cntvct; } - - arch_timer_read_counter = rd; - clocksource_counter.vdso_clock_mode = vdso_default; } else { - arch_timer_read_counter = arch_counter_get_cntvct_mem; - scr = arch_counter_get_cntvct_mem; + if (arch_timer_counter_has_wa()) { + rd = arch_counter_get_cntpct_stable; + scr = raw_counter_get_cntpct_stable; + } else { + rd = arch_counter_get_cntpct; + scr = arch_counter_get_cntpct; + } } + arch_timer_read_counter = rd; + clocksource_counter.vdso_clock_mode = vdso_default; + width = arch_counter_get_width(); clocksource_counter.mask = CLOCKSOURCE_MASK(width); cyclecounter.mask = CLOCKSOURCE_MASK(width); @@ -1303,76 +1080,10 @@ out: return err; } -static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq) -{ - int ret; - irq_handler_t func; - - arch_timer_mem = kzalloc(sizeof(*arch_timer_mem), GFP_KERNEL); - if (!arch_timer_mem) - return -ENOMEM; - - arch_timer_mem->base = base; - arch_timer_mem->evt.irq = irq; - __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &arch_timer_mem->evt); - - if (arch_timer_mem_use_virtual) - func = arch_timer_handler_virt_mem; - else - func = arch_timer_handler_phys_mem; - - ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &arch_timer_mem->evt); - if (ret) { - pr_err("Failed to request mem timer irq\n"); - kfree(arch_timer_mem); - arch_timer_mem = NULL; - } - - return ret; -} - -static const struct of_device_id arch_timer_of_match[] __initconst = { - { .compatible = "arm,armv7-timer", }, - { .compatible = "arm,armv8-timer", }, - {}, -}; - -static const struct of_device_id arch_timer_mem_of_match[] __initconst = { - { .compatible = "arm,armv7-timer-mem", }, - {}, -}; - -static bool __init arch_timer_needs_of_probing(void) -{ - struct device_node *dn; - bool needs_probing = false; - unsigned int mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM; - - /* We have two timers, and both device-tree nodes are probed. 
*/ - if ((arch_timers_present & mask) == mask) - return false; - - /* - * Only one type of timer is probed, - * check if we have another type of timer node in device-tree. - */ - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) - dn = of_find_matching_node(NULL, arch_timer_mem_of_match); - else - dn = of_find_matching_node(NULL, arch_timer_of_match); - - if (dn && of_device_is_available(dn)) - needs_probing = true; - - of_node_put(dn); - - return needs_probing; -} - static int __init arch_timer_common_init(void) { - arch_timer_banner(arch_timers_present); - arch_counter_register(arch_timers_present); + arch_timer_banner(); + arch_counter_register(); return arch_timer_arch_init(); } @@ -1421,13 +1132,11 @@ static int __init arch_timer_of_init(struct device_node *np) u32 rate; bool has_names; - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { + if (arch_timer_evt) { pr_warn("multiple nodes in dt, skipping\n"); return 0; } - arch_timers_present |= ARCH_TIMER_TYPE_CP15; - has_names = of_property_present(np, "interrupt-names"); for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) { @@ -1472,283 +1181,22 @@ static int __init arch_timer_of_init(struct device_node *np) if (ret) return ret; - if (arch_timer_needs_of_probing()) - return 0; - return arch_timer_common_init(); } TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); -static u32 __init -arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame) -{ - void __iomem *base; - u32 rate; - - base = ioremap(frame->cntbase, frame->size); - if (!base) { - pr_err("Unable to map frame @ %pa\n", &frame->cntbase); - return 0; - } - - rate = readl_relaxed(base + CNTFRQ); - - iounmap(base); - - return rate; -} - -static struct arch_timer_mem_frame * __init -arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem) -{ - struct arch_timer_mem_frame *frame, *best_frame = NULL; - void __iomem *cntctlbase; - u32 cnttidr; - int i; - - cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size); - if (!cntctlbase) { - pr_err("Can't map CNTCTLBase @ %pa\n", - &timer_mem->cntctlbase); - return NULL; - } - - cnttidr = readl_relaxed(cntctlbase + CNTTIDR); - - /* - * Try to find a virtual capable frame. Otherwise fall back to a - * physical capable frame. - */ - for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { - u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | - CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; - - frame = &timer_mem->frame[i]; - if (!frame->valid) - continue; - - /* Try enabling everything, and see what sticks */ - writel_relaxed(cntacr, cntctlbase + CNTACR(i)); - cntacr = readl_relaxed(cntctlbase + CNTACR(i)); - - if ((cnttidr & CNTTIDR_VIRT(i)) && - !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { - best_frame = frame; - arch_timer_mem_use_virtual = true; - break; - } - - if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) - continue; - - best_frame = frame; - } - - iounmap(cntctlbase); - - return best_frame; -} - -static int __init -arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame) -{ - void __iomem *base; - int ret, irq; - - if (arch_timer_mem_use_virtual) - irq = frame->virt_irq; - else - irq = frame->phys_irq; - - if (!irq) { - pr_err("Frame missing %s irq.\n", - arch_timer_mem_use_virtual ? 
"virt" : "phys"); - return -EINVAL; - } - - if (!request_mem_region(frame->cntbase, frame->size, - "arch_mem_timer")) - return -EBUSY; - - base = ioremap(frame->cntbase, frame->size); - if (!base) { - pr_err("Can't map frame's registers\n"); - return -ENXIO; - } - - ret = arch_timer_mem_register(base, irq); - if (ret) { - iounmap(base); - return ret; - } - - arch_timers_present |= ARCH_TIMER_TYPE_MEM; - - return 0; -} - -static int __init arch_timer_mem_of_init(struct device_node *np) -{ - struct arch_timer_mem *timer_mem; - struct arch_timer_mem_frame *frame; - struct resource res; - int ret = -EINVAL; - u32 rate; - - timer_mem = kzalloc(sizeof(*timer_mem), GFP_KERNEL); - if (!timer_mem) - return -ENOMEM; - - if (of_address_to_resource(np, 0, &res)) - goto out; - timer_mem->cntctlbase = res.start; - timer_mem->size = resource_size(&res); - - for_each_available_child_of_node_scoped(np, frame_node) { - u32 n; - struct arch_timer_mem_frame *frame; - - if (of_property_read_u32(frame_node, "frame-number", &n)) { - pr_err(FW_BUG "Missing frame-number.\n"); - goto out; - } - if (n >= ARCH_TIMER_MEM_MAX_FRAMES) { - pr_err(FW_BUG "Wrong frame-number, only 0-%u are permitted.\n", - ARCH_TIMER_MEM_MAX_FRAMES - 1); - goto out; - } - frame = &timer_mem->frame[n]; - - if (frame->valid) { - pr_err(FW_BUG "Duplicated frame-number.\n"); - goto out; - } - - if (of_address_to_resource(frame_node, 0, &res)) - goto out; - - frame->cntbase = res.start; - frame->size = resource_size(&res); - - frame->virt_irq = irq_of_parse_and_map(frame_node, - ARCH_TIMER_VIRT_SPI); - frame->phys_irq = irq_of_parse_and_map(frame_node, - ARCH_TIMER_PHYS_SPI); - - frame->valid = true; - } - - frame = arch_timer_mem_find_best_frame(timer_mem); - if (!frame) { - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer_mem->cntctlbase); - ret = -EINVAL; - goto out; - } - - rate = arch_timer_mem_frame_get_cntfrq(frame); - arch_timer_of_configure_rate(rate, np); - - ret = arch_timer_mem_frame_register(frame); - if (!ret && !arch_timer_needs_of_probing()) - ret = arch_timer_common_init(); -out: - kfree(timer_mem); - return ret; -} -TIMER_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", - arch_timer_mem_of_init); - #ifdef CONFIG_ACPI_GTDT -static int __init -arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem) -{ - struct arch_timer_mem_frame *frame; - u32 rate; - int i; - - for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { - frame = &timer_mem->frame[i]; - - if (!frame->valid) - continue; - - rate = arch_timer_mem_frame_get_cntfrq(frame); - if (rate == arch_timer_rate) - continue; - - pr_err(FW_BUG "CNTFRQ mismatch: frame @ %pa: (0x%08lx), CPU: (0x%08lx)\n", - &frame->cntbase, - (unsigned long)rate, (unsigned long)arch_timer_rate); - - return -EINVAL; - } - - return 0; -} - -static int __init arch_timer_mem_acpi_init(int platform_timer_count) -{ - struct arch_timer_mem *timers, *timer; - struct arch_timer_mem_frame *frame, *best_frame = NULL; - int timer_count, i, ret = 0; - - timers = kcalloc(platform_timer_count, sizeof(*timers), - GFP_KERNEL); - if (!timers) - return -ENOMEM; - - ret = acpi_arch_timer_mem_init(timers, &timer_count); - if (ret || !timer_count) - goto out; - - /* - * While unlikely, it's theoretically possible that none of the frames - * in a timer expose the combination of feature we want. 
- */ - for (i = 0; i < timer_count; i++) { - timer = &timers[i]; - - frame = arch_timer_mem_find_best_frame(timer); - if (!best_frame) - best_frame = frame; - - ret = arch_timer_mem_verify_cntfrq(timer); - if (ret) { - pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n"); - goto out; - } - - if (!best_frame) /* implies !frame */ - /* - * Only complain about missing suitable frames if we - * haven't already found one in a previous iteration. - */ - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer->cntctlbase); - } - - if (best_frame) - ret = arch_timer_mem_frame_register(best_frame); -out: - kfree(timers); - return ret; -} - -/* Initialize per-processor generic timer and memory-mapped timer(if present) */ static int __init arch_timer_acpi_init(struct acpi_table_header *table) { - int ret, platform_timer_count; + int ret; - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { + if (arch_timer_evt) { pr_warn("already initialized, skipping\n"); return -EINVAL; } - arch_timers_present |= ARCH_TIMER_TYPE_CP15; - - ret = acpi_gtdt_init(table, &platform_timer_count); + ret = acpi_gtdt_init(table, NULL); if (ret) return ret; @@ -1790,10 +1238,6 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) if (ret) return ret; - if (platform_timer_count && - arch_timer_mem_acpi_init(platform_timer_count)) - pr_err("Failed to initialize memory-mapped timer.\n"); - return arch_timer_common_init(); } TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); diff --git a/drivers/clocksource/arm_arch_timer_mmio.c b/drivers/clocksource/arm_arch_timer_mmio.c new file mode 100644 index 000000000000..ebe1987d651e --- /dev/null +++ b/drivers/clocksource/arm_arch_timer_mmio.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ARM Generic Memory Mapped Timer support + * + * Split from drivers/clocksource/arm_arch_timer.c + * + * Copyright (C) 2011 ARM Ltd. + * All Rights Reserved + */ + +#define pr_fmt(fmt) "arch_timer_mmio: " fmt + +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/of_irq.h> +#include <linux/of_address.h> +#include <linux/platform_device.h> + +#include <clocksource/arm_arch_timer.h> + +#define CNTTIDR 0x08 +#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) + +#define CNTACR(n) (0x40 + ((n) * 4)) +#define CNTACR_RPCT BIT(0) +#define CNTACR_RVCT BIT(1) +#define CNTACR_RFRQ BIT(2) +#define CNTACR_RVOFF BIT(3) +#define CNTACR_RWVT BIT(4) +#define CNTACR_RWPT BIT(5) + +#define CNTPCT_LO 0x00 +#define CNTVCT_LO 0x08 +#define CNTFRQ 0x10 +#define CNTP_CVAL_LO 0x20 +#define CNTP_CTL 0x2c +#define CNTV_CVAL_LO 0x30 +#define CNTV_CTL 0x3c + +enum arch_timer_access { + PHYS_ACCESS, + VIRT_ACCESS, +}; + +struct arch_timer { + struct clock_event_device evt; + struct clocksource cs; + struct arch_timer_mem *gt_block; + void __iomem *base; + enum arch_timer_access access; + u32 rate; +}; + +#define evt_to_arch_timer(e) container_of(e, struct arch_timer, evt) +#define cs_to_arch_timer(c) container_of(c, struct arch_timer, cs) + +static void arch_timer_mmio_write(struct arch_timer *timer, + enum arch_timer_reg reg, u64 val) +{ + switch (timer->access) { + case PHYS_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + writel_relaxed((u32)val, timer->base + CNTP_CTL); + return; + case ARCH_TIMER_REG_CVAL: + /* + * Not guaranteed to be atomic, so the timer + * must be disabled at this point. 
+ */ + writeq_relaxed(val, timer->base + CNTP_CVAL_LO); + return; + } + break; + case VIRT_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + writel_relaxed((u32)val, timer->base + CNTV_CTL); + return; + case ARCH_TIMER_REG_CVAL: + /* Same restriction as above */ + writeq_relaxed(val, timer->base + CNTV_CVAL_LO); + return; + } + break; + } + + /* Should never be here */ + WARN_ON_ONCE(1); +} + +static u32 arch_timer_mmio_read(struct arch_timer *timer, enum arch_timer_reg reg) +{ + switch (timer->access) { + case PHYS_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + return readl_relaxed(timer->base + CNTP_CTL); + default: + break; + } + break; + case VIRT_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + return readl_relaxed(timer->base + CNTV_CTL); + default: + break; + } + break; + } + + /* Should never be here */ + WARN_ON_ONCE(1); + return 0; +} + +static noinstr u64 arch_counter_mmio_get_cnt(struct arch_timer *t) +{ + int offset_lo = t->access == VIRT_ACCESS ? CNTVCT_LO : CNTPCT_LO; + u32 cnt_lo, cnt_hi, tmp_hi; + + do { + cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); + cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); + tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); + } while (cnt_hi != tmp_hi); + + return ((u64) cnt_hi << 32) | cnt_lo; +} + +static u64 arch_mmio_counter_read(struct clocksource *cs) +{ + struct arch_timer *at = cs_to_arch_timer(cs); + + return arch_counter_mmio_get_cnt(at); +} + +static int arch_timer_mmio_shutdown(struct clock_event_device *clk) +{ + struct arch_timer *at = evt_to_arch_timer(clk); + unsigned long ctrl; + + ctrl = arch_timer_mmio_read(at, ARCH_TIMER_REG_CTRL); + ctrl &= ~ARCH_TIMER_CTRL_ENABLE; + arch_timer_mmio_write(at, ARCH_TIMER_REG_CTRL, ctrl); + + return 0; +} + +static int arch_timer_mmio_set_next_event(unsigned long evt, + struct clock_event_device *clk) +{ + struct arch_timer *timer = evt_to_arch_timer(clk); + unsigned long ctrl; + u64 cnt; + + ctrl = arch_timer_mmio_read(timer, ARCH_TIMER_REG_CTRL); + + /* Timer must be disabled before programming CVAL */ + if (ctrl & ARCH_TIMER_CTRL_ENABLE) { + ctrl &= ~ARCH_TIMER_CTRL_ENABLE; + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CTRL, ctrl); + } + + ctrl |= ARCH_TIMER_CTRL_ENABLE; + ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; + + cnt = arch_counter_mmio_get_cnt(timer); + + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CVAL, evt + cnt); + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CTRL, ctrl); + return 0; +} + +static irqreturn_t arch_timer_mmio_handler(int irq, void *dev_id) +{ + struct clock_event_device *evt = dev_id; + struct arch_timer *at = evt_to_arch_timer(evt); + unsigned long ctrl; + + ctrl = arch_timer_mmio_read(at, ARCH_TIMER_REG_CTRL); + if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { + ctrl |= ARCH_TIMER_CTRL_IT_MASK; + arch_timer_mmio_write(at, ARCH_TIMER_REG_CTRL, ctrl); + evt->event_handler(evt); + return IRQ_HANDLED; + } + + return IRQ_NONE; +} + +static struct arch_timer_mem_frame *find_best_frame(struct platform_device *pdev) +{ + struct arch_timer_mem_frame *frame, *best_frame = NULL; + struct arch_timer *at = platform_get_drvdata(pdev); + void __iomem *cntctlbase; + u32 cnttidr; + + cntctlbase = ioremap(at->gt_block->cntctlbase, at->gt_block->size); + if (!cntctlbase) { + dev_err(&pdev->dev, "Can't map CNTCTLBase @ %pa\n", + &at->gt_block->cntctlbase); + return NULL; + } + + cnttidr = readl_relaxed(cntctlbase + CNTTIDR); + + /* + * Try to find a virtual capable frame. 
Otherwise fall back to a + * physical capable frame. + */ + for (int i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { + u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | + CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; + + frame = &at->gt_block->frame[i]; + if (!frame->valid) + continue; + + /* Try enabling everything, and see what sticks */ + writel_relaxed(cntacr, cntctlbase + CNTACR(i)); + cntacr = readl_relaxed(cntctlbase + CNTACR(i)); + + /* Pick a suitable frame for which we have an IRQ */ + if ((cnttidr & CNTTIDR_VIRT(i)) && + !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT)) && + frame->virt_irq) { + best_frame = frame; + at->access = VIRT_ACCESS; + break; + } + + if ((~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) || + !frame->phys_irq) + continue; + + at->access = PHYS_ACCESS; + best_frame = frame; + } + + iounmap(cntctlbase); + + return best_frame; +} + +static void arch_timer_mmio_setup(struct arch_timer *at, int irq) +{ + at->evt = (struct clock_event_device) { + .features = (CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_DYNIRQ), + .name = "arch_mem_timer", + .rating = 400, + .cpumask = cpu_possible_mask, + .irq = irq, + .set_next_event = arch_timer_mmio_set_next_event, + .set_state_oneshot_stopped = arch_timer_mmio_shutdown, + .set_state_shutdown = arch_timer_mmio_shutdown, + }; + + at->evt.set_state_shutdown(&at->evt); + + clockevents_config_and_register(&at->evt, at->rate, 0xf, + (unsigned long)CLOCKSOURCE_MASK(56)); + + enable_irq(at->evt.irq); + + at->cs = (struct clocksource) { + .name = "arch_mmio_counter", + .rating = 300, + .read = arch_mmio_counter_read, + .mask = CLOCKSOURCE_MASK(56), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + }; + + clocksource_register_hz(&at->cs, at->rate); +} + +static int arch_timer_mmio_frame_register(struct platform_device *pdev, + struct arch_timer_mem_frame *frame) +{ + struct arch_timer *at = platform_get_drvdata(pdev); + struct device_node *np = pdev->dev.of_node; + int ret, irq; + u32 rate; + + if (!devm_request_mem_region(&pdev->dev, frame->cntbase, frame->size, + "arch_mem_timer")) + return -EBUSY; + + at->base = devm_ioremap(&pdev->dev, frame->cntbase, frame->size); + if (!at->base) { + dev_err(&pdev->dev, "Can't map frame's registers\n"); + return -ENXIO; + } + + /* + * Allow "clock-frequency" to override the probed rate. If neither + * lead to something useful, use the CPU timer frequency as the + * fallback. The nice thing about that last point is that we woudn't + * made it here if we didn't have a valid frequency. + */ + rate = readl_relaxed(at->base + CNTFRQ); + + if (!np || of_property_read_u32(np, "clock-frequency", &at->rate)) + at->rate = rate; + + if (!at->rate) + at->rate = arch_timer_get_rate(); + + irq = at->access == VIRT_ACCESS ? 
frame->virt_irq : frame->phys_irq; + ret = devm_request_irq(&pdev->dev, irq, arch_timer_mmio_handler, + IRQF_TIMER | IRQF_NO_AUTOEN, "arch_mem_timer", + &at->evt); + if (ret) { + dev_err(&pdev->dev, "Failed to request mem timer irq\n"); + return ret; + } + + /* Afer this point, we're not allowed to fail anymore */ + arch_timer_mmio_setup(at, irq); + return 0; +} + +static int of_populate_gt_block(struct platform_device *pdev, + struct arch_timer *at) +{ + struct resource res; + + if (of_address_to_resource(pdev->dev.of_node, 0, &res)) + return -EINVAL; + + at->gt_block->cntctlbase = res.start; + at->gt_block->size = resource_size(&res); + + for_each_available_child_of_node_scoped(pdev->dev.of_node, frame_node) { + struct arch_timer_mem_frame *frame; + u32 n; + + if (of_property_read_u32(frame_node, "frame-number", &n)) { + dev_err(&pdev->dev, FW_BUG "Missing frame-number\n"); + return -EINVAL; + } + if (n >= ARCH_TIMER_MEM_MAX_FRAMES) { + dev_err(&pdev->dev, + FW_BUG "Wrong frame-number, only 0-%u are permitted\n", + ARCH_TIMER_MEM_MAX_FRAMES - 1); + return -EINVAL; + } + + frame = &at->gt_block->frame[n]; + + if (frame->valid) { + dev_err(&pdev->dev, FW_BUG "Duplicated frame-number\n"); + return -EINVAL; + } + + if (of_address_to_resource(frame_node, 0, &res)) + return -EINVAL; + + frame->cntbase = res.start; + frame->size = resource_size(&res); + + frame->phys_irq = irq_of_parse_and_map(frame_node, 0); + frame->virt_irq = irq_of_parse_and_map(frame_node, 1); + + frame->valid = true; + } + + return 0; +} + +static int arch_timer_mmio_probe(struct platform_device *pdev) +{ + struct arch_timer_mem_frame *frame; + struct arch_timer *at; + struct device_node *np; + int ret; + + np = pdev->dev.of_node; + + at = devm_kmalloc(&pdev->dev, sizeof(*at), GFP_KERNEL | __GFP_ZERO); + if (!at) + return -ENOMEM; + + if (np) { + at->gt_block = devm_kmalloc(&pdev->dev, sizeof(*at->gt_block), + GFP_KERNEL | __GFP_ZERO); + if (!at->gt_block) + return -ENOMEM; + ret = of_populate_gt_block(pdev, at); + if (ret) + return ret; + } else { + at->gt_block = dev_get_platdata(&pdev->dev); + } + + platform_set_drvdata(pdev, at); + + frame = find_best_frame(pdev); + if (!frame) { + dev_err(&pdev->dev, + "Unable to find a suitable frame in timer @ %pa\n", + &at->gt_block->cntctlbase); + return -EINVAL; + } + + ret = arch_timer_mmio_frame_register(pdev, frame); + if (!ret) + dev_info(&pdev->dev, + "mmio timer running at %lu.%02luMHz (%s)\n", + (unsigned long)at->rate / 1000000, + (unsigned long)(at->rate / 10000) % 100, + at->access == VIRT_ACCESS ? 
"virt" : "phys"); + + return ret; +} + +static const struct of_device_id arch_timer_mmio_of_table[] = { + { .compatible = "arm,armv7-timer-mem", }, + {} +}; + +static struct platform_driver arch_timer_mmio_drv = { + .driver = { + .name = "arch-timer-mmio", + .of_match_table = arch_timer_mmio_of_table, + }, + .probe = arch_timer_mmio_probe, +}; +builtin_platform_driver(arch_timer_mmio_drv); + +static struct platform_driver arch_timer_mmio_acpi_drv = { + .driver = { + .name = "gtdt-arm-mmio-timer", + }, + .probe = arch_timer_mmio_probe, +}; +builtin_platform_driver(arch_timer_mmio_acpi_drv); diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 2d86bbc2764a..5e3d6bb7e437 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -263,14 +263,13 @@ static void __init gt_delay_timer_init(void) register_current_timer_delay(>_delay_timer); } -static int __init gt_clocksource_init(void) +static int __init gt_clocksource_init(unsigned int psv) { writel(0, gt_base + GT_CONTROL); writel(0, gt_base + GT_COUNTER0); writel(0, gt_base + GT_COUNTER1); /* set prescaler and enable timer on all the cores */ - writel(FIELD_PREP(GT_CONTROL_PRESCALER_MASK, - CONFIG_ARM_GT_INITIAL_PRESCALER_VAL - 1) | + writel(FIELD_PREP(GT_CONTROL_PRESCALER_MASK, psv - 1) | GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); #ifdef CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK @@ -338,11 +337,45 @@ static int gt_clk_rate_change_cb(struct notifier_block *nb, return NOTIFY_DONE; } +struct gt_prescaler_config { + const char *compatible; + unsigned long prescaler; +}; + +static const struct gt_prescaler_config gt_prescaler_configs[] = { + /* + * On am43 the global timer clock is a child of the clock used for CPU + * OPPs, so the initial prescaler has to be compatible with all OPPs + * which are 300, 600, 720, 800 and 1000 with a fixed divider of 2, this + * gives us a GCD of 10. Initial frequency is 1000, so the prescaler is + * 50. 
+ */ + { .compatible = "ti,am43", .prescaler = 50 }, + { .compatible = "xlnx,zynq-7000", .prescaler = 2 }, + { .compatible = NULL } +}; + +static unsigned long gt_get_initial_prescaler_value(struct device_node *np) +{ + const struct gt_prescaler_config *config; + + if (CONFIG_ARM_GT_INITIAL_PRESCALER_VAL != 0) + return CONFIG_ARM_GT_INITIAL_PRESCALER_VAL; + + for (config = gt_prescaler_configs; config->compatible; config++) { + if (of_machine_is_compatible(config->compatible)) + return config->prescaler; + } + + return 1; +} + static int __init global_timer_of_register(struct device_node *np) { struct clk *gt_clk; static unsigned long gt_clk_rate; int err; + unsigned long psv; /* * In A9 r2p0 the comparators for each processor with the global timer @@ -378,8 +411,9 @@ static int __init global_timer_of_register(struct device_node *np) goto out_unmap; } + psv = gt_get_initial_prescaler_value(np); gt_clk_rate = clk_get_rate(gt_clk); - gt_target_rate = gt_clk_rate / CONFIG_ARM_GT_INITIAL_PRESCALER_VAL; + gt_target_rate = gt_clk_rate / psv; gt_clk_rate_change_nb.notifier_call = gt_clk_rate_change_cb; err = clk_notifier_register(gt_clk, >_clk_rate_change_nb); @@ -404,7 +438,7 @@ static int __init global_timer_of_register(struct device_node *np) } /* Register and immediately configure the timer on the boot CPU */ - err = gt_clocksource_init(); + err = gt_clocksource_init(psv); if (err) goto out_irq; diff --git a/drivers/clocksource/clps711x-timer.c b/drivers/clocksource/clps711x-timer.c index e95fdc49c226..bbceb0289d45 100644 --- a/drivers/clocksource/clps711x-timer.c +++ b/drivers/clocksource/clps711x-timer.c @@ -78,24 +78,33 @@ static int __init clps711x_timer_init(struct device_node *np) unsigned int irq = irq_of_parse_and_map(np, 0); struct clk *clock = of_clk_get(np, 0); void __iomem *base = of_iomap(np, 0); + int ret = 0; if (!base) return -ENOMEM; - if (!irq) - return -EINVAL; - if (IS_ERR(clock)) - return PTR_ERR(clock); + if (!irq) { + ret = -EINVAL; + goto unmap_io; + } + if (IS_ERR(clock)) { + ret = PTR_ERR(clock); + goto unmap_io; + } switch (of_alias_get_id(np, "timer")) { case CLPS711X_CLKSRC_CLOCKSOURCE: clps711x_clksrc_init(clock, base); break; case CLPS711X_CLKSRC_CLOCKEVENT: - return _clps711x_clkevt_init(clock, base, irq); + ret = _clps711x_clkevt_init(clock, base, irq); + break; default: - return -EINVAL; + ret = -EINVAL; + break; } - return 0; +unmap_io: + iounmap(base); + return ret; } TIMER_OF_DECLARE(clps711x, "cirrus,ep7209-timer", clps711x_timer_init); diff --git a/drivers/clocksource/ingenic-sysost.c b/drivers/clocksource/ingenic-sysost.c index cb6fc2f152d4..e79cfb0b8e05 100644 --- a/drivers/clocksource/ingenic-sysost.c +++ b/drivers/clocksource/ingenic-sysost.c @@ -127,18 +127,23 @@ static u8 ingenic_ost_get_prescale(unsigned long rate, unsigned long req_rate) return 2; /* /16 divider */ } -static long ingenic_ost_round_rate(struct clk_hw *hw, unsigned long req_rate, - unsigned long *parent_rate) +static int ingenic_ost_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { - unsigned long rate = *parent_rate; + unsigned long rate = req->best_parent_rate; u8 prescale; - if (req_rate > rate) - return rate; + if (req->rate > rate) { + req->rate = rate; - prescale = ingenic_ost_get_prescale(rate, req_rate); + return 0; + } + + prescale = ingenic_ost_get_prescale(rate, req->rate); - return rate >> (prescale * 2); + req->rate = rate >> (prescale * 2); + + return 0; } static int ingenic_ost_percpu_timer_set_rate(struct clk_hw *hw, unsigned long req_rate, @@ 
-175,14 +180,14 @@ static int ingenic_ost_global_timer_set_rate(struct clk_hw *hw, unsigned long re static const struct clk_ops ingenic_ost_percpu_timer_ops = { .recalc_rate = ingenic_ost_percpu_timer_recalc_rate, - .round_rate = ingenic_ost_round_rate, - .set_rate = ingenic_ost_percpu_timer_set_rate, + .determine_rate = ingenic_ost_determine_rate, + .set_rate = ingenic_ost_percpu_timer_set_rate, }; static const struct clk_ops ingenic_ost_global_timer_ops = { .recalc_rate = ingenic_ost_global_timer_recalc_rate, - .round_rate = ingenic_ost_round_rate, - .set_rate = ingenic_ost_global_timer_set_rate, + .determine_rate = ingenic_ost_determine_rate, + .set_rate = ingenic_ost_global_timer_set_rate, }; static const char * const ingenic_ost_clk_parents[] = { "ext" }; diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c index c3536fffbe9a..5a99801a1657 100644 --- a/drivers/clocksource/scx200_hrt.c +++ b/drivers/clocksource/scx200_hrt.c @@ -52,6 +52,7 @@ static struct clocksource cs_hrt = { .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, /* mult, shift are set based on mhz27 flag */ + .owner = THIS_MODULE, }; static int __init init_hrt_clocksource(void) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index b72b36e0abed..385eb94bbe7c 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -578,37 +578,74 @@ static irqreturn_t sh_cmt_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag) +static int sh_cmt_start_clocksource(struct sh_cmt_channel *ch) { int ret = 0; unsigned long flags; - if (flag & FLAG_CLOCKSOURCE) - pm_runtime_get_sync(&ch->cmt->pdev->dev); + pm_runtime_get_sync(&ch->cmt->pdev->dev); raw_spin_lock_irqsave(&ch->lock, flags); - if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { - if (flag & FLAG_CLOCKEVENT) - pm_runtime_get_sync(&ch->cmt->pdev->dev); + if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) ret = sh_cmt_enable(ch); - } if (ret) goto out; - ch->flags |= flag; + + ch->flags |= FLAG_CLOCKSOURCE; /* setup timeout if no clockevent */ - if (ch->cmt->num_channels == 1 && - flag == FLAG_CLOCKSOURCE && (!(ch->flags & FLAG_CLOCKEVENT))) + if (ch->cmt->num_channels == 1 && !(ch->flags & FLAG_CLOCKEVENT)) __sh_cmt_set_next(ch, ch->max_match_value); +out: + raw_spin_unlock_irqrestore(&ch->lock, flags); + + return ret; +} + +static void sh_cmt_stop_clocksource(struct sh_cmt_channel *ch) +{ + unsigned long flags; + unsigned long f; + + raw_spin_lock_irqsave(&ch->lock, flags); + + f = ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE); + + ch->flags &= ~FLAG_CLOCKSOURCE; + + if (f && !(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) + sh_cmt_disable(ch); + + raw_spin_unlock_irqrestore(&ch->lock, flags); + + pm_runtime_put(&ch->cmt->pdev->dev); +} + +static int sh_cmt_start_clockevent(struct sh_cmt_channel *ch) +{ + int ret = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&ch->lock, flags); + + if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { + pm_runtime_get_sync(&ch->cmt->pdev->dev); + ret = sh_cmt_enable(ch); + } + + if (ret) + goto out; + + ch->flags |= FLAG_CLOCKEVENT; out: raw_spin_unlock_irqrestore(&ch->lock, flags); return ret; } -static void sh_cmt_stop(struct sh_cmt_channel *ch, unsigned long flag) +static void sh_cmt_stop_clockevent(struct sh_cmt_channel *ch) { unsigned long flags; unsigned long f; @@ -616,22 +653,19 @@ static void sh_cmt_stop(struct sh_cmt_channel *ch, unsigned long 
flag) raw_spin_lock_irqsave(&ch->lock, flags); f = ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE); - ch->flags &= ~flag; + + ch->flags &= ~FLAG_CLOCKEVENT; if (f && !(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { sh_cmt_disable(ch); - if (flag & FLAG_CLOCKEVENT) - pm_runtime_put(&ch->cmt->pdev->dev); + pm_runtime_put(&ch->cmt->pdev->dev); } /* adjust the timeout to maximum if only clocksource left */ - if ((flag == FLAG_CLOCKEVENT) && (ch->flags & FLAG_CLOCKSOURCE)) + if (ch->flags & FLAG_CLOCKSOURCE) __sh_cmt_set_next(ch, ch->max_match_value); raw_spin_unlock_irqrestore(&ch->lock, flags); - - if (flag & FLAG_CLOCKSOURCE) - pm_runtime_put(&ch->cmt->pdev->dev); } static struct sh_cmt_channel *cs_to_sh_cmt(struct clocksource *cs) @@ -672,7 +706,7 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs) ch->total_cycles = 0; - ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE); + ret = sh_cmt_start_clocksource(ch); if (!ret) ch->cs_enabled = true; @@ -685,7 +719,7 @@ static void sh_cmt_clocksource_disable(struct clocksource *cs) WARN_ON(!ch->cs_enabled); - sh_cmt_stop(ch, FLAG_CLOCKSOURCE); + sh_cmt_stop_clocksource(ch); ch->cs_enabled = false; } @@ -696,7 +730,7 @@ static void sh_cmt_clocksource_suspend(struct clocksource *cs) if (!ch->cs_enabled) return; - sh_cmt_stop(ch, FLAG_CLOCKSOURCE); + sh_cmt_stop_clocksource(ch); dev_pm_genpd_suspend(&ch->cmt->pdev->dev); } @@ -708,7 +742,7 @@ static void sh_cmt_clocksource_resume(struct clocksource *cs) return; dev_pm_genpd_resume(&ch->cmt->pdev->dev); - sh_cmt_start(ch, FLAG_CLOCKSOURCE); + sh_cmt_start_clocksource(ch); } static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch, @@ -740,7 +774,7 @@ static struct sh_cmt_channel *ced_to_sh_cmt(struct clock_event_device *ced) static void sh_cmt_clock_event_start(struct sh_cmt_channel *ch, int periodic) { - sh_cmt_start(ch, FLAG_CLOCKEVENT); + sh_cmt_start_clockevent(ch); if (periodic) sh_cmt_set_next(ch, ((ch->cmt->rate + HZ/2) / HZ) - 1); @@ -752,7 +786,7 @@ static int sh_cmt_clock_event_shutdown(struct clock_event_device *ced) { struct sh_cmt_channel *ch = ced_to_sh_cmt(ced); - sh_cmt_stop(ch, FLAG_CLOCKEVENT); + sh_cmt_stop_clockevent(ch); return 0; } @@ -763,7 +797,7 @@ static int sh_cmt_clock_event_set_state(struct clock_event_device *ced, /* deal with old setting first */ if (clockevent_state_oneshot(ced) || clockevent_state_periodic(ced)) - sh_cmt_stop(ch, FLAG_CLOCKEVENT); + sh_cmt_stop_clockevent(ch); dev_info(&ch->cmt->pdev->dev, "ch%u: used for %s clock events\n", ch->index, periodic ? 
"periodic" : "oneshot"); diff --git a/drivers/clocksource/timer-cs5535.c b/drivers/clocksource/timer-cs5535.c index d47acfe848ae..8af666c39890 100644 --- a/drivers/clocksource/timer-cs5535.c +++ b/drivers/clocksource/timer-cs5535.c @@ -101,6 +101,7 @@ static struct clock_event_device cs5535_clockevent = { .tick_resume = mfgpt_shutdown, .set_next_event = mfgpt_next_event, .rating = 250, + .owner = THIS_MODULE, }; static irqreturn_t mfgpt_tick(int irq, void *dev_id) diff --git a/drivers/clocksource/timer-econet-en751221.c b/drivers/clocksource/timer-econet-en751221.c index 3b449fdaafee..4008076b1a21 100644 --- a/drivers/clocksource/timer-econet-en751221.c +++ b/drivers/clocksource/timer-econet-en751221.c @@ -146,7 +146,7 @@ static int __init cevt_init(struct device_node *np) for_each_possible_cpu(i) { struct clock_event_device *cd = &per_cpu(econet_timer_pcpu, i); - cd->rating = 310, + cd->rating = 310; cd->features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_PERCPU; diff --git a/drivers/clocksource/timer-nxp-pit.c b/drivers/clocksource/timer-nxp-pit.c new file mode 100644 index 000000000000..2d0a3554b6bf --- /dev/null +++ b/drivers/clocksource/timer-nxp-pit.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2012-2013 Freescale Semiconductor, Inc. + * Copyright 2018,2021-2025 NXP + */ +#include <linux/interrupt.h> +#include <linux/clockchips.h> +#include <linux/cpuhotplug.h> +#include <linux/clk.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/sched_clock.h> +#include <linux/platform_device.h> + +/* + * Each pit takes 0x10 Bytes register space + */ +#define PIT0_OFFSET 0x100 +#define PIT_CH(n) (PIT0_OFFSET + 0x10 * (n)) + +#define PITMCR(__base) (__base) + +#define PITMCR_FRZ BIT(0) +#define PITMCR_MDIS BIT(1) + +#define PITLDVAL(__base) (__base) +#define PITTCTRL(__base) ((__base) + 0x08) + +#define PITCVAL_OFFSET 0x04 +#define PITCVAL(__base) ((__base) + 0x04) + +#define PITTCTRL_TEN BIT(0) +#define PITTCTRL_TIE BIT(1) + +#define PITTFLG(__base) ((__base) + 0x0c) + +#define PITTFLG_TIF BIT(0) + +struct pit_timer { + void __iomem *clksrc_base; + void __iomem *clkevt_base; + struct clock_event_device ced; + struct clocksource cs; + int rate; +}; + +struct pit_timer_data { + int max_pit_instances; +}; + +static DEFINE_PER_CPU(struct pit_timer *, pit_timers); + +/* + * Global structure for multiple PITs initialization + */ +static int pit_instances; +static int max_pit_instances = 1; + +static void __iomem *sched_clock_base; + +static inline struct pit_timer *ced_to_pit(struct clock_event_device *ced) +{ + return container_of(ced, struct pit_timer, ced); +} + +static inline struct pit_timer *cs_to_pit(struct clocksource *cs) +{ + return container_of(cs, struct pit_timer, cs); +} + +static inline void pit_module_enable(void __iomem *base) +{ + writel(0, PITMCR(base)); +} + +static inline void pit_module_disable(void __iomem *base) +{ + writel(PITMCR_MDIS, PITMCR(base)); +} + +static inline void pit_timer_enable(void __iomem *base, bool tie) +{ + u32 val = PITTCTRL_TEN | (tie ? 
PITTCTRL_TIE : 0); + + writel(val, PITTCTRL(base)); +} + +static inline void pit_timer_disable(void __iomem *base) +{ + writel(0, PITTCTRL(base)); +} + +static inline void pit_timer_set_counter(void __iomem *base, unsigned int cnt) +{ + writel(cnt, PITLDVAL(base)); +} + +static inline void pit_timer_irqack(struct pit_timer *pit) +{ + writel(PITTFLG_TIF, PITTFLG(pit->clkevt_base)); +} + +static u64 notrace pit_read_sched_clock(void) +{ + return ~readl(sched_clock_base); +} + +static u64 pit_timer_clocksource_read(struct clocksource *cs) +{ + struct pit_timer *pit = cs_to_pit(cs); + + return (u64)~readl(PITCVAL(pit->clksrc_base)); +} + +static int pit_clocksource_init(struct pit_timer *pit, const char *name, + void __iomem *base, unsigned long rate) +{ + /* + * The channels 0 and 1 can be chained to build a 64-bit + * timer. Let's use the channel 2 as a clocksource and leave + * the channels 0 and 1 unused for anyone else who needs them + */ + pit->clksrc_base = base + PIT_CH(2); + pit->cs.name = name; + pit->cs.rating = 300; + pit->cs.read = pit_timer_clocksource_read; + pit->cs.mask = CLOCKSOURCE_MASK(32); + pit->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; + + /* set the max load value and start the clock source counter */ + pit_timer_disable(pit->clksrc_base); + pit_timer_set_counter(pit->clksrc_base, ~0); + pit_timer_enable(pit->clksrc_base, 0); + + sched_clock_base = pit->clksrc_base + PITCVAL_OFFSET; + sched_clock_register(pit_read_sched_clock, 32, rate); + + return clocksource_register_hz(&pit->cs, rate); +} + +static int pit_set_next_event(unsigned long delta, struct clock_event_device *ced) +{ + struct pit_timer *pit = ced_to_pit(ced); + + /* + * set a new value to PITLDVAL register will not restart the timer, + * to abort the current cycle and start a timer period with the new + * value, the timer must be disabled and enabled again. + * and the PITLAVAL should be set to delta minus one according to pit + * hardware requirement. + */ + pit_timer_disable(pit->clkevt_base); + pit_timer_set_counter(pit->clkevt_base, delta - 1); + pit_timer_enable(pit->clkevt_base, true); + + return 0; +} + +static int pit_shutdown(struct clock_event_device *ced) +{ + struct pit_timer *pit = ced_to_pit(ced); + + pit_timer_disable(pit->clkevt_base); + + return 0; +} + +static int pit_set_periodic(struct clock_event_device *ced) +{ + struct pit_timer *pit = ced_to_pit(ced); + + pit_set_next_event(pit->rate / HZ, ced); + + return 0; +} + +static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *ced = dev_id; + struct pit_timer *pit = ced_to_pit(ced); + + pit_timer_irqack(pit); + + /* + * pit hardware doesn't support oneshot, it will generate an interrupt + * and reload the counter value from PITLDVAL when PITCVAL reach zero, + * and start the counter again. So software need to disable the timer + * to stop the counter loop in ONESHOT mode. + */ + if (likely(clockevent_state_oneshot(ced))) + pit_timer_disable(pit->clkevt_base); + + ced->event_handler(ced); + + return IRQ_HANDLED; +} + +static int pit_clockevent_per_cpu_init(struct pit_timer *pit, const char *name, + void __iomem *base, unsigned long rate, + int irq, unsigned int cpu) +{ + int ret; + + /* + * The channels 0 and 1 can be chained to build a 64-bit + * timer. 
Let's use the channel 3 as a clockevent and leave + * the channels 0 and 1 unused for anyone else who needs them + */ + pit->clkevt_base = base + PIT_CH(3); + pit->rate = rate; + + pit_timer_disable(pit->clkevt_base); + + pit_timer_irqack(pit); + + ret = request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_NOBALANCING, + name, &pit->ced); + if (ret) + return ret; + + pit->ced.cpumask = cpumask_of(cpu); + pit->ced.irq = irq; + + pit->ced.name = name; + pit->ced.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; + pit->ced.set_state_shutdown = pit_shutdown; + pit->ced.set_state_periodic = pit_set_periodic; + pit->ced.set_next_event = pit_set_next_event; + pit->ced.rating = 300; + + per_cpu(pit_timers, cpu) = pit; + + return 0; +} + +static void pit_clockevent_per_cpu_exit(struct pit_timer *pit, unsigned int cpu) +{ + pit_timer_disable(pit->clkevt_base); + free_irq(pit->ced.irq, &pit->ced); + per_cpu(pit_timers, cpu) = NULL; +} + +static int pit_clockevent_starting_cpu(unsigned int cpu) +{ + struct pit_timer *pit = per_cpu(pit_timers, cpu); + int ret; + + if (!pit) + return 0; + + ret = irq_force_affinity(pit->ced.irq, cpumask_of(cpu)); + if (ret) { + pit_clockevent_per_cpu_exit(pit, cpu); + return ret; + } + + /* + * The value for the LDVAL register trigger is calculated as: + * LDVAL trigger = (period / clock period) - 1 + * The pit is a 32-bit down count timer, when the counter value + * reaches 0, it will generate an interrupt, thus the minimal + * LDVAL trigger value is 1. And then the min_delta is + * minimal LDVAL trigger value + 1, and the max_delta is full 32-bit. + */ + clockevents_config_and_register(&pit->ced, pit->rate, 2, 0xffffffff); + + return 0; +} + +static int pit_timer_init(struct device_node *np) +{ + struct pit_timer *pit; + struct clk *pit_clk; + void __iomem *timer_base; + const char *name = of_node_full_name(np); + unsigned long clk_rate; + int irq, ret; + + pit = kzalloc(sizeof(*pit), GFP_KERNEL); + if (!pit) + return -ENOMEM; + + ret = -ENXIO; + timer_base = of_iomap(np, 0); + if (!timer_base) { + pr_err("Failed to iomap\n"); + goto out_kfree; + } + + ret = -EINVAL; + irq = irq_of_parse_and_map(np, 0); + if (irq <= 0) { + pr_err("Failed to irq_of_parse_and_map\n"); + goto out_iounmap; + } + + pit_clk = of_clk_get(np, 0); + if (IS_ERR(pit_clk)) { + ret = PTR_ERR(pit_clk); + goto out_irq_dispose_mapping; + } + + ret = clk_prepare_enable(pit_clk); + if (ret) + goto out_clk_put; + + clk_rate = clk_get_rate(pit_clk); + + pit_module_disable(timer_base); + + ret = pit_clocksource_init(pit, name, timer_base, clk_rate); + if (ret) { + pr_err("Failed to initialize clocksource '%pOF'\n", np); + goto out_pit_module_disable; + } + + ret = pit_clockevent_per_cpu_init(pit, name, timer_base, clk_rate, irq, pit_instances); + if (ret) { + pr_err("Failed to initialize clockevent '%pOF'\n", np); + goto out_pit_clocksource_unregister; + } + + /* enable the pit module */ + pit_module_enable(timer_base); + + pit_instances++; + + if (pit_instances == max_pit_instances) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "PIT timer:starting", + pit_clockevent_starting_cpu, NULL); + if (ret < 0) + goto out_pit_clocksource_unregister; + } + + return 0; + +out_pit_clocksource_unregister: + clocksource_unregister(&pit->cs); +out_pit_module_disable: + pit_module_disable(timer_base); + clk_disable_unprepare(pit_clk); +out_clk_put: + clk_put(pit_clk); +out_irq_dispose_mapping: + irq_dispose_mapping(irq); +out_iounmap: + iounmap(timer_base); +out_kfree: + kfree(pit); + + return ret; +} + 
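
For reference, the LDVAL arithmetic described in the comments above (pit_set_next_event() loading delta - 1, pit_set_periodic() reloading rate / HZ, and the min/max delta passed to clockevents_config_and_register()) can be checked with a small standalone sketch; the input clock rate and tick rate below are hypothetical example values, not taken from the driver:

#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_RATE	133000000UL	/* hypothetical PIT input clock, in Hz */
#define EXAMPLE_HZ	250UL		/* hypothetical tick rate */

/* Mirrors pit_set_next_event(): the down counter is loaded with delta - 1. */
static uint32_t ldval_for_delta(unsigned long delta)
{
	return (uint32_t)(delta - 1);
}

int main(void)
{
	/* Smallest delta accepted by clockevents_config_and_register() above is 2. */
	printf("min delta 2          -> LDVAL %u\n", ldval_for_delta(2));
	/* Periodic mode reloads rate / HZ ticks, as in pit_set_periodic(). */
	printf("periodic delta %lu -> LDVAL %u\n", EXAMPLE_RATE / EXAMPLE_HZ,
	       ldval_for_delta(EXAMPLE_RATE / EXAMPLE_HZ));
	/* Largest delta is the full 32-bit range. */
	printf("max delta 0xffffffff -> LDVAL 0x%x\n", ldval_for_delta(0xffffffffUL));
	return 0;
}
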
+static int pit_timer_probe(struct platform_device *pdev) +{ + const struct pit_timer_data *pit_timer_data; + + pit_timer_data = of_device_get_match_data(&pdev->dev); + if (pit_timer_data) + max_pit_instances = pit_timer_data->max_pit_instances; + + return pit_timer_init(pdev->dev.of_node); +} + +static struct pit_timer_data s32g2_data = { .max_pit_instances = 2 }; + +static const struct of_device_id pit_timer_of_match[] = { + { .compatible = "nxp,s32g2-pit", .data = &s32g2_data }, + { } +}; +MODULE_DEVICE_TABLE(of, pit_timer_of_match); + +static struct platform_driver nxp_pit_driver = { + .driver = { + .name = "nxp-pit", + .of_match_table = pit_timer_of_match, + }, + .probe = pit_timer_probe, +}; +module_platform_driver(nxp_pit_driver); + +TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); diff --git a/drivers/clocksource/timer-nxp-stm.c b/drivers/clocksource/timer-nxp-stm.c index d7ccf9001729..bbc40623728f 100644 --- a/drivers/clocksource/timer-nxp-stm.c +++ b/drivers/clocksource/timer-nxp-stm.c @@ -201,6 +201,7 @@ static int __init nxp_stm_clocksource_init(struct device *dev, struct stm_timer stm_timer->cs.resume = nxp_stm_clocksource_resume; stm_timer->cs.mask = CLOCKSOURCE_MASK(32); stm_timer->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; + stm_timer->cs.owner = THIS_MODULE; ret = clocksource_register_hz(&stm_timer->cs, stm_timer->rate); if (ret) @@ -314,6 +315,7 @@ static int __init nxp_stm_clockevent_per_cpu_init(struct device *dev, struct stm stm_timer->ced.cpumask = cpumask_of(cpu); stm_timer->ced.rating = 460; stm_timer->ced.irq = irq; + stm_timer->ced.owner = THIS_MODULE; per_cpu(stm_timers, cpu) = stm_timer; diff --git a/drivers/clocksource/timer-rtl-otto.c b/drivers/clocksource/timer-rtl-otto.c index 8a3068b36e75..6113d2fdd4de 100644 --- a/drivers/clocksource/timer-rtl-otto.c +++ b/drivers/clocksource/timer-rtl-otto.c @@ -38,14 +38,13 @@ #define RTTM_BIT_COUNT 28 #define RTTM_MIN_DELTA 8 #define RTTM_MAX_DELTA CLOCKSOURCE_MASK(28) +#define RTTM_MAX_DIVISOR GENMASK(15, 0) /* - * Timers are derived from the LXB clock frequency. Usually this is a fixed - * multiple of the 25 MHz oscillator. The 930X SOC is an exception from that. - * Its LXB clock has only dividers and uses the switch PLL of 2.45 GHz as its - * base. The only meaningful frequencies we can achieve from that are 175.000 - * MHz and 153.125 MHz. The greatest common divisor of all explained possible - * speeds is 3125000. Pin the timers to this 3.125 MHz reference frequency. + * Timers are derived from the lexra bus (LXB) clock frequency. This is 175 MHz + * on RTL930x and 200 MHz on the other platforms. With 3.125 MHz choose a common + * divisor to have enough range and detail. This provides comparability between + * the different platforms. */ #define RTTM_TICKS_PER_SEC 3125000 @@ -55,11 +54,6 @@ struct rttm_cs { }; /* Simple internal register functions */ -static inline void rttm_set_counter(void __iomem *base, unsigned int counter) -{ - iowrite32(counter, base + RTTM_CNT); -} - static inline unsigned int rttm_get_counter(void __iomem *base) { return ioread32(base + RTTM_CNT); @@ -112,6 +106,22 @@ static irqreturn_t rttm_timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static void rttm_bounce_timer(void __iomem *base, u32 mode) +{ + /* + * When a running timer has less than ~5us left, a stop/start sequence + * might fail. While the details are unknown the most evident effect is + * that the subsequent interrupt will not be fired. 
+ * + * As a workaround issue an intermediate restart with a very slow + * frequency of ~3kHz keeping the target counter (>=8). So the follow + * up restart will always be issued outside the critical window. + */ + + rttm_disable_timer(base); + rttm_enable_timer(base, mode, RTTM_MAX_DIVISOR); +} + static void rttm_stop_timer(void __iomem *base) { rttm_disable_timer(base); @@ -120,7 +130,6 @@ static void rttm_stop_timer(void __iomem *base) static void rttm_start_timer(struct timer_of *to, u32 mode) { - rttm_set_counter(to->of_base.base, 0); rttm_enable_timer(to->of_base.base, mode, to->of_clk.rate / RTTM_TICKS_PER_SEC); } @@ -129,7 +138,8 @@ static int rttm_next_event(unsigned long delta, struct clock_event_device *clkev struct timer_of *to = to_timer_of(clkevt); RTTM_DEBUG(to->of_base.base); - rttm_stop_timer(to->of_base.base); + rttm_bounce_timer(to->of_base.base, RTTM_CTRL_COUNTER); + rttm_disable_timer(to->of_base.base); rttm_set_period(to->of_base.base, delta); rttm_start_timer(to, RTTM_CTRL_COUNTER); @@ -141,7 +151,8 @@ static int rttm_state_oneshot(struct clock_event_device *clkevt) struct timer_of *to = to_timer_of(clkevt); RTTM_DEBUG(to->of_base.base); - rttm_stop_timer(to->of_base.base); + rttm_bounce_timer(to->of_base.base, RTTM_CTRL_COUNTER); + rttm_disable_timer(to->of_base.base); rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ); rttm_start_timer(to, RTTM_CTRL_COUNTER); @@ -153,7 +164,8 @@ static int rttm_state_periodic(struct clock_event_device *clkevt) struct timer_of *to = to_timer_of(clkevt); RTTM_DEBUG(to->of_base.base); - rttm_stop_timer(to->of_base.base); + rttm_bounce_timer(to->of_base.base, RTTM_CTRL_TIMER); + rttm_disable_timer(to->of_base.base); rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ); rttm_start_timer(to, RTTM_CTRL_TIMER); diff --git a/drivers/clocksource/timer-stm32-lp.c b/drivers/clocksource/timer-stm32-lp.c index 6e7944ffd7c0..c2a699f5c1dd 100644 --- a/drivers/clocksource/timer-stm32-lp.c +++ b/drivers/clocksource/timer-stm32-lp.c @@ -211,6 +211,7 @@ static void stm32_clkevent_lp_init(struct stm32_lp_private *priv, priv->clkevt.rating = STM32_LP_RATING; priv->clkevt.suspend = stm32_clkevent_lp_suspend; priv->clkevt.resume = stm32_clkevent_lp_resume; + priv->clkevt.owner = THIS_MODULE; clockevents_config_and_register(&priv->clkevt, rate, 0x1, STM32_LPTIM_MAX_ARR); diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c index 6b48a9006444..f827d3f98f60 100644 --- a/drivers/clocksource/timer-sun5i.c +++ b/drivers/clocksource/timer-sun5i.c @@ -185,6 +185,7 @@ static int sun5i_setup_clocksource(struct platform_device *pdev, cs->clksrc.read = sun5i_clksrc_read; cs->clksrc.mask = CLOCKSOURCE_MASK(32); cs->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + cs->clksrc.owner = THIS_MODULE; ret = clocksource_register_hz(&cs->clksrc, rate); if (ret) { @@ -214,6 +215,7 @@ static int sun5i_setup_clockevent(struct platform_device *pdev, ce->clkevt.rating = 340; ce->clkevt.irq = irq; ce->clkevt.cpumask = cpu_possible_mask; + ce->clkevt.owner = THIS_MODULE; /* Enable timer0 interrupt */ val = readl(base + TIMER_IRQ_EN_REG); diff --git a/drivers/clocksource/timer-tegra186.c b/drivers/clocksource/timer-tegra186.c index e5394f98a02e..355558893e5f 100644 --- a/drivers/clocksource/timer-tegra186.c +++ b/drivers/clocksource/timer-tegra186.c @@ -159,7 +159,7 @@ static void tegra186_wdt_enable(struct tegra186_wdt *wdt) tmr_writel(wdt->tmr, TMRCSSR_SRC_USEC, TMRCSSR); /* configure timer (system reset happens on the fifth expiration) */ - 
value = TMRCR_PTV(wdt->base.timeout * USEC_PER_SEC / 5) | + value = TMRCR_PTV(wdt->base.timeout * (USEC_PER_SEC / 5)) | TMRCR_PERIODIC | TMRCR_ENABLE; tmr_writel(wdt->tmr, value, TMRCR); @@ -231,7 +231,7 @@ static unsigned int tegra186_wdt_get_timeleft(struct watchdog_device *wdd) { struct tegra186_wdt *wdt = to_tegra186_wdt(wdd); u32 expiration, val; - u64 timeleft; + u32 timeleft; if (!watchdog_active(&wdt->base)) { /* return zero if the watchdog timer is not activated. */ @@ -266,21 +266,26 @@ static unsigned int tegra186_wdt_get_timeleft(struct watchdog_device *wdd) * Calculate the time remaining by adding the time for the * counter value to the time of the counter expirations that * remain. + * Note: Since wdt->base.timeout is bound to 255, the maximum + * value added to timeleft is + * 255 * (1,000,000 / 5) * 4 + * = 255 * 200,000 * 4 + * = 204,000,000 + * TMRSR_PCV is a 29-bit field. + * Its maximum value is 0x1fffffff = 536,870,911. + * 204,000,000 + 536,870,911 = 740,870,911 = 0x2C28CAFF. + * timeleft can therefore not overflow, and 64-bit calculations + * are not necessary. */ - timeleft += (((u64)wdt->base.timeout * USEC_PER_SEC) / 5) * (4 - expiration); + timeleft += (wdt->base.timeout * (USEC_PER_SEC / 5)) * (4 - expiration); /* * Convert the current counter value to seconds, - * rounding up to the nearest second. Cast u64 to - * u32 under the assumption that no overflow happens - * when coverting to seconds. + * rounding to the nearest second. */ - timeleft = DIV_ROUND_CLOSEST_ULL(timeleft, USEC_PER_SEC); + timeleft = DIV_ROUND_CLOSEST(timeleft, USEC_PER_SEC); - if (WARN_ON_ONCE(timeleft > U32_MAX)) - return U32_MAX; - - return lower_32_bits(timeleft); + return timeleft; } static const struct watchdog_ops tegra186_wdt_ops = { @@ -328,16 +333,12 @@ static struct tegra186_wdt *tegra186_wdt_create(struct tegra186_timer *tegra, wdt->base.parent = tegra->dev; err = watchdog_init_timeout(&wdt->base, 5, tegra->dev); - if (err < 0) { - dev_err(tegra->dev, "failed to initialize timeout: %d\n", err); + if (err < 0) return ERR_PTR(err); - } err = devm_watchdog_register_device(tegra->dev, &wdt->base); - if (err < 0) { - dev_err(tegra->dev, "failed to register WDT: %d\n", err); + if (err < 0) return ERR_PTR(err); - } return wdt; } @@ -373,6 +374,7 @@ static int tegra186_timer_tsc_init(struct tegra186_timer *tegra) tegra->tsc.read = tegra186_timer_tsc_read; tegra->tsc.mask = CLOCKSOURCE_MASK(56); tegra->tsc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->tsc.owner = THIS_MODULE; return clocksource_register_hz(&tegra->tsc, 31250000); } @@ -392,6 +394,7 @@ static int tegra186_timer_osc_init(struct tegra186_timer *tegra) tegra->osc.read = tegra186_timer_osc_read; tegra->osc.mask = CLOCKSOURCE_MASK(32); tegra->osc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->osc.owner = THIS_MODULE; return clocksource_register_hz(&tegra->osc, 38400000); } @@ -411,6 +414,7 @@ static int tegra186_timer_usec_init(struct tegra186_timer *tegra) tegra->usec.read = tegra186_timer_usec_read; tegra->usec.mask = CLOCKSOURCE_MASK(32); tegra->usec.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->usec.owner = THIS_MODULE; return clocksource_register_hz(&tegra->usec, USEC_PER_SEC); } diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index e9e32df6b566..793e7cdcb1b1 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -31,6 +31,7 @@ #include <linux/platform_data/dmtimer-omap.h> #include <clocksource/timer-ti-dm.h> +#include <linux/delay.h> /* * timer errata flags 
@@ -836,6 +837,48 @@ static int omap_dm_timer_set_match(struct omap_dm_timer *cookie, int enable, return 0; } +static int omap_dm_timer_set_cap(struct omap_dm_timer *cookie, + int autoreload, bool config_period) +{ + struct dmtimer *timer; + struct device *dev; + int rc; + u32 l; + + timer = to_dmtimer(cookie); + if (unlikely(!timer)) + return -EINVAL; + + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + /* + * 1. Select autoreload mode. TIMER_TCLR[1] AR bit. + * 2. TIMER_TCLR[14]: Sets the functionality of the TIMER IO pin. + * 3. TIMER_TCLR[13] : Capture mode select bit. + * 3. TIMER_TCLR[9-8] : Select transition capture mode. + */ + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + + if (autoreload) + l |= OMAP_TIMER_CTRL_AR; + + l |= OMAP_TIMER_CTRL_CAPTMODE | OMAP_TIMER_CTRL_GPOCFG; + + if (config_period == true) + l |= OMAP_TIMER_CTRL_TCM_LOWTOHIGH; /* Time Period config */ + else + l |= OMAP_TIMER_CTRL_TCM_BOTHEDGES; /* Duty Cycle config */ + + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + pm_runtime_put_sync(dev); + + return 0; +} + static int omap_dm_timer_set_pwm(struct omap_dm_timer *cookie, int def_on, int toggle, int trigger, int autoreload) { @@ -1023,23 +1066,92 @@ static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *cookie) return __omap_dm_timer_read_counter(timer); } +static inline unsigned int __omap_dm_timer_cap(struct dmtimer *timer, int idx) +{ + return idx == 0 ? dmtimer_read(timer, OMAP_TIMER_CAPTURE_REG) : + dmtimer_read(timer, OMAP_TIMER_CAPTURE2_REG); +} + static int omap_dm_timer_write_counter(struct omap_dm_timer *cookie, unsigned int value) { struct dmtimer *timer; + struct device *dev; timer = to_dmtimer(cookie); - if (unlikely(!timer || !atomic_read(&timer->enabled))) { - pr_err("%s: timer not available or enabled.\n", __func__); + if (unlikely(!timer)) { + pr_err("%s: timer not available.\n", __func__); return -EINVAL; } + dev = &timer->pdev->dev; + + pm_runtime_resume_and_get(dev); dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, value); + pm_runtime_put_sync(dev); /* Save the context */ timer->context.tcrr = value; return 0; } +/** + * omap_dm_timer_cap_counter() - Calculate the high count or period count depending on the + * configuration. + * @cookie:Pointer to OMAP DM timer + * @is_period:Whether to configure timer in period or duty cycle mode + * + * Return high count or period count if timer is enabled else appropriate error. + */ +static unsigned int omap_dm_timer_cap_counter(struct omap_dm_timer *cookie, bool is_period) +{ + struct dmtimer *timer; + unsigned int cap1 = 0; + unsigned int cap2 = 0; + u32 l, ret; + + timer = to_dmtimer(cookie); + if (unlikely(!timer || !atomic_read(&timer->enabled))) { + pr_err("%s:timer is not available or enabled.%p\n", __func__, (void *)timer); + return -EINVAL; + } + + /* Stop the timer */ + omap_dm_timer_stop(cookie); + + /* Clear the timer counter value to 0 */ + ret = omap_dm_timer_write_counter(cookie, 0); + if (ret) + return ret; + + /* Sets the timer capture configuration for period/duty cycle calculation */ + ret = omap_dm_timer_set_cap(cookie, true, is_period); + if (ret) { + pr_err("%s: Failed to set timer capture configuration.\n", __func__); + return ret; + } + /* Start the timer */ + omap_dm_timer_start(cookie); + + /* + * 1 sec delay is given so as to provide + * enough time to capture low frequency signals. 
+ */ + msleep(1000); + + cap1 = __omap_dm_timer_cap(timer, 0); + cap2 = __omap_dm_timer_cap(timer, 1); + + /* + * Clears the TCLR configuration. + * The start bit must be set to 1 as the timer is already in start mode. + */ + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + l &= ~(0xffff) | 0x1; + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + return (cap2-cap1); +} + static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) { struct dmtimer *timer = dev_get_drvdata(dev); @@ -1246,6 +1358,9 @@ static const struct omap_dm_timer_ops dmtimer_ops = { .write_counter = omap_dm_timer_write_counter, .read_status = omap_dm_timer_read_status, .write_status = omap_dm_timer_write_status, + .set_cap = omap_dm_timer_set_cap, + .get_cap_status = omap_dm_timer_get_pwm_status, + .read_cap = omap_dm_timer_cap_counter, }; static const struct dmtimer_platform_data omap3plus_pdata = { diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c deleted file mode 100644 index 911c92146eca..000000000000 --- a/drivers/clocksource/timer-vf-pit.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2012-2013 Freescale Semiconductor, Inc. - */ - -#include <linux/interrupt.h> -#include <linux/clockchips.h> -#include <linux/clk.h> -#include <linux/of_address.h> -#include <linux/of_irq.h> -#include <linux/sched_clock.h> - -/* - * Each pit takes 0x10 Bytes register space - */ -#define PITMCR 0x00 -#define PIT0_OFFSET 0x100 -#define PITn_OFFSET(n) (PIT0_OFFSET + 0x10 * (n)) -#define PITLDVAL 0x00 -#define PITCVAL 0x04 -#define PITTCTRL 0x08 -#define PITTFLG 0x0c - -#define PITMCR_MDIS (0x1 << 1) - -#define PITTCTRL_TEN (0x1 << 0) -#define PITTCTRL_TIE (0x1 << 1) -#define PITCTRL_CHN (0x1 << 2) - -#define PITTFLG_TIF 0x1 - -static void __iomem *clksrc_base; -static void __iomem *clkevt_base; -static unsigned long cycle_per_jiffy; - -static inline void pit_timer_enable(void) -{ - __raw_writel(PITTCTRL_TEN | PITTCTRL_TIE, clkevt_base + PITTCTRL); -} - -static inline void pit_timer_disable(void) -{ - __raw_writel(0, clkevt_base + PITTCTRL); -} - -static inline void pit_irq_acknowledge(void) -{ - __raw_writel(PITTFLG_TIF, clkevt_base + PITTFLG); -} - -static u64 notrace pit_read_sched_clock(void) -{ - return ~__raw_readl(clksrc_base + PITCVAL); -} - -static int __init pit_clocksource_init(unsigned long rate) -{ - /* set the max load value and start the clock source counter */ - __raw_writel(0, clksrc_base + PITTCTRL); - __raw_writel(~0UL, clksrc_base + PITLDVAL); - __raw_writel(PITTCTRL_TEN, clksrc_base + PITTCTRL); - - sched_clock_register(pit_read_sched_clock, 32, rate); - return clocksource_mmio_init(clksrc_base + PITCVAL, "vf-pit", rate, - 300, 32, clocksource_mmio_readl_down); -} - -static int pit_set_next_event(unsigned long delta, - struct clock_event_device *unused) -{ - /* - * set a new value to PITLDVAL register will not restart the timer, - * to abort the current cycle and start a timer period with the new - * value, the timer must be disabled and enabled again. - * and the PITLAVAL should be set to delta minus one according to pit - * hardware requirement. 
- */ - pit_timer_disable(); - __raw_writel(delta - 1, clkevt_base + PITLDVAL); - pit_timer_enable(); - - return 0; -} - -static int pit_shutdown(struct clock_event_device *evt) -{ - pit_timer_disable(); - return 0; -} - -static int pit_set_periodic(struct clock_event_device *evt) -{ - pit_set_next_event(cycle_per_jiffy, evt); - return 0; -} - -static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - pit_irq_acknowledge(); - - /* - * pit hardware doesn't support oneshot, it will generate an interrupt - * and reload the counter value from PITLDVAL when PITCVAL reach zero, - * and start the counter again. So software need to disable the timer - * to stop the counter loop in ONESHOT mode. - */ - if (likely(clockevent_state_oneshot(evt))) - pit_timer_disable(); - - evt->event_handler(evt); - - return IRQ_HANDLED; -} - -static struct clock_event_device clockevent_pit = { - .name = "VF pit timer", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_state_shutdown = pit_shutdown, - .set_state_periodic = pit_set_periodic, - .set_next_event = pit_set_next_event, - .rating = 300, -}; - -static int __init pit_clockevent_init(unsigned long rate, int irq) -{ - __raw_writel(0, clkevt_base + PITTCTRL); - __raw_writel(PITTFLG_TIF, clkevt_base + PITTFLG); - - BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, - "VF pit timer", &clockevent_pit)); - - clockevent_pit.cpumask = cpumask_of(0); - clockevent_pit.irq = irq; - /* - * The value for the LDVAL register trigger is calculated as: - * LDVAL trigger = (period / clock period) - 1 - * The pit is a 32-bit down count timer, when the counter value - * reaches 0, it will generate an interrupt, thus the minimal - * LDVAL trigger value is 1. And then the min_delta is - * minimal LDVAL trigger value + 1, and the max_delta is full 32-bit. - */ - clockevents_config_and_register(&clockevent_pit, rate, 2, 0xffffffff); - - return 0; -} - -static int __init pit_timer_init(struct device_node *np) -{ - struct clk *pit_clk; - void __iomem *timer_base; - unsigned long clk_rate; - int irq, ret; - - timer_base = of_iomap(np, 0); - if (!timer_base) { - pr_err("Failed to iomap\n"); - return -ENXIO; - } - - /* - * PIT0 and PIT1 can be chained to build a 64-bit timer, - * so choose PIT2 as clocksource, PIT3 as clockevent device, - * and leave PIT0 and PIT1 unused for anyone else who needs them. 
- */ - clksrc_base = timer_base + PITn_OFFSET(2); - clkevt_base = timer_base + PITn_OFFSET(3); - - irq = irq_of_parse_and_map(np, 0); - if (irq <= 0) - return -EINVAL; - - pit_clk = of_clk_get(np, 0); - if (IS_ERR(pit_clk)) - return PTR_ERR(pit_clk); - - ret = clk_prepare_enable(pit_clk); - if (ret) - return ret; - - clk_rate = clk_get_rate(pit_clk); - cycle_per_jiffy = clk_rate / (HZ); - - /* enable the pit module */ - __raw_writel(~PITMCR_MDIS, timer_base + PITMCR); - - ret = pit_clocksource_init(clk_rate); - if (ret) - return ret; - - return pit_clockevent_init(clk_rate, irq); -} -TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 394484929dae..a9626b30044a 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ tee-dev.o \ platform-access.o \ dbc.o \ - hsti.o + hsti.o \ + sfs.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1c5a7189631e..9e21da0e298a 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -17,6 +17,7 @@ #include "psp-dev.h" #include "sev-dev.h" #include "tee-dev.h" +#include "sfs.h" #include "platform-access.h" #include "dbc.h" #include "hsti.h" @@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp) return 0; } +static int psp_check_sfs_support(struct psp_device *psp) +{ + /* Check if device supports SFS feature */ + if (!psp->capability.sfs) { + dev_dbg(psp->dev, "psp does not support SFS\n"); + return -ENODEV; + } + + return 0; +} + static int psp_init(struct psp_device *psp) { int ret; @@ -198,6 +210,12 @@ static int psp_init(struct psp_device *psp) return ret; } + if (!psp_check_sfs_support(psp)) { + ret = sfs_dev_init(psp); + if (ret) + return ret; + } + if (psp->vdata->platform_access) { ret = platform_access_dev_init(psp); if (ret) @@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp) tee_dev_destroy(psp); + sfs_dev_destroy(psp); + dbc_dev_destroy(psp); platform_access_dev_destroy(psp); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index e43ce87ede76..268c83f298cb 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -32,7 +32,8 @@ union psp_cap_register { unsigned int sev :1, tee :1, dbc_thru_ext :1, - rsvd1 :4, + sfs :1, + rsvd1 :3, security_reporting :1, fused_part :1, rsvd2 :1, @@ -68,6 +69,7 @@ struct psp_device { void *tee_data; void *platform_access_data; void *dbc_data; + void *sfs_data; union psp_cap_register capability; }; @@ -118,12 +120,16 @@ struct psp_ext_request { * @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC * @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC * @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC + * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP + * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package */ enum psp_sub_cmd { PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE, PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID, PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER, PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER, + PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS, + PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE, }; int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs, diff --git a/drivers/crypto/ccp/sev-dev.c 
b/drivers/crypto/ccp/sev-dev.c index 9f5ccc1720cb..65d6d0af140a 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -82,6 +82,21 @@ MODULE_FIRMWARE("amd/amd_sev_fam19h_model1xh.sbin"); /* 4th gen EPYC */ static bool psp_dead; static int psp_timeout; +enum snp_hv_fixed_pages_state { + ALLOCATED, + HV_FIXED, +}; + +struct snp_hv_fixed_pages_entry { + struct list_head list; + struct page *page; + unsigned int order; + bool free; + enum snp_hv_fixed_pages_state page_state; +}; + +static LIST_HEAD(snp_hv_fixed_pages); + /* Trusted Memory Region (TMR): * The TMR is a 1MB area that must be 1MB aligned. Use the page allocator * to allocate the memory, which will return aligned memory for the specified @@ -1073,6 +1088,165 @@ static void snp_set_hsave_pa(void *arg) wrmsrq(MSR_VM_HSAVE_PA, 0); } +/* Hypervisor Fixed pages API interface */ +static void snp_hv_fixed_pages_state_update(struct sev_device *sev, + enum snp_hv_fixed_pages_state page_state) +{ + struct snp_hv_fixed_pages_entry *entry; + + /* List is protected by sev_cmd_mutex */ + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + list_for_each_entry(entry, &snp_hv_fixed_pages, list) + entry->page_state = page_state; +} + +/* + * Allocate HV_FIXED pages in 2MB aligned sizes to ensure the whole + * 2MB pages are marked as HV_FIXED. + */ +struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages) +{ + struct psp_device *psp_master = psp_get_master_device(); + struct snp_hv_fixed_pages_entry *entry; + struct sev_device *sev; + unsigned int order; + struct page *page; + + if (!psp_master || !psp_master->sev_data) + return NULL; + + sev = psp_master->sev_data; + + order = get_order(PMD_SIZE * num_2mb_pages); + + /* + * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list + * also needs to be protected using the same mutex. + */ + guard(mutex)(&sev_cmd_mutex); + + /* + * This API uses SNP_INIT_EX to transition allocated pages to HV_Fixed + * page state, fail if SNP is already initialized. + */ + if (sev->snp_initialized) + return NULL; + + /* Re-use freed pages that match the request */ + list_for_each_entry(entry, &snp_hv_fixed_pages, list) { + /* Hypervisor fixed page allocator implements exact fit policy */ + if (entry->order == order && entry->free) { + entry->free = false; + memset(page_address(entry->page), 0, + (1 << entry->order) * PAGE_SIZE); + return entry->page; + } + } + + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!page) + return NULL; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + __free_pages(page, order); + return NULL; + } + + entry->page = page; + entry->order = order; + list_add_tail(&entry->list, &snp_hv_fixed_pages); + + return page; +} + +void snp_free_hv_fixed_pages(struct page *page) +{ + struct psp_device *psp_master = psp_get_master_device(); + struct snp_hv_fixed_pages_entry *entry, *nentry; + + if (!psp_master || !psp_master->sev_data) + return; + + /* + * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list + * also needs to be protected using the same mutex. + */ + guard(mutex)(&sev_cmd_mutex); + + list_for_each_entry_safe(entry, nentry, &snp_hv_fixed_pages, list) { + if (entry->page != page) + continue; + + /* + * HV_FIXED page state cannot be changed until reboot + * and they cannot be used by an SNP guest, so they cannot + * be returned back to the page allocator. + * Mark the pages as free internally to allow possible re-use. 
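
As a quick cross-check of the allocation size used by snp_alloc_hv_fixed_pages() above, the order passed to alloc_pages() rounds the request up to a power-of-two number of pages. A minimal userspace sketch, assuming 4 KiB pages and a 2 MiB PMD (typical x86-64 values):

#include <stdio.h>

#define EX_PAGE_SHIFT	12
#define EX_PMD_SIZE	(2UL * 1024 * 1024)

/* Minimal stand-in for the kernel's get_order() under the assumptions above. */
static unsigned int ex_get_order(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> EX_PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	for (unsigned int n = 1; n <= 4; n++) {
		unsigned int order = ex_get_order(EX_PMD_SIZE * n);

		printf("%u x 2 MiB -> order %u (%lu pages)\n",
		       n, order, 1UL << order);
	}
	return 0;
}

Note how a three-chunk request rounds up to the same order as a four-chunk one, which is also why the free-list reuse above matches on the stored order rather than on the original byte count.
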
+ */ + if (entry->page_state == HV_FIXED) { + entry->free = true; + } else { + __free_pages(page, entry->order); + list_del(&entry->list); + kfree(entry); + } + return; + } +} + +static void snp_add_hv_fixed_pages(struct sev_device *sev, struct sev_data_range_list *range_list) +{ + struct snp_hv_fixed_pages_entry *entry; + struct sev_data_range *range; + int num_elements; + + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + num_elements = list_count_nodes(&snp_hv_fixed_pages) + + range_list->num_elements; + + /* + * Ensure the list of HV_FIXED pages that will be passed to firmware + * do not exceed the page-sized argument buffer. + */ + if (num_elements * sizeof(*range) + sizeof(*range_list) > PAGE_SIZE) { + dev_warn(sev->dev, "Additional HV_Fixed pages cannot be accommodated, omitting\n"); + return; + } + + range = &range_list->ranges[range_list->num_elements]; + list_for_each_entry(entry, &snp_hv_fixed_pages, list) { + range->base = page_to_pfn(entry->page) << PAGE_SHIFT; + range->page_count = 1 << entry->order; + range++; + } + range_list->num_elements = num_elements; +} + +static void snp_leak_hv_fixed_pages(void) +{ + struct snp_hv_fixed_pages_entry *entry; + + /* List is protected by sev_cmd_mutex */ + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + list_for_each_entry(entry, &snp_hv_fixed_pages, list) + if (entry->page_state == HV_FIXED) + __snp_leak_pages(page_to_pfn(entry->page), + 1 << entry->order, false); +} + static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg) { struct sev_data_range_list *range_list = arg; @@ -1163,6 +1337,12 @@ static int __sev_snp_init_locked(int *error) return rc; } + /* + * Add HV_Fixed pages from other PSP sub-devices, such as SFS to the + * HV_Fixed page list. + */ + snp_add_hv_fixed_pages(sev, snp_range_list); + memset(&data, 0, sizeof(data)); data.init_rmp = 1; data.list_paddr_en = 1; @@ -1202,6 +1382,7 @@ static int __sev_snp_init_locked(int *error) return rc; } + snp_hv_fixed_pages_state_update(sev, HV_FIXED); sev->snp_initialized = true; dev_dbg(sev->dev, "SEV-SNP firmware initialized\n"); @@ -1784,6 +1965,7 @@ static int __sev_snp_shutdown_locked(int *error, bool panic) return ret; } + snp_leak_hv_fixed_pages(); sev->snp_initialized = false; dev_dbg(sev->dev, "SEV-SNP firmware shutdown\n"); diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h index 3e4e5574e88a..28021abc85ad 100644 --- a/drivers/crypto/ccp/sev-dev.h +++ b/drivers/crypto/ccp/sev-dev.h @@ -65,4 +65,7 @@ void sev_dev_destroy(struct psp_device *psp); void sev_pci_init(void); void sev_pci_exit(void); +struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages); +void snp_free_hv_fixed_pages(struct page *page); + #endif /* __SEV_DEV_H */ diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c new file mode 100644 index 000000000000..2f4beaafe7ec --- /dev/null +++ b/drivers/crypto/ccp/sfs.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Processor Seamless Firmware Servicing support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ * + * Author: Ashish Kalra <ashish.kalra@amd.com> + */ + +#include <linux/firmware.h> + +#include "sfs.h" +#include "sev-dev.h" + +#define SFS_DEFAULT_TIMEOUT (10 * MSEC_PER_SEC) +#define SFS_MAX_PAYLOAD_SIZE (2 * 1024 * 1024) +#define SFS_NUM_2MB_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PMD_SIZE) +#define SFS_NUM_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE) + +static DEFINE_MUTEX(sfs_ioctl_mutex); + +static struct sfs_misc_dev *misc_dev; + +static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg) +{ + int ret; + + sfs_dev->command_buf->hdr.status = 0; + sfs_dev->command_buf->hdr.sub_cmd_id = msg; + + ret = psp_extended_mailbox_cmd(sfs_dev->psp, + SFS_DEFAULT_TIMEOUT, + (struct psp_ext_request *)sfs_dev->command_buf); + if (ret == -EIO) { + dev_dbg(sfs_dev->dev, + "msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n", + msg, sfs_dev->command_buf->hdr.status, + *(u32 *)sfs_dev->command_buf->buf); + } + + return ret; +} + +static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev) +{ + /* + * SFS_GET_FW_VERSIONS command needs the output buffer to be + * initialized to 0xC7 in every byte. + */ + memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE); + sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE; + + return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS); +} + +static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name) +{ + char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")]; + const struct firmware *firmware; + unsigned long package_size; + int ret; + + /* Sanitize userspace provided payload name */ + if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0')) + return -EINVAL; + + snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name); + + ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev); + if (ret < 0) { + dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n", + payload_path, ret); + return -ENOENT; + } + + /* + * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER + * followed by the Update Package and it should be 64KB aligned. + */ + package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U); + + /* + * SFS command buffer is a pre-allocated 2MB buffer, fail update package + * if SFS payload is larger than the pre-allocated command buffer. + */ + if (package_size > SFS_MAX_PAYLOAD_SIZE) { + dev_warn_ratelimited(sfs_dev->dev, + "SFS payload size %ld larger than maximum supported payload size of %u\n", + package_size, SFS_MAX_PAYLOAD_SIZE); + release_firmware(firmware); + return -E2BIG; + } + + /* + * Copy firmware data to a HV_Fixed memory region. 
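
The sizing above, package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000), prepends the one-page TEE_EXT_CMD_BUFFER header and rounds the total up to 64 KiB before checking it against the 2 MiB command buffer. A small sketch with a made-up firmware size, assuming 4 KiB pages:

#include <stdio.h>

#define EX_PAGE_SIZE	4096UL
#define EX_ALIGN_64K	0x10000UL
#define EX_MAX_PAYLOAD	(2UL * 1024 * 1024)	/* SFS_MAX_PAYLOAD_SIZE */
#define EX_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long fw_size = 150000;	/* hypothetical SFS package size in bytes */
	unsigned long pkg = EX_ALIGN(fw_size + EX_PAGE_SIZE, EX_ALIGN_64K);

	printf("package_size = %lu bytes (%lu x 64 KiB), fits in 2 MiB: %s\n",
	       pkg, pkg / EX_ALIGN_64K, pkg <= EX_MAX_PAYLOAD ? "yes" : "no");
	/* The 2 MiB command buffer therefore caps the firmware image itself at: */
	printf("largest firmware that still fits: %lu bytes\n",
	       EX_MAX_PAYLOAD - EX_PAGE_SIZE);
	return 0;
}
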
+ */ + memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size); + sfs_dev->command_buf->hdr.payload_size = package_size; + + release_firmware(firmware); + + return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE); +} + +static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct sfs_user_get_fw_versions __user *sfs_get_fw_versions; + struct sfs_user_update_package __user *sfs_update_package; + struct psp_device *psp_master = psp_get_master_device(); + char payload_name[PAYLOAD_NAME_SIZE]; + struct sfs_device *sfs_dev; + int ret = 0; + + if (!psp_master || !psp_master->sfs_data) + return -ENODEV; + + sfs_dev = psp_master->sfs_data; + + guard(mutex)(&sfs_ioctl_mutex); + + switch (cmd) { + case SFSIOCFWVERS: + dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n"); + + sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg; + + ret = send_sfs_get_fw_versions(sfs_dev); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer, + PAGE_SIZE)) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_get_fw_versions->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_get_fw_versions->sfs_extended_status))) + return -EFAULT; + break; + case SFSIOCUPDATEPKG: + dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n"); + + sfs_update_package = (struct sfs_user_update_package __user *)arg; + + if (copy_from_user(payload_name, sfs_update_package->payload_name, + PAYLOAD_NAME_SIZE)) + return -EFAULT; + + ret = send_sfs_update_package(sfs_dev, payload_name); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_update_package->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_update_package->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_update_package->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_update_package->sfs_extended_status))) + return -EFAULT; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static const struct file_operations sfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sfs_ioctl, +}; + +static void sfs_exit(struct kref *ref) +{ + misc_deregister(&misc_dev->misc); + kfree(misc_dev); + misc_dev = NULL; +} + +void sfs_dev_destroy(struct psp_device *psp) +{ + struct sfs_device *sfs_dev = psp->sfs_data; + + if (!sfs_dev) + return; + + /* + * Change SFS command buffer back to the default "Write-Back" type. + */ + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + + snp_free_hv_fixed_pages(sfs_dev->page); + + if (sfs_dev->misc) + kref_put(&misc_dev->refcount, sfs_exit); + + psp->sfs_data = NULL; +} + +/* Based on sev_misc_init() */ +static int sfs_misc_init(struct sfs_device *sfs) +{ + struct device *dev = sfs->dev; + int ret; + + /* + * SFS feature support can be detected on multiple devices but the SFS + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sfs on the first device probe. 
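
From userspace, the resulting /dev/sfs node is driven through the two ioctls handled by sfs_ioctl() above. A hedged usage sketch for the firmware-versions query, assuming the UAPI header is installed as <linux/psp-sfs.h> and exports SFSIOCFWVERS and struct sfs_user_get_fw_versions with the members referenced by the handler:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/psp-sfs.h>	/* assumed install path of uapi/linux/psp-sfs.h */

int main(void)
{
	struct sfs_user_get_fw_versions vers = { 0 };
	int fd = open("/dev/sfs", O_RDWR);

	if (fd < 0) {
		perror("open /dev/sfs");
		return 1;
	}
	/* On return the kernel fills vers.blob plus the SFS status words. */
	if (ioctl(fd, SFSIOCFWVERS, &vers) < 0) {
		perror("SFSIOCFWVERS");
		close(fd);
		return 1;
	}
	printf("sfs_status 0x%x, extended 0x%x\n",
	       (unsigned int)vers.sfs_status,
	       (unsigned int)vers.sfs_extended_status);
	close(fd);
	return 0;
}
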
+ */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = "sfs"; + misc->fops = &sfs_fops; + misc->mode = 0600; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + sfs->misc = misc_dev; + dev_dbg(dev, "registered SFS device\n"); + + return 0; +} + +int sfs_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct sfs_device *sfs_dev; + struct page *page; + int ret = -ENOMEM; + + sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL); + if (!sfs_dev) + return -ENOMEM; + + /* + * Pre-allocate 2MB command buffer for all SFS commands using + * SNP HV_Fixed page allocator which also transitions the + * SFS command buffer to HV_Fixed page state if SNP is enabled. + */ + page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF); + if (!page) { + dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n"); + goto cleanup_dev; + } + sfs_dev->page = page; + sfs_dev->command_buf = page_address(page); + + dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf); + + /* + * SFS command buffer must be mapped as non-cacheable. + */ + ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + if (ret) { + dev_dbg(dev, "Set memory uc failed\n"); + goto cleanup_cmd_buf; + } + + dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf); + + psp->sfs_data = sfs_dev; + sfs_dev->dev = dev; + sfs_dev->psp = psp; + + ret = sfs_misc_init(sfs_dev); + if (ret) + goto cleanup_mem_attr; + + dev_notice(sfs_dev->dev, "SFS support is available\n"); + + return 0; + +cleanup_mem_attr: + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + +cleanup_cmd_buf: + snp_free_hv_fixed_pages(page); + +cleanup_dev: + psp->sfs_data = NULL; + devm_kfree(dev, sfs_dev); + + return ret; +} diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h new file mode 100644 index 000000000000..97704c210efd --- /dev/null +++ b/drivers/crypto/ccp/sfs.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra <ashish.kalra@amd.com> + */ + +#ifndef __SFS_H__ +#define __SFS_H__ + +#include <uapi/linux/psp-sfs.h> + +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/psp-sev.h> +#include <linux/psp-platform-access.h> +#include <linux/set_memory.h> + +#include "psp-dev.h" + +struct sfs_misc_dev { + struct kref refcount; + struct miscdevice misc; +}; + +struct sfs_command { + struct psp_ext_req_buffer_hdr hdr; + u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)]; + u8 sfs_buffer[]; +} __packed; + +struct sfs_device { + struct device *dev; + struct psp_device *psp; + + struct page *page; + struct sfs_command *command_buf; + + struct sfs_misc_dev *misc; +}; + +void sfs_dev_destroy(struct psp_device *psp); +int sfs_dev_init(struct psp_device *psp); + +#endif /* __SFS_H__ */ diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 19ad3c3b675d..39352b9b7a7e 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -576,4 +576,20 @@ config EDAC_LOONGSON errors (CE) only. Loongson-3A5000/3C5000/3D5000/3A6000/3C6000 are compatible. 
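
Both EDAC controller entries added below are ordinary tristate options; an illustrative .config fragment (their stated dependencies, ARM64 for the Cortex-A72 driver and CDX_CONTROLLER plus ARCH_ZYNQMP for VersalNET, still have to be satisfied):

CONFIG_EDAC=y
CONFIG_EDAC_CORTEX_A72=m
CONFIG_EDAC_VERSALNET=m
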
+config EDAC_CORTEX_A72 + tristate "ARM Cortex A72" + depends on ARM64 + help + Support for L1/L2 cache error detection for ARM Cortex A72 processor. + The detected and reported errors are from reading CPU/L2 memory error + syndrome registers. + +config EDAC_VERSALNET + tristate "AMD VersalNET DDR Controller" + depends on CDX_CONTROLLER && ARCH_ZYNQMP + help + Support for single bit error correction, double bit error detection + and other system errors from various IP subsystems like RPU, NOCs, + HNICX, PL on the AMD Versal NET DDR memory controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index a8f2d8f6c894..1c14796410a3 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -88,3 +88,5 @@ obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o obj-$(CONFIG_EDAC_LOONGSON) += loongson_edac.o +obj-$(CONFIG_EDAC_VERSALNET) += versalnet_edac.o +obj-$(CONFIG_EDAC_CORTEX_A72) += a72_edac.o diff --git a/drivers/edac/a72_edac.c b/drivers/edac/a72_edac.c new file mode 100644 index 000000000000..9262d75c3855 --- /dev/null +++ b/drivers/edac/a72_edac.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cortex A72 EDAC L1 and L2 cache error detection + * + * Copyright (c) 2020 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de> + * Copyright (c) 2025 Microsoft Corporation, <vijayb@linux.microsoft.com> + * + * Based on Code from: + * Copyright (c) 2018, NXP Semiconductor + * Author: York Sun <york.sun@nxp.com> + */ + +#include <linux/module.h> +#include <linux/of.h> +#include <linux/bitfield.h> +#include <asm/smp_plat.h> + +#include "edac_module.h" + +#define DRVNAME "a72-edac" + +#define SYS_CPUMERRSR_EL1 sys_reg(3, 1, 15, 2, 2) +#define SYS_L2MERRSR_EL1 sys_reg(3, 1, 15, 2, 3) + +#define CPUMERRSR_EL1_RAMID GENMASK(30, 24) +#define L2MERRSR_EL1_CPUID_WAY GENMASK(21, 18) + +#define CPUMERRSR_EL1_VALID BIT(31) +#define CPUMERRSR_EL1_FATAL BIT(63) +#define L2MERRSR_EL1_VALID BIT(31) +#define L2MERRSR_EL1_FATAL BIT(63) + +#define L1_I_TAG_RAM 0x00 +#define L1_I_DATA_RAM 0x01 +#define L1_D_TAG_RAM 0x08 +#define L1_D_DATA_RAM 0x09 +#define TLB_RAM 0x18 + +#define MESSAGE_SIZE 64 + +struct mem_err_synd_reg { + u64 cpu_mesr; + u64 l2_mesr; +}; + +static struct cpumask compat_mask; + +static void report_errors(struct edac_device_ctl_info *edac_ctl, int cpu, + struct mem_err_synd_reg *mesr) +{ + u64 cpu_mesr = mesr->cpu_mesr; + u64 l2_mesr = mesr->l2_mesr; + char msg[MESSAGE_SIZE]; + + if (cpu_mesr & CPUMERRSR_EL1_VALID) { + const char *str; + bool fatal = cpu_mesr & CPUMERRSR_EL1_FATAL; + + switch (FIELD_GET(CPUMERRSR_EL1_RAMID, cpu_mesr)) { + case L1_I_TAG_RAM: + str = "L1-I Tag RAM"; + break; + case L1_I_DATA_RAM: + str = "L1-I Data RAM"; + break; + case L1_D_TAG_RAM: + str = "L1-D Tag RAM"; + break; + case L1_D_DATA_RAM: + str = "L1-D Data RAM"; + break; + case TLB_RAM: + str = "TLB RAM"; + break; + default: + str = "Unspecified"; + break; + } + + snprintf(msg, MESSAGE_SIZE, "%s %s error(s) on CPU %d", + str, fatal ? "fatal" : "correctable", cpu); + + if (fatal) + edac_device_handle_ue(edac_ctl, cpu, 0, msg); + else + edac_device_handle_ce(edac_ctl, cpu, 0, msg); + } + + if (l2_mesr & L2MERRSR_EL1_VALID) { + bool fatal = l2_mesr & L2MERRSR_EL1_FATAL; + + snprintf(msg, MESSAGE_SIZE, "L2 %s error(s) on CPU %d CPUID/WAY 0x%lx", + fatal ? 
"fatal" : "correctable", cpu, + FIELD_GET(L2MERRSR_EL1_CPUID_WAY, l2_mesr)); + if (fatal) + edac_device_handle_ue(edac_ctl, cpu, 1, msg); + else + edac_device_handle_ce(edac_ctl, cpu, 1, msg); + } +} + +static void read_errors(void *data) +{ + struct mem_err_synd_reg *mesr = data; + + mesr->cpu_mesr = read_sysreg_s(SYS_CPUMERRSR_EL1); + if (mesr->cpu_mesr & CPUMERRSR_EL1_VALID) { + write_sysreg_s(0, SYS_CPUMERRSR_EL1); + isb(); + } + mesr->l2_mesr = read_sysreg_s(SYS_L2MERRSR_EL1); + if (mesr->l2_mesr & L2MERRSR_EL1_VALID) { + write_sysreg_s(0, SYS_L2MERRSR_EL1); + isb(); + } +} + +static void a72_edac_check(struct edac_device_ctl_info *edac_ctl) +{ + struct mem_err_synd_reg mesr; + int cpu; + + cpus_read_lock(); + for_each_cpu_and(cpu, cpu_online_mask, &compat_mask) { + smp_call_function_single(cpu, read_errors, &mesr, true); + report_errors(edac_ctl, cpu, &mesr); + } + cpus_read_unlock(); +} + +static int a72_edac_probe(struct platform_device *pdev) +{ + struct edac_device_ctl_info *edac_ctl; + struct device *dev = &pdev->dev; + int rc; + + edac_ctl = edac_device_alloc_ctl_info(0, "cpu", + num_possible_cpus(), "L", 2, 1, + edac_device_alloc_index()); + if (!edac_ctl) + return -ENOMEM; + + edac_ctl->edac_check = a72_edac_check; + edac_ctl->dev = dev; + edac_ctl->mod_name = dev_name(dev); + edac_ctl->dev_name = dev_name(dev); + edac_ctl->ctl_name = DRVNAME; + dev_set_drvdata(dev, edac_ctl); + + rc = edac_device_add_device(edac_ctl); + if (rc) + goto out_dev; + + return 0; + +out_dev: + edac_device_free_ctl_info(edac_ctl); + + return rc; +} + +static void a72_edac_remove(struct platform_device *pdev) +{ + struct edac_device_ctl_info *edac_ctl = dev_get_drvdata(&pdev->dev); + + edac_device_del_device(edac_ctl->dev); + edac_device_free_ctl_info(edac_ctl); +} + +static const struct of_device_id cortex_arm64_edac_of_match[] = { + { .compatible = "arm,cortex-a72" }, + {} +}; +MODULE_DEVICE_TABLE(of, cortex_arm64_edac_of_match); + +static struct platform_driver a72_edac_driver = { + .probe = a72_edac_probe, + .remove = a72_edac_remove, + .driver = { + .name = DRVNAME, + }, +}; + +static struct platform_device *a72_pdev; + +static int __init a72_edac_driver_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device_node *np __free(device_node) = of_cpu_device_node_get(cpu); + if (np) { + if (of_match_node(cortex_arm64_edac_of_match, np) && + of_property_read_bool(np, "edac-enabled")) { + cpumask_set_cpu(cpu, &compat_mask); + } + } else { + pr_warn("failed to find device node for CPU %d\n", cpu); + } + } + + if (cpumask_empty(&compat_mask)) + return 0; + + a72_pdev = platform_device_register_simple(DRVNAME, -1, NULL, 0); + if (IS_ERR(a72_pdev)) { + pr_err("failed to register A72 EDAC device\n"); + return PTR_ERR(a72_pdev); + } + + return platform_driver_register(&a72_edac_driver); +} + +static void __exit a72_edac_driver_exit(void) +{ + platform_device_unregister(a72_pdev); + platform_driver_unregister(&a72_edac_driver); +} + +module_init(a72_edac_driver_init); +module_exit(a72_edac_driver_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>"); +MODULE_DESCRIPTION("Cortex A72 L1 and L2 cache EDAC driver"); diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 7685a8550d4b..103b2c2eba2a 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -2130,8 +2130,8 @@ static int altr_edac_a10_probe(struct platform_device *pdev) edac->irq_chip.name = pdev->dev.of_node->name; edac->irq_chip.irq_mask = 
a10_eccmgr_irq_mask; edac->irq_chip.irq_unmask = a10_eccmgr_irq_unmask; - edac->domain = irq_domain_create_linear(of_fwnode_handle(pdev->dev.of_node), - 64, &a10_eccmgr_ic_ops, edac); + edac->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), 64, &a10_eccmgr_ic_ops, + edac); if (!edac->domain) { dev_err(&pdev->dev, "Error adding IRQ domain\n"); return -ENOMEM; diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 07f1e9dc1ca7..2f6ab783bf20 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3923,6 +3923,26 @@ static int per_family_init(struct amd64_pvt *pvt) pvt->ctl_name = "F1Ah_M40h"; pvt->flags.zn_regs_v2 = 1; break; + case 0x50 ... 0x57: + pvt->ctl_name = "F1Ah_M50h"; + pvt->max_mcs = 16; + pvt->flags.zn_regs_v2 = 1; + break; + case 0x90 ... 0x9f: + pvt->ctl_name = "F1Ah_M90h"; + pvt->max_mcs = 8; + pvt->flags.zn_regs_v2 = 1; + break; + case 0xa0 ... 0xaf: + pvt->ctl_name = "F1Ah_MA0h"; + pvt->max_mcs = 8; + pvt->flags.zn_regs_v2 = 1; + break; + case 0xc0 ... 0xc7: + pvt->ctl_name = "F1Ah_MC0h"; + pvt->max_mcs = 16; + pvt->flags.zn_regs_v2 = 1; + break; } break; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 17228d07de4c..d70b8a8d0b09 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -96,7 +96,7 @@ /* Hardware limit on ChipSelect rows per MC and processors per system */ #define NUM_CHIPSELECTS 8 #define DRAM_RANGES 8 -#define NUM_CONTROLLERS 12 +#define NUM_CONTROLLERS 16 #define ON true #define OFF false diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c index 51c451c7f0f0..51c451c7f0f0 100755..100644 --- a/drivers/edac/ecs.c +++ b/drivers/edac/ecs.c diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 0f338adf7d93..8689631f1905 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 10); DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 11); +DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 12); +DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 13); +DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 14); +DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 15); /* Total possible dynamic DIMM Label attribute file table */ static struct attribute *dynamic_csrow_dimm_attr[] = { @@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { &dev_attr_legacy_ch9_dimm_label.attr.attr, &dev_attr_legacy_ch10_dimm_label.attr.attr, &dev_attr_legacy_ch11_dimm_label.attr.attr, + &dev_attr_legacy_ch12_dimm_label.attr.attr, + &dev_attr_legacy_ch13_dimm_label.attr.attr, + &dev_attr_legacy_ch14_dimm_label.attr.attr, + &dev_attr_legacy_ch15_dimm_label.attr.attr, NULL }; @@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, channel_ce_count_show, NULL, 10); DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, channel_ce_count_show, NULL, 11); +DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 12); +DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 13); +DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 14); +DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 15); /* Total possible dynamic 
ce_count attribute file table */ static struct attribute *dynamic_csrow_ce_count_attr[] = { @@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { &dev_attr_legacy_ch9_ce_count.attr.attr, &dev_attr_legacy_ch10_ce_count.attr.attr, &dev_attr_legacy_ch11_ce_count.attr.attr, + &dev_attr_legacy_ch12_ce_count.attr.attr, + &dev_attr_legacy_ch13_ce_count.attr.attr, + &dev_attr_legacy_ch14_ce_count.attr.attr, + &dev_attr_legacy_ch15_ce_count.attr.attr, NULL }; diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index bf4171ac191d..2010a47149f4 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -468,17 +468,18 @@ static int i10nm_get_imc_num(struct res_config *cfg) return -ENODEV; } - if (imc_num > I10NM_NUM_DDR_IMC) { - i10nm_printk(KERN_ERR, "Need to make I10NM_NUM_DDR_IMC >= %d\n", imc_num); - return -EINVAL; - } - if (cfg->ddr_imc_num != imc_num) { /* - * Store the number of present DDR memory controllers. + * Update the configuration data to reflect the number of + * present DDR memory controllers. */ cfg->ddr_imc_num = imc_num; edac_dbg(2, "Set DDR MC number: %d", imc_num); + + /* Release and reallocate skx_dev list with the updated number. */ + skx_remove(); + if (skx_get_all_bus_mappings(cfg, &i10nm_edac_list) <= 0) + return -ENODEV; } return 0; @@ -1057,6 +1058,15 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } +static bool i10nm_channel_disabled(struct skx_imc *imc, int chan) +{ + u32 mcmtr = I10NM_GET_MCMTR(imc, chan); + + edac_dbg(1, "mc%d ch%d mcmtr reg %x\n", imc->mc, chan, mcmtr); + + return (mcmtr == ~0 || GET_BITFIELD(mcmtr, 18, 18)); +} + static int i10nm_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { @@ -1070,6 +1080,11 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, if (!imc->mbase) continue; + if (i10nm_channel_disabled(imc, i)) { + edac_dbg(1, "mc%d ch%d is disabled.\n", imc->mc, i); + continue; + } + ndimms = 0; if (res_cfg->type != GNR) diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index 5c1fa1c0d12e..5a080ab65476 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -99,6 +99,8 @@ /* Alder Lake-S */ #define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1 0x4660 +#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2 0x4668 /* 8P+4E, e.g. i7-12700K */ +#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3 0x4648 /* 6P+4E, e.g. 
i5-12600K */ /* Bartlett Lake-S */ #define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1 0x4639 @@ -761,6 +763,8 @@ static const struct pci_device_id ie31200_pci_tbl[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3), (kernel_ulong_t)&rpl_s_cfg}, diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c index 108d69209146..108d69209146 100755..100644 --- a/drivers/edac/mem_repair.c +++ b/drivers/edac/mem_repair.c diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c index f9d02af2fc3a..f9d02af2fc3a 100755..100644 --- a/drivers/edac/scrub.c +++ b/drivers/edac/scrub.c diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 29897b21fb8e..078ddf95cc6e 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -33,6 +33,15 @@ static unsigned int nvdimm_count; #define MASK26 0x3FFFFFF /* Mask for 2^26 */ #define MASK29 0x1FFFFFFF /* Mask for 2^29 */ +static struct res_config skx_cfg = { + .type = SKX, + .decs_did = 0x2016, + .busno_cfg_offset = 0xcc, + .ddr_imc_num = 2, + .ddr_chan_num = 3, + .ddr_dimm_num = 2, +}; + static struct skx_dev *get_skx_dev(struct pci_bus *bus, u8 idx) { struct skx_dev *d; @@ -52,7 +61,7 @@ enum munittype { struct munit { u16 did; - u16 devfn[SKX_NUM_IMC]; + u16 devfn[2]; u8 busidx; u8 per_socket; enum munittype mtype; @@ -89,11 +98,11 @@ static int get_all_munits(const struct munit *m) if (!pdev) break; ndev++; - if (m->per_socket == SKX_NUM_IMC) { - for (i = 0; i < SKX_NUM_IMC; i++) + if (m->per_socket == skx_cfg.ddr_imc_num) { + for (i = 0; i < skx_cfg.ddr_imc_num; i++) if (m->devfn[i] == pdev->devfn) break; - if (i == SKX_NUM_IMC) + if (i == skx_cfg.ddr_imc_num) goto fail; } d = get_skx_dev(pdev->bus, m->busidx); @@ -157,12 +166,6 @@ fail: return -ENODEV; } -static struct res_config skx_cfg = { - .type = SKX, - .decs_did = 0x2016, - .busno_cfg_offset = 0xcc, -}; - static const struct x86_cpu_id skx_cpuids[] = { X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_cfg), { } @@ -186,11 +189,11 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) /* Only the mcmtr on the first channel is effective */ pci_read_config_dword(imc->chan[0].cdev, 0x87c, &mcmtr); - for (i = 0; i < SKX_NUM_CHANNELS; i++) { + for (i = 0; i < cfg->ddr_chan_num; i++) { ndimms = 0; pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap); pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg); - for (j = 0; j < SKX_NUM_DIMMS; j++) { + for (j = 0; j < cfg->ddr_dimm_num; j++) { dimm = edac_get_dimm(mci, i, j, 0); pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); @@ -620,6 +623,7 @@ static int __init skx_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + skx_set_res_cfg(cfg); rc = skx_get_hi_lo(0x2034, off, &skx_tolm, &skx_tohm); if (rc) @@ -652,10 +656,13 @@ static int __init skx_init(void) goto fail; edac_dbg(2, "src_id = %d\n", src_id); - for (i = 0; i < SKX_NUM_IMC; i++) { + for (i = 0; i < 
cfg->ddr_imc_num; i++) { d->imc[i].mc = mc++; d->imc[i].lmc = i; d->imc[i].src_id = src_id; + d->imc[i].num_channels = cfg->ddr_chan_num; + d->imc[i].num_dimms = cfg->ddr_dimm_num; + rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, skx_get_dimm_config, cfg); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 39c733dbc5b9..724842f512ac 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -14,9 +14,11 @@ * Copyright (c) 2018, Intel Corporation. */ +#include <linux/topology.h> #include <linux/acpi.h> #include <linux/dmi.h> #include <linux/adxl.h> +#include <linux/overflow.h> #include <acpi/nfit.h> #include <asm/mce.h> #include <asm/uv/uv.h> @@ -130,8 +132,8 @@ static void skx_init_mc_mapping(struct skx_dev *d) * the logical indices of the memory controllers enumerated by the * EDAC driver. */ - for (int i = 0; i < NUM_IMC; i++) - d->mc_mapping[i] = i; + for (int i = 0; i < d->num_imc; i++) + d->imc[i].mc_mapping = i; } void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) @@ -139,22 +141,28 @@ void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n", pmc, lmc); - d->mc_mapping[pmc] = lmc; + d->imc[lmc].mc_mapping = pmc; } EXPORT_SYMBOL_GPL(skx_set_mc_mapping); -static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc) +static int skx_get_mc_mapping(struct skx_dev *d, u8 pmc) { - edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", - pmc, d->mc_mapping[pmc]); + for (int lmc = 0; lmc < d->num_imc; lmc++) { + if (d->imc[lmc].mc_mapping == pmc) { + edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", + pmc, lmc); - return d->mc_mapping[pmc]; + return lmc; + } + } + + return -1; } static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) { + int i, lmc, len = 0; struct skx_dev *d; - int i, len = 0; if (res->addr >= skx_tohm || (res->addr >= skx_tolm && res->addr < BIT_ULL(32))) { @@ -200,7 +208,7 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) res->cs = (int)adxl_values[component_indices[INDEX_CS]]; } - if (res->imc > NUM_IMC - 1 || res->imc < 0) { + if (res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -218,7 +226,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) return false; } - res->imc = skx_get_mc_mapping(d, res->imc); + lmc = skx_get_mc_mapping(d, res->imc); + if (lmc < 0) { + skx_printk(KERN_ERR, "No lmc for imc %d\n", res->imc); + return false; + } + + res->imc = lmc; for (i = 0; i < adxl_component_count; i++) { if (adxl_values[i] == ~0x0ull) @@ -265,7 +279,7 @@ static int skx_get_pkg_id(struct skx_dev *d, u8 *id) struct cpuinfo_x86 *c = &cpu_data(cpu); if (c->initialized && cpu_to_node(cpu) == node) { - *id = c->topo.pkg_id; + *id = topology_physical_package_id(cpu); return 0; } } @@ -320,10 +334,10 @@ static int get_width(u32 mtr) */ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) { + int ndev = 0, imc_num = cfg->ddr_imc_num + cfg->hbm_imc_num; struct pci_dev *pdev, *prev; struct skx_dev *d; u32 reg; - int ndev = 0; prev = NULL; for (;;) { @@ -331,7 +345,7 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) if (!pdev) break; ndev++; - d = kzalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc(struct_size(d, imc, imc_num), GFP_KERNEL); if (!d) { pci_dev_put(pdev); return -ENOMEM; @@ -354,8 +368,10 @@ int 
skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) d->seg = GET_BITFIELD(reg, 16, 23); } - edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n", - d->bus[0], d->bus[1], d->bus[2], d->bus[3]); + d->num_imc = imc_num; + + edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x, imcs %d\n", + d->bus[0], d->bus[1], d->bus[2], d->bus[3], imc_num); list_add_tail(&d->list, &dev_edac_list); prev = pdev; @@ -541,10 +557,10 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, /* Allocate a new MC control structure */ layers[0].type = EDAC_MC_LAYER_CHANNEL; - layers[0].size = NUM_CHANNELS; + layers[0].size = imc->num_channels; layers[0].is_virt_csrow = false; layers[1].type = EDAC_MC_LAYER_SLOT; - layers[1].size = NUM_DIMMS; + layers[1].size = imc->num_dimms; layers[1].is_virt_csrow = true; mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers, sizeof(struct skx_pvt)); @@ -784,7 +800,7 @@ void skx_remove(void) list_for_each_entry_safe(d, tmp, &dev_edac_list, list) { list_del(&d->list); - for (i = 0; i < NUM_IMC; i++) { + for (i = 0; i < d->num_imc; i++) { if (d->imc[i].mci) skx_unregister_mci(&d->imc[i]); @@ -794,7 +810,7 @@ void skx_remove(void) if (d->imc[i].mbase) iounmap(d->imc[i].mbase); - for (j = 0; j < NUM_CHANNELS; j++) { + for (j = 0; j < d->imc[i].num_channels; j++) { if (d->imc[i].chan[j].cdev) pci_dev_put(d->imc[i].chan[j].cdev); } diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index ec4966f7ea40..73ba89786cdf 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -29,23 +29,18 @@ #define GET_BITFIELD(v, lo, hi) \ (((v) & GENMASK_ULL((hi), (lo))) >> (lo)) -#define SKX_NUM_IMC 2 /* Memory controllers per socket */ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_DDR_IMC 12 #define I10NM_NUM_DDR_CHANNELS 2 #define I10NM_NUM_DDR_DIMMS 2 -#define I10NM_NUM_HBM_IMC 16 #define I10NM_NUM_HBM_CHANNELS 2 #define I10NM_NUM_HBM_DIMMS 1 -#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) #define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) #define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) -#define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) #define NUM_CHANNELS MAX(SKX_NUM_CHANNELS, I10NM_NUM_CHANNELS) #define NUM_DIMMS MAX(SKX_NUM_DIMMS, I10NM_NUM_DIMMS) @@ -134,16 +129,7 @@ struct skx_dev { struct pci_dev *uracu; /* for i10nm CPU */ struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; - /* - * Some server BIOS may hide certain memory controllers, and the - * EDAC driver skips those hidden memory controllers. However, the - * ADXL still decodes memory error address using physical memory - * controller indices. The mapping table is used to convert the - * physical indices (reported by ADXL) to the logical indices - * (used the EDAC driver) of present memory controllers during the - * error handling process. - */ - u8 mc_mapping[NUM_IMC]; + int num_imc; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ @@ -155,6 +141,16 @@ struct skx_dev { u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id; + /* + * Some server BIOS may hide certain memory controllers, and the + * EDAC driver skips those hidden memory controllers. However, the + * ADXL still decodes memory error address using physical memory + * controller indices. 
The mapping table is used to convert the + * physical indices (reported by ADXL) to the logical indices + * (used the EDAC driver) of present memory controllers during the + * error handling process. + */ + u8 mc_mapping; struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; @@ -171,7 +167,7 @@ struct skx_dev { u8 colbits; } dimms[NUM_DIMMS]; } chan[NUM_CHANNELS]; - } imc[NUM_IMC]; + } imc[]; }; struct skx_pvt { diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c new file mode 100644 index 000000000000..7c5db8bf0595 --- /dev/null +++ b/drivers/edac/versalnet_edac.c @@ -0,0 +1,960 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Versal NET memory controller driver + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include <linux/cdx/edac_cdx_pcol.h> +#include <linux/edac.h> +#include <linux/module.h> +#include <linux/of_device.h> +#include <linux/ras.h> +#include <linux/remoteproc.h> +#include <linux/rpmsg.h> +#include <linux/sizes.h> +#include <ras/ras_event.h> + +#include "edac_module.h" + +/* Granularity of reported error in bytes */ +#define MC5_ERR_GRAIN 1 +#define MC_GET_DDR_CONFIG_IN_LEN 4 + +#define MC5_IRQ_CE_MASK GENMASK(18, 15) +#define MC5_IRQ_UE_MASK GENMASK(14, 11) + +#define MC5_RANK_1_MASK GENMASK(11, 6) +#define MASK_24 GENMASK(29, 24) +#define MASK_0 GENMASK(5, 0) + +#define MC5_LRANK_1_MASK GENMASK(11, 6) +#define MC5_LRANK_2_MASK GENMASK(17, 12) +#define MC5_BANK1_MASK GENMASK(11, 6) +#define MC5_GRP_0_MASK GENMASK(17, 12) +#define MC5_GRP_1_MASK GENMASK(23, 18) + +#define MC5_REGHI_ROW 7 +#define MC5_EACHBIT 1 +#define MC5_ERR_TYPE_CE 0 +#define MC5_ERR_TYPE_UE 1 +#define MC5_HIGH_MEM_EN BIT(20) +#define MC5_MEM_MASK GENMASK(19, 0) +#define MC5_X16_BASE 256 +#define MC5_X16_ECC 32 +#define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC) +#define MC5_X32_SIZE 576 +#define MC5_HIMEM_BASE (256 * SZ_1M) +#define MC5_ILC_HIMEM_EN BIT(28) +#define MC5_ILC_MEM GENMASK(27, 0) +#define MC5_INTERLEAVE_SEL GENMASK(3, 0) +#define MC5_BUS_WIDTH_MASK GENMASK(19, 18) +#define MC5_NUM_CHANS_MASK BIT(17) +#define MC5_RANK_MASK GENMASK(15, 14) + +#define ERROR_LEVEL 2 +#define ERROR_ID 3 +#define TOTAL_ERR_LENGTH 5 +#define MSG_ERR_OFFSET 8 +#define MSG_ERR_LENGTH 9 +#define ERROR_DATA 10 +#define MCDI_RESPONSE 0xFF + +#define REG_MAX 152 +#define ADEC_MAX 152 +#define NUM_CONTROLLERS 8 +#define REGS_PER_CONTROLLER 19 +#define ADEC_NUM 19 +#define BUFFER_SZ 80 + +#define XDDR5_BUS_WIDTH_64 0 +#define XDDR5_BUS_WIDTH_32 1 +#define XDDR5_BUS_WIDTH_16 2 + +/** + * struct ecc_error_info - ECC error log information. + * @burstpos: Burst position. + * @lrank: Logical Rank number. + * @rank: Rank number. + * @group: Group number. + * @bank: Bank number. + * @col: Column number. + * @row: Row number. + * @rowhi: Row number higher bits. + * @i: Combined ECC error vector containing encoded values of burst position, + * rank, bank, column, and row information. + */ +union ecc_error_info { + struct { + u32 burstpos:3; + u32 lrank:4; + u32 rank:2; + u32 group:3; + u32 bank:2; + u32 col:11; + u32 row:7; + u32 rowhi; + }; + u64 i; +} __packed; + +/* Row and column bit positions in the address decoder (ADEC) registers. */ +union row_col_mapping { + struct { + u32 row0:6; + u32 row1:6; + u32 row2:6; + u32 row3:6; + u32 row4:6; + u32 reserved:2; + }; + struct { + u32 col1:6; + u32 col2:6; + u32 col3:6; + u32 col4:6; + u32 col5:6; + u32 reservedcol:2; + }; + u32 i; +} __packed; + +/** + * struct ecc_status - ECC status information to report. 
+ * @ceinfo: Correctable errors. + * @ueinfo: Uncorrected errors. + * @channel: Channel number. + * @error_type: Error type. + */ +struct ecc_status { + union ecc_error_info ceinfo[2]; + union ecc_error_info ueinfo[2]; + u8 channel; + u8 error_type; +}; + +/** + * struct mc_priv - DDR memory controller private instance data. + * @message: Buffer for framing the event specific info. + * @stat: ECC status information. + * @error_id: The error id. + * @error_level: The error level. + * @dwidth: Width of data bus excluding ECC bits. + * @part_len: Length of the message data received so far; used to reassemble messages that arrive in parts. + * @regs: The registers sent on the rpmsg. + * @adec: Address decode registers. + * @mci: Memory controller interface. + * @ept: rpmsg endpoint. + * @mcdi: The mcdi handle. + */ +struct mc_priv { + char message[256]; + struct ecc_status stat; + u32 error_id; + u32 error_level; + u32 dwidth; + u32 part_len; + u32 regs[REG_MAX]; + u32 adec[ADEC_MAX]; + struct mem_ctl_info *mci[NUM_CONTROLLERS]; + struct rpmsg_endpoint *ept; + struct cdx_mcdi *mcdi; +}; + +/* + * Address decoder (ADEC) registers to match the order in which the register + * information is received from the firmware. + */ +enum adec_info { + CONF = 0, + ADEC0, + ADEC1, + ADEC2, + ADEC3, + ADEC4, + ADEC5, + ADEC6, + ADEC7, + ADEC8, + ADEC9, + ADEC10, + ADEC11, + ADEC12, + ADEC13, + ADEC14, + ADEC15, + ADEC16, + ADECILC, +}; + +enum reg_info { + ISR = 0, + IMR, + ECCR0_ERR_STATUS, + ECCR0_ADDR_LO, + ECCR0_ADDR_HI, + ECCR0_DATA_LO, + ECCR0_DATA_HI, + ECCR0_PAR, + ECCR1_ERR_STATUS, + ECCR1_ADDR_LO, + ECCR1_ADDR_HI, + ECCR1_DATA_LO, + ECCR1_DATA_HI, + ECCR1_PAR, + XMPU_ERR, + XMPU_ERR_ADDR_L0, + XMPU_ERR_ADDR_HI, + XMPU_ERR_AXI_ID, + ADEC_CHK_ERR_LOG, +}; + +static bool get_ddr_info(u32 *error_data, struct mc_priv *priv) +{ + u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr; + struct ecc_status *p; + + isr = error_data[ISR]; + + if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK))) + return false; + + eccr0_val = error_data[ECCR0_ERR_STATUS]; + eccr1_val = error_data[ECCR1_ERR_STATUS]; + + if (!eccr0_val && !eccr1_val) + return false; + + p = &priv->stat; + + if (!eccr0_val) + p->channel = 1; + else + p->channel = 0; + + reglo = error_data[ECCR0_ADDR_LO]; + reghi = error_data[ECCR0_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[0].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[0].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR0_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + reglo = error_data[ECCR1_ADDR_LO]; + reghi = error_data[ECCR1_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[1].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[1].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR1_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + return true; +} + +/** + * convert_to_physical - Convert @error_data to a physical address. + * @priv: DDR memory controller private instance data. + * @pinf: ECC error info structure. + * @controller: Controller number of the MC5 + * @error_data: the DDRMC5 ADEC address decoder register data + * + * Return: physical address of the DDR memory. 
+ */ +static unsigned long convert_to_physical(struct mc_priv *priv, + union ecc_error_info pinf, + int controller, int *error_data) +{ + u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset; + u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base; + unsigned long err_addr = 0, addr; + union row_col_mapping cols; + union row_col_mapping rows; + u32 col_bit_0; + + row = pinf.rowhi << MC5_REGHI_ROW | pinf.row; + offset = controller * ADEC_NUM; + + reg = error_data[ADEC6]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC7]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC8]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + + reg = error_data[ADEC9]; + rows.i = reg; + + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + + col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]); + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << col_bit_0; + + cols.i = error_data[ADEC10]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + cols.i = error_data[ADEC11]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + reg = error_data[ADEC12]; + err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0); + pinf.bank >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg); + pinf.bank >>= MC5_EACHBIT; + + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.group >>= MC5_EACHBIT; + + reg = error_data[ADEC4]; + err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0); + pinf.rank >>= MC5_EACHBIT; + err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg); + pinf.rank >>= MC5_EACHBIT; + + reg = error_data[ADEC5]; + err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg); + pinf.lrank >>= 
MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.lrank >>= MC5_EACHBIT; + + high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE; + interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL; + + high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK; + low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK; + reg = priv->adec[ADEC14 + offset]; + ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN); + ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M; + if (ilc_himem_en) + ilc_base_ctrl_add = ilcmem_base - high_mem_offset; + else + ilc_base_ctrl_add = ilcmem_base - low_mem_offset; + + if (priv->dwidth == DEV_X16) { + blk = err_addr / MC5_X16_SIZE; + rsh_req_addr = (blk << 8) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } else { + blk = err_addr / MC5_X32_SIZE; + rsh_req_addr = (blk << 9) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } + + if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base) + addr = err_addr - high_mem_offset; + else + addr = err_addr - low_mem_offset; + + return addr; +} + +/** + * handle_error - Handle errors. + * @priv: DDR memory controller private instance data. + * @stat: ECC status structure. + * @ctl_num: Controller number of the MC5 + * @error_data: the MC5 ADEC address decoder register data + * + * Handles ECC correctable and uncorrectable errors. + */ +static void handle_error(struct mc_priv *priv, struct ecc_status *stat, + int ctl_num, int *error_data) +{ + union ecc_error_info pinf; + struct mem_ctl_info *mci; + unsigned long pa; + phys_addr_t pfn; + int err; + + if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS)) + return; + + mci = priv->mci[ctl_num]; + + if (stat->error_type == MC5_ERR_TYPE_CE) { + pinf = stat->ceinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s Controller %d Addr at %lx\n", + "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + } + + if (stat->error_type == MC5_ERR_TYPE_UE) { + pinf = stat->ueinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s controller %d Addr at %lx\n", + "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + pa = convert_to_physical(priv, pinf, ctl_num, error_data); + pfn = PHYS_PFN(pa); + + if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) { + err = memory_failure(pfn, MF_ACTION_REQUIRED); + if (err) + edac_dbg(2, "memory_failure() error: %d", err); + else + edac_dbg(2, "Poison page at PA 0x%lx\n", pa); + } + } +} + +static void mc_init(struct mem_ctl_info *mci, struct device *dev) +{ + struct mc_priv *priv = mci->pvt_info; + struct csrow_info *csi; + struct dimm_info *dimm; + u32 row; + int ch; + + /* Initialize controller capabilities and configuration */ + mci->mtype_cap = MEM_FLAG_DDR5; + mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; + mci->scrub_cap = SCRUB_HW_SRC; + mci->scrub_mode = SCRUB_NONE; + + mci->edac_cap = EDAC_FLAG_SECDED; + mci->ctl_name = "VersalNET DDR5"; + mci->dev_name = dev_name(dev); + mci->mod_name = "versalnet_edac"; + + edac_op_state = EDAC_OPSTATE_INT; + + for (row = 0; row < mci->nr_csrows; row++) { + csi = mci->csrows[row]; + for (ch = 0; ch < csi->nr_channels; ch++) { + 
dimm = csi->channels[ch]->dimm; + dimm->edac_mode = EDAC_SECDED; + dimm->mtype = MEM_DDR5; + dimm->grain = MC5_ERR_GRAIN; + dimm->dtype = priv->dwidth; + } + } +} + +#define to_mci(k) container_of(k, struct mem_ctl_info, dev) + +static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) +{ + return MCDI_RPC_TIMEOUT; +} + +static void mcdi_request(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len) +{ + void *send_buf; + int ret; + + send_buf = kzalloc(hdr_len + sdu_len, GFP_KERNEL); + if (!send_buf) + return; + + memcpy(send_buf, hdr, hdr_len); + memcpy(send_buf + hdr_len, sdu, sdu_len); + + ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len); + if (ret) + dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret); + + kfree(send_buf); +} + +static const struct cdx_mcdi_ops mcdi_ops = { + .mcdi_rpc_timeout = mcdi_rpc_timeout, + .mcdi_request = mcdi_request, +}; + +static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi) +{ + size_t outlen; + int ret; + + MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN); + MCDI_DECLARE_BUF(outbuf, BUFFER_SZ); + + MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index); + + ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + if (!ret) + memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG), + (ADEC_NUM * 4)); +} + +static int setup_mcdi(struct mc_priv *mc_priv) +{ + struct cdx_mcdi *amd_mcdi; + int ret, i; + + amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL); + if (!amd_mcdi) + return -ENOMEM; + + amd_mcdi->mcdi_ops = &mcdi_ops; + ret = cdx_mcdi_init(amd_mcdi); + if (ret) { + kfree(amd_mcdi); + return ret; + } + + amd_mcdi->ept = mc_priv->ept; + mc_priv->mcdi = amd_mcdi; + + for (i = 0; i < NUM_CONTROLLERS; i++) + get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi); + + return 0; +} + +static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2, + 0xb8, 0xb4, 0x45, 0x56, 0x2e, + 0x8c, 0x5b, 0xec); + +static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, + int len, void *priv, u32 src) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + const guid_t *sec_type = &guid_null; + u32 length, offset, error_id; + u32 *result = (u32 *)data; + struct ecc_status *p; + int i, j, k, sec_sev; + const char *err_str; + u32 *adec_data; + + if (*(u8 *)data == MCDI_RESPONSE) { + cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len); + return 0; + } + + sec_sev = result[ERROR_LEVEL]; + error_id = result[ERROR_ID]; + length = result[MSG_ERR_LENGTH]; + offset = result[MSG_ERR_OFFSET]; + + if (result[TOTAL_ERR_LENGTH] > length) { + if (!mc_priv->part_len) + mc_priv->part_len = length; + else + mc_priv->part_len += length; + /* + * The data can come in 2 stretches. 
Reassemble the regs[] buffer from both messages; the offset field indicates where in regs[] this chunk of data belongs. + */ + for (i = 0 ; i < length; i++) { + k = offset + i; + j = ERROR_DATA + i; + mc_priv->regs[k] = result[j]; + } + if (mc_priv->part_len < result[TOTAL_ERR_LENGTH]) + return 0; + mc_priv->part_len = 0; + } + + mc_priv->error_id = error_id; + mc_priv->error_level = result[ERROR_LEVEL]; + + switch (error_id) { + case 5: err_str = "General Software Non-Correctable error"; break; + case 6: err_str = "CFU error"; break; + case 7: err_str = "CFRAME error"; break; + case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break; + case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break; + case 15: err_str = "MMCM error"; break; + case 16: err_str = "HNICX Correctable error"; break; + case 17: err_str = "HNICX Non-Correctable error"; break; + + case 18: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_CE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + case 19: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_UE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + + case 21: err_str = "GT Non-Correctable error"; break; + case 22: err_str = "PL Sysmon Correctable error"; break; + case 23: err_str = "PL Sysmon Non-Correctable error"; break; + case 111: err_str = "LPX unexpected dfx activation error"; break; + case 114: err_str = "INT_LPD Non-Correctable error"; break; + case 116: err_str = "INT_OCM Non-Correctable error"; break; + case 117: err_str = "INT_FPD Correctable error"; break; + case 118: err_str = "INT_FPD Non-Correctable error"; break; + case 120: err_str = "INT_IOU Non-Correctable error"; break; + case 123: err_str = "err_int_irq from APU GIC Distributor"; break; + case 124: err_str = "fault_int_irq from APU GIC Distributor"; break; + case 132 ... 139: err_str = "FPX SPLITTER error"; break; + case 140: err_str = "APU Cluster 0 error"; break; + case 141: err_str = "APU Cluster 1 error"; break; + case 142: err_str = "APU Cluster 2 error"; break; + case 143: err_str = "APU Cluster 3 error"; break; + case 145: err_str = "WWDT1 LPX error"; break; + case 147: err_str = "IPI error"; break; + case 152 ... 153: err_str = "AFIFS error"; break; + case 154 ... 155: err_str = "LPX glitch error"; break; + case 185 ... 186: err_str = "FPX AFIFS error"; break; + case 195 ... 199: err_str = "AFIFM error"; break; + case 108: err_str = "PSM Correctable error"; break; + case 59: err_str = "PMC correctable error"; break; + case 60: err_str = "PMC Uncorrectable error"; break; + case 43 ... 47: err_str = "PMC Sysmon error"; break; + case 163 ... 
184: err_str = "RPU error"; break; + case 148: err_str = "OCM0 correctable error"; break; + case 149: err_str = "OCM1 correctable error"; break; + case 150: err_str = "OCM0 Un-correctable error"; break; + case 151: err_str = "OCM1 Un-correctable error"; break; + case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break; + case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break; + case 232: err_str = "CRAM Un-Correctable error"; break; + default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break; + } + + snprintf(mc_priv->message, + sizeof(mc_priv->message), + "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str); + + /* Convert to bytes */ + length = result[TOTAL_ERR_LENGTH] * 4; + log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message, + sec_sev, (void *)&result[ERROR_DATA], length); + + return 0; +} + +static struct rpmsg_device_id amd_rpmsg_id_table[] = { + { .name = "error_ipc" }, + { }, +}; +MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table); + +static int rpmsg_probe(struct rpmsg_device *rpdev) +{ + struct rpmsg_channel_info chinfo; + struct mc_priv *pg; + + pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data; + chinfo.src = RPMSG_ADDR_ANY; + chinfo.dst = rpdev->dst; + strscpy(chinfo.name, amd_rpmsg_id_table[0].name, + strlen(amd_rpmsg_id_table[0].name)); + + pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo); + if (!pg->ept) + return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n", + chinfo.name); + + dev_set_drvdata(&rpdev->dev, pg); + + return 0; +} + +static void rpmsg_remove(struct rpmsg_device *rpdev) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + + rpmsg_destroy_ept(mc_priv->ept); + dev_set_drvdata(&rpdev->dev, NULL); +} + +static struct rpmsg_driver amd_rpmsg_driver = { + .drv.name = KBUILD_MODNAME, + .probe = rpmsg_probe, + .remove = rpmsg_remove, + .callback = rpmsg_cb, + .id_table = amd_rpmsg_id_table, +}; + +static void versal_edac_release(struct device *dev) +{ + kfree(dev); +} + +static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev) +{ + u32 num_chans, rank, dwidth, config; + struct edac_mc_layer layers[2]; + struct mem_ctl_info *mci; + struct device *dev; + enum dev_type dt; + char *name; + int rc, i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + config = priv->adec[CONF + i * ADEC_NUM]; + num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config); + rank = 1 << FIELD_GET(MC5_RANK_MASK, config); + dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config); + + switch (dwidth) { + case XDDR5_BUS_WIDTH_16: + dt = DEV_X16; + break; + case XDDR5_BUS_WIDTH_32: + dt = DEV_X32; + break; + case XDDR5_BUS_WIDTH_64: + dt = DEV_X64; + break; + default: + dt = DEV_UNKNOWN; + } + + if (dt == DEV_UNKNOWN) + continue; + + /* Find the first enabled device and register that one. 
*/ + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = rank; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = num_chans; + layers[1].is_virt_csrow = false; + + rc = -ENOMEM; + mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers, + sizeof(struct mc_priv)); + if (!mci) { + edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i); + goto err_alloc; + } + + priv->mci[i] = mci; + priv->dwidth = dt; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev->release = versal_edac_release; + name = kmalloc(32, GFP_KERNEL); + sprintf(name, "versal-net-ddrmc5-edac-%d", i); + dev->init_name = name; + rc = device_register(dev); + if (rc) + goto err_alloc; + + mci->pdev = dev; + + platform_set_drvdata(pdev, priv); + + mc_init(mci, dev); + rc = edac_mc_add_mc(mci); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i); + goto err_alloc; + } + } + return 0; + +err_alloc: + while (i--) { + mci = priv->mci[i]; + if (!mci) + continue; + + if (mci->pdev) { + device_unregister(mci->pdev); + edac_mc_del_mc(mci->pdev); + } + + edac_mc_free(mci); + } + + return rc; +} + +static void remove_versalnet(struct mc_priv *priv) +{ + struct mem_ctl_info *mci; + int i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + device_unregister(priv->mci[i]->pdev); + mci = edac_mc_del_mc(priv->mci[i]->pdev); + if (!mci) + return; + + edac_mc_free(mci); + } +} + +static int mc_probe(struct platform_device *pdev) +{ + struct device_node *r5_core_node; + struct mc_priv *priv; + struct rproc *rp; + int rc; + + r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0); + if (!r5_core_node) { + dev_err(&pdev->dev, "amd,rproc: invalid phandle\n"); + return -EINVAL; + } + + rp = rproc_get_by_phandle(r5_core_node->phandle); + if (!rp) + return -EPROBE_DEFER; + + rc = rproc_boot(rp); + if (rc) { + dev_err(&pdev->dev, "Failed to attach to remote processor\n"); + goto err_rproc_boot; + } + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) { + rc = -ENOMEM; + goto err_alloc; + } + + amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv; + + rc = register_rpmsg_driver(&amd_rpmsg_driver); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc); + goto err_alloc; + } + + rc = setup_mcdi(priv); + if (rc) + goto err_unreg; + + priv->mcdi->r5_rproc = rp; + + rc = init_versalnet(priv, pdev); + if (rc) + goto err_init; + + return 0; + +err_init: + cdx_mcdi_finish(priv->mcdi); + +err_unreg: + unregister_rpmsg_driver(&amd_rpmsg_driver); + +err_alloc: + rproc_shutdown(rp); + +err_rproc_boot: + rproc_put(rp); + + return rc; +} + +static void mc_remove(struct platform_device *pdev) +{ + struct mc_priv *priv = platform_get_drvdata(pdev); + + unregister_rpmsg_driver(&amd_rpmsg_driver); + remove_versalnet(priv); + rproc_shutdown(priv->mcdi->r5_rproc); + cdx_mcdi_finish(priv->mcdi); +} + +static const struct of_device_id amd_edac_match[] = { + { .compatible = "xlnx,versal-net-ddrmc5", }, + {} +}; +MODULE_DEVICE_TABLE(of, amd_edac_match); + +static struct platform_driver amd_ddr_edac_mc_driver = { + .driver = { + .name = "versal-net-edac", + .of_match_table = amd_edac_match, + }, + .probe = mc_probe, + .remove = mc_remove, +}; + +module_platform_driver(amd_ddr_edac_mc_driver); + +MODULE_AUTHOR("AMD Inc"); +MODULE_DESCRIPTION("Versal NET EDAC driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 
cafc90d4caaf..0d05eac7c72b 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -788,7 +788,9 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry, *kernel_entry = addr + entry; - return efi_adjust_memory_range_protection(addr, kernel_text_size); + return efi_adjust_memory_range_protection(addr, kernel_text_size) ?: + efi_adjust_memory_range_protection(addr + kernel_inittext_offset, + kernel_inittext_size); } static void __noreturn enter_kernel(unsigned long kernel_addr, diff --git a/drivers/irqchip/irq-aspeed-scu-ic.c b/drivers/irqchip/irq-aspeed-scu-ic.c index 1c7045467c48..5584e0f82cce 100644 --- a/drivers/irqchip/irq-aspeed-scu-ic.c +++ b/drivers/irqchip/irq-aspeed-scu-ic.c @@ -1,61 +1,78 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Aspeed AST24XX, AST25XX, and AST26XX SCU Interrupt Controller + * Aspeed AST24XX, AST25XX, AST26XX, and AST27XX SCU Interrupt Controller * Copyright 2019 IBM Corporation * * Eddie James <eajames@linux.ibm.com> */ #include <linux/bitops.h> +#include <linux/io.h> #include <linux/irq.h> #include <linux/irqchip.h> #include <linux/irqchip/chained_irq.h> #include <linux/irqdomain.h> -#include <linux/mfd/syscon.h> +#include <linux/of_address.h> #include <linux/of_irq.h> -#include <linux/regmap.h> -#define ASPEED_SCU_IC_REG 0x018 -#define ASPEED_SCU_IC_SHIFT 0 -#define ASPEED_SCU_IC_ENABLE GENMASK(15, ASPEED_SCU_IC_SHIFT) -#define ASPEED_SCU_IC_NUM_IRQS 7 #define ASPEED_SCU_IC_STATUS GENMASK(28, 16) #define ASPEED_SCU_IC_STATUS_SHIFT 16 +#define AST2700_SCU_IC_STATUS GENMASK(15, 0) + +struct aspeed_scu_ic_variant { + const char *compatible; + unsigned long irq_enable; + unsigned long irq_shift; + unsigned int num_irqs; + unsigned long ier; + unsigned long isr; +}; -#define ASPEED_AST2600_SCU_IC0_REG 0x560 -#define ASPEED_AST2600_SCU_IC0_SHIFT 0 -#define ASPEED_AST2600_SCU_IC0_ENABLE \ - GENMASK(5, ASPEED_AST2600_SCU_IC0_SHIFT) -#define ASPEED_AST2600_SCU_IC0_NUM_IRQS 6 +#define SCU_VARIANT(_compat, _shift, _enable, _num, _ier, _isr) { \ + .compatible = _compat, \ + .irq_shift = _shift, \ + .irq_enable = _enable, \ + .num_irqs = _num, \ + .ier = _ier, \ + .isr = _isr, \ +} -#define ASPEED_AST2600_SCU_IC1_REG 0x570 -#define ASPEED_AST2600_SCU_IC1_SHIFT 4 -#define ASPEED_AST2600_SCU_IC1_ENABLE \ - GENMASK(5, ASPEED_AST2600_SCU_IC1_SHIFT) -#define ASPEED_AST2600_SCU_IC1_NUM_IRQS 2 +static const struct aspeed_scu_ic_variant scu_ic_variants[] __initconst = { + SCU_VARIANT("aspeed,ast2400-scu-ic", 0, GENMASK(15, 0), 7, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2500-scu-ic", 0, GENMASK(15, 0), 7, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2600-scu-ic0", 0, GENMASK(5, 0), 6, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2600-scu-ic1", 4, GENMASK(5, 4), 2, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2700-scu-ic0", 0, GENMASK(3, 0), 4, 0x00, 0x04), + SCU_VARIANT("aspeed,ast2700-scu-ic1", 0, GENMASK(3, 0), 4, 0x00, 0x04), + SCU_VARIANT("aspeed,ast2700-scu-ic2", 0, GENMASK(3, 0), 4, 0x04, 0x00), + SCU_VARIANT("aspeed,ast2700-scu-ic3", 0, GENMASK(1, 0), 2, 0x04, 0x00), +}; struct aspeed_scu_ic { - unsigned long irq_enable; - unsigned long irq_shift; - unsigned int num_irqs; - unsigned int reg; - struct regmap *scu; - struct irq_domain *irq_domain; + unsigned long irq_enable; + unsigned long irq_shift; + unsigned int num_irqs; + void __iomem *base; + struct irq_domain *irq_domain; + unsigned long ier; + unsigned long isr; }; -static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) +static inline bool 
scu_has_split_isr(struct aspeed_scu_ic *scu) +{ + return scu->ier != scu->isr; +} + +static void aspeed_scu_ic_irq_handler_combined(struct irq_desc *desc) { - unsigned int sts; - unsigned long bit; - unsigned long enabled; - unsigned long max; - unsigned long status; struct aspeed_scu_ic *scu_ic = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); - unsigned int mask = scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT; + unsigned long bit, enabled, max, status; + unsigned int sts, mask; chained_irq_enter(chip, desc); + mask = scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT; /* * The SCU IC has just one register to control its operation and read * status. The interrupt enable bits occupy the lower 16 bits of the @@ -66,7 +83,7 @@ static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) * shifting the status down to get the mapping and then back up to * clear the bit. */ - regmap_read(scu_ic->scu, scu_ic->reg, &sts); + sts = readl(scu_ic->base); enabled = sts & scu_ic->irq_enable; status = (sts >> ASPEED_SCU_IC_STATUS_SHIFT) & enabled; @@ -74,43 +91,83 @@ static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) max = scu_ic->num_irqs + bit; for_each_set_bit_from(bit, &status, max) { - generic_handle_domain_irq(scu_ic->irq_domain, - bit - scu_ic->irq_shift); + generic_handle_domain_irq(scu_ic->irq_domain, bit - scu_ic->irq_shift); + writel((readl(scu_ic->base) & ~mask) | BIT(bit + ASPEED_SCU_IC_STATUS_SHIFT), + scu_ic->base); + } + + chained_irq_exit(chip, desc); +} + +static void aspeed_scu_ic_irq_handler_split(struct irq_desc *desc) +{ + struct aspeed_scu_ic *scu_ic = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned long bit, enabled, max, status; + unsigned int sts, mask; - regmap_write_bits(scu_ic->scu, scu_ic->reg, mask, - BIT(bit + ASPEED_SCU_IC_STATUS_SHIFT)); + chained_irq_enter(chip, desc); + + mask = scu_ic->irq_enable; + sts = readl(scu_ic->base + scu_ic->isr); + enabled = sts & scu_ic->irq_enable; + sts = readl(scu_ic->base + scu_ic->isr); + status = sts & enabled; + + bit = scu_ic->irq_shift; + max = scu_ic->num_irqs + bit; + + for_each_set_bit_from(bit, &status, max) { + generic_handle_domain_irq(scu_ic->irq_domain, bit - scu_ic->irq_shift); + /* Clear interrupt */ + writel(BIT(bit), scu_ic->base + scu_ic->isr); } chained_irq_exit(chip, desc); } -static void aspeed_scu_ic_irq_mask(struct irq_data *data) +static void aspeed_scu_ic_irq_mask_combined(struct irq_data *data) { struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); - unsigned int mask = BIT(data->hwirq + scu_ic->irq_shift) | - (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); + unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); + unsigned int mask = bit | (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); /* * Status bits are cleared by writing 1. In order to prevent the mask * operation from clearing the status bits, they should be under the * mask and written with 0. 
*/ - regmap_update_bits(scu_ic->scu, scu_ic->reg, mask, 0); + writel(readl(scu_ic->base) & ~mask, scu_ic->base); } -static void aspeed_scu_ic_irq_unmask(struct irq_data *data) +static void aspeed_scu_ic_irq_unmask_combined(struct irq_data *data) { struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); - unsigned int mask = bit | - (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); + unsigned int mask = bit | (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); /* * Status bits are cleared by writing 1. In order to prevent the unmask * operation from clearing the status bits, they should be under the * mask and written with 0. */ - regmap_update_bits(scu_ic->scu, scu_ic->reg, mask, bit); + writel((readl(scu_ic->base) & ~mask) | bit, scu_ic->base); +} + +static void aspeed_scu_ic_irq_mask_split(struct irq_data *data) +{ + struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); + unsigned int mask = BIT(data->hwirq + scu_ic->irq_shift); + + writel(readl(scu_ic->base) & ~mask, scu_ic->base + scu_ic->ier); +} + +static void aspeed_scu_ic_irq_unmask_split(struct irq_data *data) +{ + struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); + unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); + + writel(readl(scu_ic->base) | bit, scu_ic->base + scu_ic->ier); } static int aspeed_scu_ic_irq_set_affinity(struct irq_data *data, @@ -120,17 +177,29 @@ static int aspeed_scu_ic_irq_set_affinity(struct irq_data *data, return -EINVAL; } -static struct irq_chip aspeed_scu_ic_chip = { +static struct irq_chip aspeed_scu_ic_chip_combined = { .name = "aspeed-scu-ic", - .irq_mask = aspeed_scu_ic_irq_mask, - .irq_unmask = aspeed_scu_ic_irq_unmask, - .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, + .irq_mask = aspeed_scu_ic_irq_mask_combined, + .irq_unmask = aspeed_scu_ic_irq_unmask_combined, + .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, +}; + +static struct irq_chip aspeed_scu_ic_chip_split = { + .name = "ast2700-scu-ic", + .irq_mask = aspeed_scu_ic_irq_mask_split, + .irq_unmask = aspeed_scu_ic_irq_unmask_split, + .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, }; static int aspeed_scu_ic_map(struct irq_domain *domain, unsigned int irq, irq_hw_number_t hwirq) { - irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip, handle_level_irq); + struct aspeed_scu_ic *scu_ic = domain->host_data; + + if (scu_has_split_isr(scu_ic)) + irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip_split, handle_level_irq); + else + irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip_combined, handle_level_irq); irq_set_chip_data(irq, domain->host_data); return 0; @@ -143,21 +212,21 @@ static const struct irq_domain_ops aspeed_scu_ic_domain_ops = { static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, struct device_node *node) { - int irq; - int rc = 0; + int irq, rc = 0; - if (!node->parent) { - rc = -ENODEV; + scu_ic->base = of_iomap(node, 0); + if (IS_ERR(scu_ic->base)) { + rc = PTR_ERR(scu_ic->base); goto err; } - scu_ic->scu = syscon_node_to_regmap(node->parent); - if (IS_ERR(scu_ic->scu)) { - rc = PTR_ERR(scu_ic->scu); - goto err; + if (scu_has_split_isr(scu_ic)) { + writel(AST2700_SCU_IC_STATUS, scu_ic->base + scu_ic->isr); + writel(0, scu_ic->base + scu_ic->ier); + } else { + writel(ASPEED_SCU_IC_STATUS, scu_ic->base); + writel(0, scu_ic->base); } - regmap_write_bits(scu_ic->scu, scu_ic->reg, ASPEED_SCU_IC_STATUS, ASPEED_SCU_IC_STATUS); - regmap_write_bits(scu_ic->scu, scu_ic->reg, ASPEED_SCU_IC_ENABLE, 0); irq = 
irq_of_parse_and_map(node, 0); if (!irq) { @@ -166,75 +235,60 @@ static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, } scu_ic->irq_domain = irq_domain_create_linear(of_fwnode_handle(node), scu_ic->num_irqs, - &aspeed_scu_ic_domain_ops, - scu_ic); + &aspeed_scu_ic_domain_ops, scu_ic); if (!scu_ic->irq_domain) { rc = -ENOMEM; goto err; } - irq_set_chained_handler_and_data(irq, aspeed_scu_ic_irq_handler, + irq_set_chained_handler_and_data(irq, scu_has_split_isr(scu_ic) ? + aspeed_scu_ic_irq_handler_split : + aspeed_scu_ic_irq_handler_combined, scu_ic); return 0; err: kfree(scu_ic); - return rc; } -static int __init aspeed_scu_ic_of_init(struct device_node *node, - struct device_node *parent) +static const struct aspeed_scu_ic_variant *aspeed_scu_ic_find_variant(struct device_node *np) { - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); - - if (!scu_ic) - return -ENOMEM; - - scu_ic->irq_enable = ASPEED_SCU_IC_ENABLE; - scu_ic->irq_shift = ASPEED_SCU_IC_SHIFT; - scu_ic->num_irqs = ASPEED_SCU_IC_NUM_IRQS; - scu_ic->reg = ASPEED_SCU_IC_REG; - - return aspeed_scu_ic_of_init_common(scu_ic, node); + for (int i = 0; i < ARRAY_SIZE(scu_ic_variants); i++) { + if (of_device_is_compatible(np, scu_ic_variants[i].compatible)) + return &scu_ic_variants[i]; + } + return NULL; } -static int __init aspeed_ast2600_scu_ic0_of_init(struct device_node *node, - struct device_node *parent) +static int __init aspeed_scu_ic_of_init(struct device_node *node, struct device_node *parent) { - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); + const struct aspeed_scu_ic_variant *variant; + struct aspeed_scu_ic *scu_ic; - if (!scu_ic) - return -ENOMEM; - - scu_ic->irq_enable = ASPEED_AST2600_SCU_IC0_ENABLE; - scu_ic->irq_shift = ASPEED_AST2600_SCU_IC0_SHIFT; - scu_ic->num_irqs = ASPEED_AST2600_SCU_IC0_NUM_IRQS; - scu_ic->reg = ASPEED_AST2600_SCU_IC0_REG; - - return aspeed_scu_ic_of_init_common(scu_ic, node); -} - -static int __init aspeed_ast2600_scu_ic1_of_init(struct device_node *node, - struct device_node *parent) -{ - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); + variant = aspeed_scu_ic_find_variant(node); + if (!variant) + return -ENODEV; + scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); if (!scu_ic) return -ENOMEM; - scu_ic->irq_enable = ASPEED_AST2600_SCU_IC1_ENABLE; - scu_ic->irq_shift = ASPEED_AST2600_SCU_IC1_SHIFT; - scu_ic->num_irqs = ASPEED_AST2600_SCU_IC1_NUM_IRQS; - scu_ic->reg = ASPEED_AST2600_SCU_IC1_REG; + scu_ic->irq_enable = variant->irq_enable; + scu_ic->irq_shift = variant->irq_shift; + scu_ic->num_irqs = variant->num_irqs; + scu_ic->ier = variant->ier; + scu_ic->isr = variant->isr; return aspeed_scu_ic_of_init_common(scu_ic, node); } IRQCHIP_DECLARE(ast2400_scu_ic, "aspeed,ast2400-scu-ic", aspeed_scu_ic_of_init); IRQCHIP_DECLARE(ast2500_scu_ic, "aspeed,ast2500-scu-ic", aspeed_scu_ic_of_init); -IRQCHIP_DECLARE(ast2600_scu_ic0, "aspeed,ast2600-scu-ic0", - aspeed_ast2600_scu_ic0_of_init); -IRQCHIP_DECLARE(ast2600_scu_ic1, "aspeed,ast2600-scu-ic1", - aspeed_ast2600_scu_ic1_of_init); +IRQCHIP_DECLARE(ast2600_scu_ic0, "aspeed,ast2600-scu-ic0", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2600_scu_ic1, "aspeed,ast2600-scu-ic1", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic0, "aspeed,ast2700-scu-ic0", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic1, "aspeed,ast2700-scu-ic1", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic2, "aspeed,ast2700-scu-ic2", aspeed_scu_ic_of_init); 
+IRQCHIP_DECLARE(ast2700_scu_ic3, "aspeed,ast2700-scu-ic3", aspeed_scu_ic_of_init); diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index 24ef5af569fe..8a3410c2b7b5 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -153,14 +153,19 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, { msi_alloc_info_t *info = args; struct v2m_data *v2m = NULL, *tmp; - int hwirq, offset, i, err = 0; + int hwirq, i, err = 0; + unsigned long offset; + unsigned long align_mask = nr_irqs - 1; spin_lock(&v2m_lock); list_for_each_entry(tmp, &v2m_nodes, entry) { - offset = bitmap_find_free_region(tmp->bm, tmp->nr_spis, - get_count_order(nr_irqs)); - if (offset >= 0) { + unsigned long align_off = tmp->spi_start - (tmp->spi_start & ~align_mask); + + offset = bitmap_find_next_zero_area_off(tmp->bm, tmp->nr_spis, 0, + nr_irqs, align_mask, align_off); + if (offset < tmp->nr_spis) { v2m = tmp; + bitmap_set(v2m->bm, offset, nr_irqs); break; } } diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index dbeb85677b08..3de351e66ee8 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1766,8 +1766,9 @@ static int gic_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { - unsigned int type, ret, ppi_idx; + unsigned int type, ppi_idx; irq_hw_number_t hwirq; + int ret; /* Not for us */ if (fwspec->fwnode != d->fwnode) diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c index 13c035727e32..ce2732d649a3 100644 --- a/drivers/irqchip/irq-gic-v5-irs.c +++ b/drivers/irqchip/irq-gic-v5-irs.c @@ -571,7 +571,7 @@ static void __init gicv5_irs_init_bases(struct gicv5_irs_chip_data *irs_data, FIELD_PREP(GICV5_IRS_CR1_IST_RA, GICV5_NO_READ_ALLOC) | FIELD_PREP(GICV5_IRS_CR1_IC, GICV5_NON_CACHE) | FIELD_PREP(GICV5_IRS_CR1_OC, GICV5_NON_CACHE); - irs_data->flags |= IRS_FLAGS_NON_COHERENT; + irs_data->flags |= IRS_FLAGS_NON_COHERENT; } else { cr1 = FIELD_PREP(GICV5_IRS_CR1_VPED_WA, GICV5_WRITE_ALLOC) | FIELD_PREP(GICV5_IRS_CR1_VPED_RA, GICV5_READ_ALLOC) | diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index 9290ac741949..554485f0be1f 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -191,9 +191,9 @@ static int gicv5_its_create_itt_two_level(struct gicv5_its_chip_data *its, unsigned int num_events) { unsigned int l1_bits, l2_bits, span, events_per_l2_table; - unsigned int i, complete_tables, final_span, num_ents; + unsigned int complete_tables, final_span, num_ents; __le64 *itt_l1, *itt_l2, **l2ptrs; - int ret; + int i, ret; u64 val; ret = gicv5_its_l2sz_to_l2_bits(itt_l2sz); @@ -768,8 +768,6 @@ static struct gicv5_its_dev *gicv5_its_alloc_device(struct gicv5_its_chip_data * goto out_dev_free; } - gicv5_its_device_cache_inv(its, its_dev); - its_dev->its_node = its; its_dev->event_map = (unsigned long *)bitmap_zalloc(its_dev->num_events, GFP_KERNEL); @@ -949,15 +947,18 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi device_id = its_dev->device_id; for (i = 0; i < nr_irqs; i++) { - lpi = gicv5_alloc_lpi(); + ret = gicv5_alloc_lpi(); if (ret < 0) { pr_debug("Failed to find free LPI!\n"); - goto out_eventid; + goto out_free_irqs; } + lpi = ret; ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); - if (ret) - goto out_free_lpi; + if (ret) { + gicv5_free_lpi(lpi); + goto out_free_irqs; + } /* * Store eventid and 
deviceid into the hwirq for later use. @@ -977,8 +978,13 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi return 0; -out_free_lpi: - gicv5_free_lpi(lpi); +out_free_irqs: + while (--i >= 0) { + irqd = irq_domain_get_irq_data(domain, virq + i); + gicv5_free_lpi(irqd->parent_data->hwirq); + irq_domain_reset_irq_data(irqd); + irq_domain_free_irqs_parent(domain, virq + i, 1); + } out_eventid: gicv5_its_free_eventid(its_dev, event_id_base, nr_irqs); return ret; diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c index b2860eb2d32c..39e5a72ccd3c 100644 --- a/drivers/irqchip/irq-loongson-eiointc.c +++ b/drivers/irqchip/irq-loongson-eiointc.c @@ -46,6 +46,7 @@ #define EIOINTC_ALL_ENABLE_VEC_MASK(vector) (EIOINTC_ALL_ENABLE & ~BIT(vector & 0x1f)) #define EIOINTC_REG_ENABLE_VEC(vector) (EIOINTC_REG_ENABLE + ((vector >> 5) << 2)) #define EIOINTC_USE_CPU_ENCODE BIT(0) +#define EIOINTC_ROUTE_MULT_IP BIT(1) #define MAX_EIO_NODES (NR_CPUS / CORES_PER_EIO_NODE) @@ -59,6 +60,14 @@ #define EIOINTC_REG_ROUTE_VEC_MASK(vector) (0xff << EIOINTC_REG_ROUTE_VEC_SHIFT(vector)) static int nr_pics; +struct eiointc_priv; + +struct eiointc_ip_route { + struct eiointc_priv *priv; + /* Offset Routed destination IP */ + int start; + int end; +}; struct eiointc_priv { u32 node; @@ -68,6 +77,8 @@ struct eiointc_priv { struct fwnode_handle *domain_handle; struct irq_domain *eiointc_domain; int flags; + irq_hw_number_t parent_hwirq; + struct eiointc_ip_route route_info[VEC_REG_COUNT]; }; static struct eiointc_priv *eiointc_priv[MAX_IO_PICS]; @@ -188,6 +199,7 @@ static int eiointc_router_init(unsigned int cpu) { int i, bit, cores, index, node; unsigned int data; + int hwirq, mask; node = cpu_to_eio_node(cpu); index = eiointc_index(node); @@ -197,6 +209,13 @@ static int eiointc_router_init(unsigned int cpu) return -EINVAL; } + /* Enable cpu interrupt pin from eiointc */ + hwirq = eiointc_priv[index]->parent_hwirq; + mask = BIT(hwirq); + if (eiointc_priv[index]->flags & EIOINTC_ROUTE_MULT_IP) + mask |= BIT(hwirq + 1) | BIT(hwirq + 2) | BIT(hwirq + 3); + set_csr_ecfg(mask); + if (!(eiointc_priv[index]->flags & EIOINTC_USE_CPU_ENCODE)) cores = CORES_PER_EIO_NODE; else @@ -211,8 +230,31 @@ static int eiointc_router_init(unsigned int cpu) } for (i = 0; i < eiointc_priv[0]->vec_count / 32 / 4; i++) { - bit = BIT(1 + index); /* Route to IP[1 + index] */ - data = bit | (bit << 8) | (bit << 16) | (bit << 24); + /* + * Route to interrupt pin, relative offset used here + * Offset 0 means routing to IP0 and so on + * + * If flags is set with EIOINTC_ROUTE_MULT_IP, + * every 64 vector routes to different consecutive + * IPs, otherwise all vector routes to the same IP + */ + if (eiointc_priv[index]->flags & EIOINTC_ROUTE_MULT_IP) { + /* The first 64 vectors route to hwirq */ + bit = BIT(hwirq++ - INT_HWI0); + data = bit | (bit << 8); + + /* The second 64 vectors route to hwirq + 1 */ + bit = BIT(hwirq++ - INT_HWI0); + data |= (bit << 16) | (bit << 24); + + /* + * Route to hwirq + 2/hwirq + 3 separately + * in next loop + */ + } else { + bit = BIT(hwirq - INT_HWI0); + data = bit | (bit << 8) | (bit << 16) | (bit << 24); + } iocsr_write32(data, EIOINTC_REG_IPMAP + i * 4); } @@ -241,15 +283,22 @@ static int eiointc_router_init(unsigned int cpu) static void eiointc_irq_dispatch(struct irq_desc *desc) { - int i; - u64 pending; - bool handled = false; + struct eiointc_ip_route *info = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); - struct 
eiointc_priv *priv = irq_desc_get_handler_data(desc); + bool handled = false; + u64 pending; + int i; chained_irq_enter(chip, desc); - for (i = 0; i < eiointc_priv[0]->vec_count / VEC_COUNT_PER_REG; i++) { + /* + * If EIOINTC_ROUTE_MULT_IP is set, every 64 interrupt vectors in + * eiointc interrupt controller routes to different cpu interrupt pins + * + * Every cpu interrupt pin has its own irq handler, it is ok to + * read ISR for these 64 interrupt vectors rather than all vectors + */ + for (i = info->start; i < info->end; i++) { pending = iocsr_read64(EIOINTC_REG_ISR + (i << 3)); /* Skip handling if pending bitmap is zero */ @@ -262,7 +311,7 @@ static void eiointc_irq_dispatch(struct irq_desc *desc) int bit = __ffs(pending); int irq = bit + VEC_COUNT_PER_REG * i; - generic_handle_domain_irq(priv->eiointc_domain, irq); + generic_handle_domain_irq(info->priv->eiointc_domain, irq); pending &= ~BIT(bit); handled = true; } @@ -462,8 +511,33 @@ static int __init eiointc_init(struct eiointc_priv *priv, int parent_irq, } eiointc_priv[nr_pics++] = priv; + /* + * Only the first eiointc device on VM supports routing to + * different CPU interrupt pins. The later eiointc devices use + * generic method if there are multiple eiointc devices in future + */ + if (cpu_has_hypervisor && (nr_pics == 1)) { + priv->flags |= EIOINTC_ROUTE_MULT_IP; + priv->parent_hwirq = INT_HWI0; + } + + if (priv->flags & EIOINTC_ROUTE_MULT_IP) { + for (i = 0; i < priv->vec_count / VEC_COUNT_PER_REG; i++) { + priv->route_info[i].start = priv->parent_hwirq - INT_HWI0 + i; + priv->route_info[i].end = priv->route_info[i].start + 1; + priv->route_info[i].priv = priv; + parent_irq = get_percpu_irq(priv->parent_hwirq + i); + irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, + &priv->route_info[i]); + } + } else { + priv->route_info[0].start = 0; + priv->route_info[0].end = priv->vec_count / VEC_COUNT_PER_REG; + priv->route_info[0].priv = priv; + irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, + &priv->route_info[0]); + } eiointc_router_init(0); - irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, priv); if (nr_pics == 1) { register_syscore_ops(&eiointc_syscore_ops); @@ -495,7 +569,7 @@ int __init eiointc_acpi_init(struct irq_domain *parent, priv->vec_count = VEC_COUNT; priv->node = acpi_eiointc->node; - + priv->parent_hwirq = acpi_eiointc->cascade; parent_irq = irq_create_mapping(parent, acpi_eiointc->cascade); ret = eiointc_init(priv, parent_irq, acpi_eiointc->node_map); @@ -527,8 +601,9 @@ out_free_priv: static int __init eiointc_of_init(struct device_node *of_node, struct device_node *parent) { - int parent_irq, ret; struct eiointc_priv *priv; + struct irq_data *irq_data; + int parent_irq, ret; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) @@ -544,6 +619,12 @@ static int __init eiointc_of_init(struct device_node *of_node, if (ret < 0) goto out_free_priv; + irq_data = irq_get_irq_data(parent_irq); + if (!irq_data) { + ret = -ENODEV; + goto out_free_priv; + } + /* * In particular, the number of devices supported by the LS2K0500 * extended I/O interrupt vector is 128. 
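The eiointc_router_init() change above packs one route byte per 32 vectors into each 32-bit IPMAP register; with EIOINTC_ROUTE_MULT_IP set, every 64 vectors are steered to the next CPU interrupt pin instead of all sharing one. The following stand-alone user-space sketch reproduces only that packing arithmetic (the function name, the parent_ip parameter and the main() driver are illustrative; parent_ip is the offset of the parent hwirq from INT_HWI0, which is 0 in the hypervisor case enabled by the patch):

#include <stdio.h>

#define BIT(x) (1U << (x))

/* Pack one route byte per 32 vectors; each IPMAP register covers 128 vectors. */
static unsigned int ipmap_value(int reg, int parent_ip, int route_mult_ip)
{
        unsigned int bit, data;

        if (route_mult_ip) {
                /* first 64 vectors of this register -> pin parent_ip + 2*reg */
                bit = BIT(parent_ip + 2 * reg);
                data = bit | (bit << 8);
                /* second 64 vectors -> the next pin */
                bit = BIT(parent_ip + 2 * reg + 1);
                data |= (bit << 16) | (bit << 24);
        } else {
                /* all 128 vectors of this register -> the same pin */
                bit = BIT(parent_ip);
                data = bit | (bit << 8) | (bit << 16) | (bit << 24);
        }
        return data;
}

int main(void)
{
        for (int reg = 0; reg < 2; reg++)
                printf("IPMAP[%d]: multi-IP 0x%08x, single-IP 0x%08x\n",
                       reg, ipmap_value(reg, 0, 1), ipmap_value(reg, 0, 0));
        return 0;
}

For two registers this prints 0x02020101 and 0x08080404 in multi-IP mode versus 0x01010101 for the single-pin layout, matching the per-64-vector spread described in the comment added by the patch.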
@@ -552,7 +633,7 @@ static int __init eiointc_of_init(struct device_node *of_node, priv->vec_count = 128; else priv->vec_count = VEC_COUNT; - + priv->parent_hwirq = irqd_to_hwirq(irq_data); priv->node = 0; priv->domain_handle = of_fwnode_handle(of_node); diff --git a/drivers/irqchip/irq-loongson-pch-lpc.c b/drivers/irqchip/irq-loongson-pch-lpc.c index 2d4c3ec128b8..912bf50a5c7c 100644 --- a/drivers/irqchip/irq-loongson-pch-lpc.c +++ b/drivers/irqchip/irq-loongson-pch-lpc.c @@ -200,8 +200,13 @@ int __init pch_lpc_acpi_init(struct irq_domain *parent, goto iounmap_base; } - priv->lpc_domain = irq_domain_create_linear(irq_handle, LPC_COUNT, - &pch_lpc_domain_ops, priv); + /* + * The LPC interrupt controller is a legacy i8259-compatible device, + * which requires a static 1:1 mapping for IRQs 0-15. + * Use irq_domain_create_legacy to establish this static mapping early. + */ + priv->lpc_domain = irq_domain_create_legacy(irq_handle, LPC_COUNT, 0, 0, + &pch_lpc_domain_ops, priv); if (!priv->lpc_domain) { pr_err("Failed to create IRQ domain\n"); goto free_irq_handle; diff --git a/drivers/irqchip/irq-msi-lib.c b/drivers/irqchip/irq-msi-lib.c index 908944009c21..d5eefc3d7215 100644 --- a/drivers/irqchip/irq-msi-lib.c +++ b/drivers/irqchip/irq-msi-lib.c @@ -112,6 +112,20 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, */ if (!chip->irq_set_affinity && !(info->flags & MSI_FLAG_NO_AFFINITY)) chip->irq_set_affinity = msi_domain_set_affinity; + + /* + * If the parent domain insists on being in charge of masking, obey + * blindly. The interrupt is un-masked at the PCI level on startup + * and masked on shutdown to prevent rogue interrupts after the + * driver freed the interrupt. Not masking it at the PCI level + * speeds up operation for disable/enable_irq() as it avoids + * getting all the way out to the PCI device. 
+ */ + if (info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT) { + chip->irq_mask = irq_chip_mask_parent; + chip->irq_unmask = irq_chip_unmask_parent; + } + return true; } EXPORT_SYMBOL_GPL(msi_lib_init_dev_msi_info); diff --git a/drivers/irqchip/irq-nvic.c b/drivers/irqchip/irq-nvic.c index 76e11cac9631..2191a2b79578 100644 --- a/drivers/irqchip/irq-nvic.c +++ b/drivers/irqchip/irq-nvic.c @@ -73,8 +73,9 @@ static int __init nvic_of_init(struct device_node *node, struct device_node *parent) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; - unsigned int irqs, i, ret, numbanks; + unsigned int irqs, i, numbanks; void __iomem *nvic_base; + int ret; numbanks = (readl_relaxed(V7M_SCS_ICTR) & V7M_SCS_ICTR_INTLINESNUM_MASK) + 1; diff --git a/drivers/irqchip/irq-renesas-rza1.c b/drivers/irqchip/irq-renesas-rza1.c index a697eb55ac90..6047a524ac77 100644 --- a/drivers/irqchip/irq-renesas-rza1.c +++ b/drivers/irqchip/irq-renesas-rza1.c @@ -142,11 +142,12 @@ static const struct irq_domain_ops rza1_irqc_domain_ops = { static int rza1_irqc_parse_map(struct rza1_irqc_priv *priv, struct device_node *gic_node) { - unsigned int imaplen, i, j, ret; struct device *dev = priv->dev; + unsigned int imaplen, i, j; struct device_node *ipar; const __be32 *imap; u32 intsize; + int ret; imap = of_get_property(dev->of_node, "interrupt-map", &imaplen); if (!imap) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 360d88687e4f..2a54adeb4cc7 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -578,7 +578,7 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * &rzg2l_irqc_domain_ops, rzg2l_irqc_data); if (!irq_domain) { pm_runtime_put(dev); - return dev_err_probe(dev, -ENOMEM, "failed to add irq domain\n"); + return -ENOMEM; } register_syscore_ops(&rzg2l_irqc_syscore_ops); diff --git a/drivers/irqchip/irq-sg2042-msi.c b/drivers/irqchip/irq-sg2042-msi.c index bcfddc51bc6a..f7cf0dc72eab 100644 --- a/drivers/irqchip/irq-sg2042-msi.c +++ b/drivers/irqchip/irq-sg2042-msi.c @@ -30,6 +30,7 @@ struct sg204x_msi_chip_info { * @doorbell_addr: see TRM, 10.1.32, GP_INTR0_SET * @irq_first: First vectors number that MSIs starts * @num_irqs: Number of vectors for MSIs + * @irq_type: IRQ type for MSIs * @msi_map: mapping for allocated MSI vectors. 
* @msi_map_lock: Lock for msi_map * @chip_info: chip specific infomations @@ -41,6 +42,7 @@ struct sg204x_msi_chipdata { u32 irq_first; u32 num_irqs; + unsigned int irq_type; unsigned long *msi_map; struct mutex msi_map_lock; @@ -85,6 +87,8 @@ static void sg2042_msi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *m static const struct irq_chip sg2042_msi_middle_irq_chip = { .name = "SG2042 MSI", + .irq_startup = irq_chip_startup_parent, + .irq_shutdown = irq_chip_shutdown_parent, .irq_ack = sg2042_msi_irq_ack, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, @@ -114,6 +118,8 @@ static void sg2044_msi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *m static struct irq_chip sg2044_msi_middle_irq_chip = { .name = "SG2044 MSI", + .irq_startup = irq_chip_startup_parent, + .irq_shutdown = irq_chip_shutdown_parent, .irq_ack = sg2044_msi_irq_ack, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, @@ -133,14 +139,14 @@ static int sg204x_msi_parent_domain_alloc(struct irq_domain *domain, unsigned in fwspec.fwnode = domain->parent->fwnode; fwspec.param_count = 2; fwspec.param[0] = data->irq_first + hwirq; - fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + fwspec.param[1] = data->irq_type; ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); if (ret) return ret; d = irq_domain_get_irq_data(domain->parent, virq); - return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); + return d->chip->irq_set_type(d, data->irq_type); } static int sg204x_msi_middle_domain_alloc(struct irq_domain *domain, unsigned int virq, @@ -185,8 +191,10 @@ static const struct irq_domain_ops sg204x_msi_middle_domain_ops = { .select = msi_lib_irq_domain_select, }; -#define SG2042_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ - MSI_FLAG_USE_DEF_CHIP_OPS) +#define SG2042_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT | \ + MSI_FLAG_PCI_MSI_STARTUP_PARENT) #define SG2042_MSI_FLAGS_SUPPORTED MSI_GENERIC_FLAGS_MASK @@ -200,10 +208,13 @@ static const struct msi_parent_ops sg2042_msi_parent_ops = { .init_dev_msi_info = msi_lib_init_dev_msi_info, }; -#define SG2044_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ - MSI_FLAG_USE_DEF_CHIP_OPS) +#define SG2044_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT | \ + MSI_FLAG_PCI_MSI_STARTUP_PARENT) -#define SG2044_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ +#define SG2044_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_MULTI_PCI_MSI | \ MSI_FLAG_PCI_MSIX) static const struct msi_parent_ops sg2044_msi_parent_ops = { @@ -289,6 +300,7 @@ static int sg2042_msi_probe(struct platform_device *pdev) } data->irq_first = (u32)args.args[0]; + data->irq_type = (unsigned int)args.args[1]; data->num_irqs = (u32)args.args[args.nargs - 1]; mutex_init(&data->msi_map_lock); diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index bf69a4802b71..559fda8fb3a8 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -179,12 +179,14 @@ static int plic_set_affinity(struct irq_data *d, if (cpu >= nr_cpu_ids) return -EINVAL; - plic_irq_disable(d); + /* Invalidate the original routing entry */ + plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 0); irq_data_update_effective_affinity(d, cpumask_of(cpu)); + /* Setting the new routing entry if irq is enabled */ if (!irqd_irq_disabled(d)) - plic_irq_enable(d); + 
plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 1); return IRQ_SET_MASK_OK_DONE; } @@ -257,7 +259,7 @@ static int plic_irq_suspend(void) readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); } - for_each_cpu(cpu, cpu_present_mask) { + for_each_present_cpu(cpu) { struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); if (!handler->present) @@ -289,7 +291,7 @@ static void plic_irq_resume(void) priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID); } - for_each_cpu(cpu, cpu_present_mask) { + for_each_present_cpu(cpu) { struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); if (!handler->present) diff --git a/drivers/media/rc/pwm-ir-tx.c b/drivers/media/rc/pwm-ir-tx.c index 84533fdd61aa..047472dc9244 100644 --- a/drivers/media/rc/pwm-ir-tx.c +++ b/drivers/media/rc/pwm-ir-tx.c @@ -117,7 +117,6 @@ static int pwm_ir_tx_atomic(struct rc_dev *dev, unsigned int *txbuf, static enum hrtimer_restart pwm_ir_timer(struct hrtimer *timer) { struct pwm_ir *pwm_ir = container_of(timer, struct pwm_ir, timer); - ktime_t now; /* * If we happen to hit an odd latency spike, loop through the @@ -139,9 +138,7 @@ static enum hrtimer_restart pwm_ir_timer(struct hrtimer *timer) hrtimer_add_expires_ns(timer, ns); pwm_ir->txbuf_index++; - - now = timer->base->get_time(); - } while (hrtimer_get_expires_tv64(timer) < now); + } while (hrtimer_expires_remaining(timer) > 0); return HRTIMER_RESTART; } diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 74aaea61de13..d2b690857e58 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -519,6 +519,7 @@ int of_irq_count(struct device_node *dev) return nr; } +EXPORT_SYMBOL_GPL(of_irq_count); /** * of_irq_to_resource_table - Fill in resource table with node's IRQ info diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0938ef7ebabf..dfb61f152702 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -148,20 +148,43 @@ static void pci_device_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *d arg->hwirq = desc->msi_index; } -static __always_inline void cond_mask_parent(struct irq_data *data) +static void cond_shutdown_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; - if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + irq_chip_shutdown_parent(data); + else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) irq_chip_mask_parent(data); } -static __always_inline void cond_unmask_parent(struct irq_data *data) +static unsigned int cond_startup_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; - if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + return irq_chip_startup_parent(data); + else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) irq_chip_unmask_parent(data); + + return 0; +} + +static void pci_irq_shutdown_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + + pci_msi_mask(desc, BIT(data->irq - desc->irq)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + unsigned int ret = cond_startup_parent(data); + + pci_msi_unmask(desc, BIT(data->irq - desc->irq)); + return ret; } static void pci_irq_mask_msi(struct irq_data *data) @@ -169,14 +192,12 @@ static void pci_irq_mask_msi(struct irq_data *data) struct msi_desc *desc = 
irq_data_get_msi_desc(data); pci_msi_mask(desc, BIT(data->irq - desc->irq)); - cond_mask_parent(data); } static void pci_irq_unmask_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); - cond_unmask_parent(data); pci_msi_unmask(desc, BIT(data->irq - desc->irq)); } @@ -194,6 +215,8 @@ static void pci_irq_unmask_msi(struct irq_data *data) static const struct msi_domain_template pci_msi_template = { .chip = { .name = "PCI-MSI", + .irq_startup = pci_irq_startup_msi, + .irq_shutdown = pci_irq_shutdown_msi, .irq_mask = pci_irq_mask_msi, .irq_unmask = pci_irq_unmask_msi, .irq_write_msi_msg = pci_msi_domain_write_msg, @@ -210,15 +233,27 @@ static const struct msi_domain_template pci_msi_template = { }, }; +static void pci_irq_shutdown_msix(struct irq_data *data) +{ + pci_msix_mask(irq_data_get_msi_desc(data)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msix(struct irq_data *data) +{ + unsigned int ret = cond_startup_parent(data); + + pci_msix_unmask(irq_data_get_msi_desc(data)); + return ret; +} + static void pci_irq_mask_msix(struct irq_data *data) { pci_msix_mask(irq_data_get_msi_desc(data)); - cond_mask_parent(data); } static void pci_irq_unmask_msix(struct irq_data *data) { - cond_unmask_parent(data); pci_msix_unmask(irq_data_get_msi_desc(data)); } @@ -234,6 +269,8 @@ EXPORT_SYMBOL_GPL(pci_msix_prepare_desc); static const struct msi_domain_template pci_msix_template = { .chip = { .name = "PCI-MSIX", + .irq_startup = pci_irq_startup_msix, + .irq_shutdown = pci_irq_shutdown_msix, .irq_mask = pci_irq_mask_msix, .irq_unmask = pci_irq_unmask_msix, .irq_write_msi_msg = pci_msi_domain_write_msg, diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index 71e104a068e9..7449873c3d40 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -790,7 +790,7 @@ static const struct x86_cpu_id isst_cpu_ids[] = { X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_ICELAKE_D, 0), X86_MATCH_VFM(INTEL_ICELAKE_X, 0), - X86_MATCH_VFM(INTEL_PANTHERCOVE_X, SST_HPM_SUPPORTED), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), X86_MATCH_VFM(INTEL_SKYLAKE_X, SST_MBOX_SUPPORTED), {} diff --git a/drivers/platform/x86/intel/tpmi_power_domains.c b/drivers/platform/x86/intel/tpmi_power_domains.c index 8641353b2e06..7d93119a4c30 100644 --- a/drivers/platform/x86/intel/tpmi_power_domains.c +++ b/drivers/platform/x86/intel/tpmi_power_domains.c @@ -85,7 +85,7 @@ static const struct x86_cpu_id tpmi_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, NULL), X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, NULL), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, NULL), - X86_MATCH_VFM(INTEL_PANTHERCOVE_X, NULL), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, tpmi_cpu_ids); diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index a6e4792a1b2e..ac0e132ccc3e 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -51,6 +51,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, { trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len); } +EXPORT_SYMBOL_GPL(log_non_standard_event); void log_arm_hw_error(struct cper_sec_proc_arm *err) { diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 3c39cfacb251..0d0ef54fc4de 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c 
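Before the resctrl hunks below, a note on the PCI-MSI ordering introduced in drivers/pci/msi/irqdomain.c above: startup brings up the parent domain first and unmasks at the PCI level last, while shutdown masks at the PCI level first and only then tears down the parent, so the device can never raise an interrupt against a torn-down parent vector. A minimal user-space model of that mirrored ordering (all names are local to the sketch, not kernel API):

#include <stdio.h>
#include <stdbool.h>

struct vec {
        bool pci_masked;        /* per-vector mask bit at the device */
        bool parent_active;     /* vector live in the parent domain */
};

static void vec_startup(struct vec *v)
{
        v->parent_active = true;        /* parent startup first */
        v->pci_masked = false;          /* PCI-level unmask last */
}

static void vec_shutdown(struct vec *v)
{
        v->pci_masked = true;           /* PCI-level mask first */
        v->parent_active = false;       /* parent shutdown last */
}

int main(void)
{
        struct vec v = { .pci_masked = true, .parent_active = false };

        vec_startup(&v);
        printf("after startup:  pci_masked=%d parent_active=%d\n",
               v.pci_masked, v.parent_active);
        vec_shutdown(&v);
        printf("after shutdown: pci_masked=%d parent_active=%d\n",
               v.pci_masked, v.parent_active);
        return 0;
}

The runtime irq_mask/irq_unmask callbacks stay PCI-only after this change; the parent interaction moves entirely into the startup/shutdown pair shown here.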
@@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; @@ -563,10 +563,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->r = r; rr->d = d; rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r) && + resctrl_is_mbm_event(evtid)) { + rr->is_mbm_cntr = true; + } else { + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } } cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); @@ -582,7 +587,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + if (rr->arch_mon_ctx) + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) @@ -653,10 +659,16 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) checkresult: + /* + * -ENOENT is a special case, set only when "mbm_event" counter assignment + * mode is enabled and no counter has been assigned. + */ if (rr.err == -EIO) seq_puts(m, "Error\n"); else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); + else if (rr.err == -ENOENT) + seq_puts(m, "Unassigned\n"); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151..cf1fd82dc5a9 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -52,19 +52,31 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } /** - * struct mon_evt - Entry in the event list of a resource + * struct mon_evt - Properties of a monitor event * @evtid: event id + * @rid: resource id for this event * @name: name of the event + * @evt_cfg: Event configuration value that represents the + * memory transactions (e.g., READS_TO_LOCAL_MEM, + * READS_TO_REMOTE_MEM) being tracked by @evtid. + * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list + * @enabled: true if the event is enabled */ struct mon_evt { enum resctrl_event_id evtid; + enum resctrl_res_level rid; char *name; + u32 evt_cfg; bool configurable; - struct list_head list; + bool enabled; }; +extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; + +#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ + mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. @@ -99,6 +111,8 @@ struct mon_data { * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it + * is an MBM event. * @err: Error encountered when reading counter. * @val: Returned value of event counter. 
If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -113,6 +127,7 @@ struct rmid_read { enum resctrl_event_id evtid; bool first; struct cacheinfo *ci; + bool is_mbm_cntr; int err; u64 val; void *arch_mon_ctx; @@ -226,6 +241,8 @@ struct rdtgroup { #define RFTYPE_DEBUG BIT(10) +#define RFTYPE_ASSIGN_CONFIG BIT(11) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -375,6 +392,41 @@ bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); +void *rdt_kn_parent_priv(struct kernfs_node *kn); + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show); + +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, + void *v); + +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); + +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); + +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7326c28a7908..4076336fbba6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -346,27 +346,97 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *state; - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: + if (!resctrl_is_mbm_event(evtid)) return NULL; + + state = d->mbm_states[MBM_STATE_IDX(evtid)]; + + return state ? &state[idx] : NULL; +} + +/* + * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp. + * + * Return: + * Valid counter ID on success, or -ENOENT on failure. 
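The mbm_cntr_get(), mbm_cntr_alloc() and mbm_cntr_free() helpers that follow manage a small per-domain table of assignable counters keyed by (resctrl group, event). A stand-alone user-space sketch of that lookup/allocate/free pattern, keeping the same -ENOENT/-ENOSPC conventions (the fixed table size and the main() driver are illustrative only):

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define NUM_CNTRS 4

struct cntr_cfg {
        void *rdtgrp;           /* owning group, NULL when the slot is free */
        int evtid;              /* assigned MBM event */
};

static struct cntr_cfg cntr_cfg[NUM_CNTRS];

static int cntr_get(void *grp, int evtid)
{
        for (int i = 0; i < NUM_CNTRS; i++)
                if (cntr_cfg[i].rdtgrp == grp && cntr_cfg[i].evtid == evtid)
                        return i;
        return -ENOENT;
}

static int cntr_alloc(void *grp, int evtid)
{
        for (int i = 0; i < NUM_CNTRS; i++) {
                if (!cntr_cfg[i].rdtgrp) {
                        cntr_cfg[i].rdtgrp = grp;
                        cntr_cfg[i].evtid = evtid;
                        return i;
                }
        }
        return -ENOSPC;
}

static void cntr_free(int id)
{
        memset(&cntr_cfg[id], 0, sizeof(cntr_cfg[id]));
}

int main(void)
{
        int grp_a, grp_b;       /* stand-ins for struct rdtgroup pointers */
        int id;

        id = cntr_alloc(&grp_a, 1);
        printf("grp_a event 1 -> counter %d\n", id);
        printf("lookup grp_b event 1 -> %d (-ENOENT expected)\n",
               cntr_get(&grp_b, 1));
        if (id >= 0)
                cntr_free(id);
        return 0;
}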
+ */ +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + if (!r->mon.mbm_cntr_assignable) + return -ENOENT; + + if (!resctrl_is_mbm_event(evtid)) + return -ENOENT; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp && + d->cntr_cfg[cntr_id].evtid == evtid) + return cntr_id; + } + + return -ENOENT; +} + +/* + * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d. + * Caller must ensure that the specified event is not assigned already. + * + * Return: + * Valid counter ID on success, or -ENOSPC on failure. + */ +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (!d->cntr_cfg[cntr_id].rdtgrp) { + d->cntr_cfg[cntr_id].rdtgrp = rdtgrp; + d->cntr_cfg[cntr_id].evtid = evtid; + return cntr_id; + } } + + return -ENOSPC; } -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +/* + * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. + */ +static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +{ + memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); +} + +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct rdt_mon_domain *d; + int cntr_id = -ENOENT; struct mbm_state *m; int err, ret; u64 tval = 0; + if (rr->is_mbm_cntr) { + cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + if (cntr_id < 0) { + rr->err = -ENOENT; + return -EINVAL; + } + } + if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + if (rr->is_mbm_cntr) + resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + else + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); @@ -377,8 +447,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* Reading a single domain, must be on a CPU in that domain. */ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -402,8 +476,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { if (d->ci_id != rr->ci->id) continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -419,8 +497,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. 
- * @rmid: The rmid used to identify the cached mbm_state. + * @rdtgrp: resctrl group associated with the CLOSID and RMID to identify + * the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * * Supporting function to calculate the memory bandwidth @@ -428,9 +506,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. */ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { u64 cur_bw, bytes, cur_bytes; + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct mbm_state *m; m = get_mbm_state(rr->d, closid, rmid, rr->evtid); @@ -459,7 +539,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp, rr); /* * For Ctrl groups read data from child monitor groups and @@ -470,8 +550,7 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) + if (__mon_event_count(entry, rr) == 0) ret = 0; } } @@ -602,44 +681,49 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) } static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; rr.r = r; rr.d = d; rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r)) { + rr.is_mbm_cntr = true; + } else { + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } } - __mon_event_count(closid, rmid, &rr); + __mon_event_count(rdtgrp, &rr); /* * If the software controller is enabled, compute the * bandwidth for this event id. */ if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); + mbm_bw_count(rdtgrp, &rr); - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + if (rr.arch_mon_ctx) + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) + struct rdtgroup *rdtgrp) { /* * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. 
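mbm_bw_count(), whose parameters change above, is documented as running once per second and turning the byte-count delta since the previous call into a bandwidth in MBps. A minimal user-space sketch of that bookkeeping; the structure fields and sample byte counts are illustrative, not the kernel's mbm_state layout:

#include <stdio.h>

#define MB (1024ULL * 1024ULL)

struct mbm_bw_state {
        unsigned long long prev_bytes;  /* counter value one second ago */
        unsigned long long cur_bw;      /* last computed bandwidth, MBps */
};

static void bw_count(struct mbm_bw_state *m, unsigned long long cur_bytes)
{
        m->cur_bw = (cur_bytes - m->prev_bytes) / MB;
        m->prev_bytes = cur_bytes;
}

int main(void)
{
        struct mbm_bw_state m = { .prev_bytes = 0 };

        bw_count(&m, 512 * MB);         /* first second: 512 MB transferred */
        printf("bandwidth: %llu MBps\n", m.cur_bw);
        bw_count(&m, 768 * MB);         /* next second: 256 MB more */
        printf("bandwidth: %llu MBps\n", m.cur_bw);
        return 0;
}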
*/ - if (resctrl_arch_is_mbm_total_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID); - if (resctrl_arch_is_mbm_local_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID); } /* @@ -712,11 +796,11 @@ void mbm_handle_overflow(struct work_struct *work) d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); + mbm_update(r, d, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); + mbm_update(r, d, crgrp); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -842,38 +926,819 @@ out_unlock: mutex_unlock(&rdtgroup_mutex); } -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, +/* + * All available events. Architecture code marks the ones that + * are supported by a system using resctrl_enable_mon_event() + * to set .enabled. + */ +struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { + [QOS_L3_OCCUP_EVENT_ID] = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_TOTAL_EVENT_ID] = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_LOCAL_EVENT_ID] = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, }; -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +void resctrl_enable_mon_event(enum resctrl_event_id eventid) +{ + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + return; + if (mon_event_all[eventid].enabled) { + pr_warn("Duplicate enable for event %d\n", eventid); + return; + } + + mon_event_all[eventid].enabled = true; +} + +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) +{ + return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS && + mon_event_all[eventid].enabled; +} + +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) +{ + return mon_event_all[evtid].evt_cfg; +} + +/** + * struct mbm_transaction - Memory transaction an MBM event can be configured with. + * @name: Name of memory transaction (read, write ...). + * @val: The bit (eg. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to + * represent the memory transaction within an event's configuration. + */ +struct mbm_transaction { + char name[32]; + u32 val; }; -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +/* Decoded values for each type of memory transaction. 
*/ +static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = { + {"local_reads", READS_TO_LOCAL_MEM}, + {"remote_reads", READS_TO_REMOTE_MEM}, + {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM}, + {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM}, + {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM}, + {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM}, + {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM}, }; +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + bool sep = false; + int ret = 0, i; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (mevt->evt_cfg & mbm_transactions[i].val) { + if (sep) + seq_putc(seq, ','); + seq_printf(seq, "%s", mbm_transactions[i].name); + sep = true; + } + } + seq_putc(seq, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s, + void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool value; + int ret; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + r->mon.mbm_assign_on_mkdir = value; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + +/* + * mbm_cntr_free_all() - Clear all the counter ID configuration details in the + * domain @d. Called when mbm_assign_mode is changed. + */ +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); +} + +/* + * resctrl_reset_rmid_all() - Reset all non-architecture states for all the + * supported RMIDs. + */ +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + enum resctrl_event_id evt; + int idx; + + for_each_mbm_event_id(evt) { + if (!resctrl_is_mon_event_enabled(evt)) + continue; + idx = MBM_STATE_IDX(evt); + memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit); + } +} + +/* + * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID + * pair in the domain. + * + * Assign the counter if @assign is true else unassign the counter. Reset the + * associated non-architectural state. 
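The mbm_transactions[] table above is used in both directions: event_filter_show() decodes an event's configuration mask into a comma-separated list of transaction names, and the write path further below parses such a list back into a mask. A self-contained user-space sketch of that encode/decode pair; the transaction names come from the patch, but the bit values used here are illustrative rather than the hardware encoding:

#include <stdio.h>
#include <string.h>

#define BIT(x) (1U << (x))

static const struct { const char *name; unsigned int val; } transactions[] = {
        { "local_reads",                  BIT(0) },
        { "remote_reads",                 BIT(1) },
        { "local_non_temporal_writes",    BIT(2) },
        { "remote_non_temporal_writes",   BIT(3) },
        { "local_reads_slow_memory",      BIT(4) },
        { "remote_reads_slow_memory",     BIT(5) },
        { "dirty_victim_writes_all",      BIT(6) },
};
#define NUM_TRANSACTIONS (sizeof(transactions) / sizeof(transactions[0]))

/* "local_reads,remote_reads" -> bitmask; returns -1 on an unknown name */
static int parse_filter(char *buf, unsigned int *mask)
{
        char *tok;

        *mask = 0;
        while ((tok = strsep(&buf, ",")) != NULL) {
                unsigned int i;

                for (i = 0; i < NUM_TRANSACTIONS; i++) {
                        if (!strcmp(tok, transactions[i].name)) {
                                *mask |= transactions[i].val;
                                break;
                        }
                }
                if (i == NUM_TRANSACTIONS)
                        return -1;
        }
        return 0;
}

/* bitmask -> "name,name,..." as event_filter_show() prints it */
static void print_filter(unsigned int mask)
{
        const char *sep = "";

        for (unsigned int i = 0; i < NUM_TRANSACTIONS; i++) {
                if (mask & transactions[i].val) {
                        printf("%s%s", sep, transactions[i].name);
                        sep = ",";
                }
        }
        putchar('\n');
}

int main(void)
{
        char buf[] = "local_reads,local_reads_slow_memory";
        unsigned int mask;

        if (!parse_filter(buf, &mask))
                print_filter(mask);
        return 0;
}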
+ */ +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mbm_state *m; + + resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign); + + m = get_mbm_state(d, closid, rmid, evtid); + if (m) + memset(m, 0, sizeof(*m)); +} + +/* + * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event + * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d. + * + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + /* No action required if the counter is assigned already. */ + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + if (cntr_id >= 0) + return 0; + + cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid); + if (cntr_id < 0) { + rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n", + mevt->name, d->hdr.id); + return cntr_id; + } + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true); + + return 0; +} + /* - * Initialize the event list for the resource. + * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in + * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is + * NULL; otherwise, assign the counter to the specified domain @d. + * + * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr() + * will fail. The assignment process will abort at the first failure encountered + * during domain traversal, which may result in the event being only partially + * assigned. * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + int ret = 0; + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + if (ret) + return ret; + } + } else { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + } + + return ret; +} + +/* + * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when + * a new group is created. + * + * Each group can accommodate two counters per domain: one for the total + * event and one for the local event. Assignments may fail due to the limited + * number of counters. However, it is not necessary to fail the group creation + * and thus no failure is returned. Users have the option to modify the + * counter assignments after the group has been created. 
+ */ +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) || + !r->mon.mbm_assign_on_mkdir) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + +/* + * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration + * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. + */ +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + + /* If there is no cntr_id assigned, nothing to do */ + if (cntr_id < 0) + return; + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false); + + mbm_cntr_free(d, cntr_id); +} + +/* + * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with + * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign + * the counters from all the domains if @d is NULL else unassign from @d. + */ +static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } else { + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } +} + +/* + * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events. + * Called when a group is deleted. 
*/ -static void l3_mon_evt_init(struct rdt_resource *r) +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) { - INIT_LIST_HEAD(&r->evt_list); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + +static int resctrl_parse_mem_transactions(char *tok, u32 *val) +{ + u32 temp_val = 0; + char *evt_str; + bool found; + int i; + +next_config: + if (!tok || tok[0] == '\0') { + *val = temp_val; + return 0; + } + + /* Start processing the strings for each memory transaction type */ + evt_str = strim(strsep(&tok, ",")); + found = false; + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (!strcmp(mbm_transactions[i].name, evt_str)) { + temp_val |= mbm_transactions[i].val; + found = true; + break; + } + } + + if (!found) { + rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str); + return -EINVAL; + } + + goto next_config; +} + +/* + * rdtgroup_update_cntr_event - Update the counter assignments for the event + * in a group. + * @r: Resource to which update needs to be done. + * @rdtgrp: Resctrl group. + * @evtid: MBM monitor event. + */ +static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, + enum resctrl_event_id evtid) +{ + struct rdt_mon_domain *d; + int cntr_id; + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid); + if (cntr_id >= 0) + rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid, + rdtgrp->closid, cntr_id, true); + } +} + +/* + * resctrl_update_cntr_allrdtgrp - Update the counter assignments for the event + * for all the groups. + * @mevt MBM Monitor event. + */ +static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + struct rdtgroup *prgrp, *crgrp; + + /* + * Find all the groups where the event is assigned and update the + * configuration of existing assignments. 
+ */ + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + rdtgroup_update_cntr_event(r, prgrp, mevt->evtid); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + rdtgroup_update_cntr_event(r, crgrp, mevt->evtid); + } +} + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + u32 evt_cfg = 0; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + ret = resctrl_parse_mem_transactions(buf, &evt_cfg); + if (!ret && mevt->evt_cfg != evt_cfg) { + mevt->evt_cfg = evt_cfg; + resctrl_update_cntr_allrdtgrp(mevt); + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool enabled; + + mutex_lock(&rdtgroup_mutex); + enabled = resctrl_arch_mbm_cntr_assign_enabled(r); + + if (r->mon.mbm_cntr_assignable) { + if (enabled) + seq_puts(s, "[mbm_event]\n"); + else + seq_puts(s, "[default]\n"); + + if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) { + if (enabled) + seq_puts(s, "default\n"); + else + seq_puts(s, "mbm_event\n"); + } + } else { + seq_puts(s, "[default]\n"); + } + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *d; + int ret = 0; + bool enable; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!strcmp(buf, "default")) { + enable = 0; + } else if (!strcmp(buf, "mbm_event")) { + if (r->mon.mbm_cntr_assignable) { + enable = 1; + } else { + ret = -EINVAL; + rdt_last_cmd_puts("mbm_event mode is not supported\n"); + goto out_unlock; + } + } else { + ret = -EINVAL; + rdt_last_cmd_puts("Unsupported assign mode\n"); + goto out_unlock; + } + + if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) { + ret = resctrl_arch_mbm_cntr_assign_set(r, enable); + if (ret) + goto out_unlock; + + /* Update the visibility of BMEC related files */ + resctrl_bmec_files_show(r, NULL, !enable); + + /* + * Initialize the default memory transaction values for + * total and local events. + */ + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); + /* Enable auto assignment when switching to "mbm_event" mode */ + if (enable) + r->mon.mbm_assign_on_mkdir = true; + /* + * Reset all the non-achitectural RMID state and assignable counters. 
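When the mode switch above lands in "mbm_event", the total event's filter defaults to the full mbm_cfg_mask while the local event keeps only the local-memory transactions. A tiny sketch of that computation, assuming illustrative bit positions (matching the earlier sketch) and an example mbm_cfg_mask of 0x7f:

#include <stdio.h>

#define BIT(x)                          (1U << (x))
#define READS_TO_LOCAL_MEM              BIT(0)
#define NON_TEMP_WRITE_TO_LOCAL_MEM     BIT(2)
#define READS_TO_LOCAL_S_MEM            BIT(4)

int main(void)
{
        unsigned int mbm_cfg_mask = 0x7f;       /* example: all transaction types */
        unsigned int total_cfg = mbm_cfg_mask;
        unsigned int local_cfg = mbm_cfg_mask & (READS_TO_LOCAL_MEM |
                                                 READS_TO_LOCAL_S_MEM |
                                                 NON_TEMP_WRITE_TO_LOCAL_MEM);

        printf("mbm_total_bytes default filter: 0x%02x\n", total_cfg);
        printf("mbm_local_bytes default filter: 0x%02x\n", local_cfg);
        return 0;
}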
+ */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + mbm_cntr_free_all(r, d); + resctrl_reset_rmid_all(r, d); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs); + sep = true; + } + seq_putc(s, '\n'); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + u32 cntrs, i; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + cntrs = 0; + for (i = 0; i < r->mon.num_mbm_cntrs; i++) { + if (!dom->cntr_cfg[i].rdtgrp) + cntrs++; + } + + seq_printf(s, "%d=%u", dom->hdr.id, cntrs); + sep = true; + } + seq_putc(s, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret; +} + +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + struct mon_evt *mevt; + int ret = 0; + bool sep; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out_unlock; + } + + rdt_last_cmd_clear(); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + sep = false; + seq_printf(s, "%s:", mevt->name); + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0) + seq_printf(s, "%d=_", d->hdr.id); + else + seq_printf(s, "%d=e", d->hdr.id); + + sep = true; + } + seq_putc(s, '\n'); + } + +out_unlock: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching + * event name. 
+ */ +static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name) +{ + struct mon_evt *mevt; + + for_each_mon_event(mevt) { + if (mevt->rid == r->rid && mevt->enabled && + resctrl_is_mbm_event(mevt->evtid) && + !strcmp(mevt->name, name)) + return mevt; + } + + return NULL; +} + +static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int ret = 0; + + if (!assign || strlen(assign) != 1) + return -EINVAL; + + switch (*assign) { + case 'e': + ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt); + break; + case '_': + rdtgroup_unassign_cntr_event(d, rdtgrp, mevt); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, + char *event, char *tok) +{ + struct rdt_mon_domain *d; + unsigned long dom_id = 0; + char *dom_str, *id_str; + struct mon_evt *mevt; + int ret; + + mevt = mbm_get_mon_event_by_name(r, event); + if (!mevt) { + rdt_last_cmd_printf("Invalid event %s\n", event); + return -ENOENT; + } + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + + id_str = strsep(&dom_str, "="); + + /* Check for domain id '*' which means all domains */ + if (id_str && *id_str == '*') { + ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt); + if (ret) + rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n", + event, dom_str); + return ret; + } else if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing domain id\n"); + return -EINVAL; + } + + /* Verify if the dom_id is valid */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt); + if (ret) { + rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n", + event, dom_id, dom_str); + return ret; + } + goto next; + } + } + + rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id); + return -EINVAL; +} + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdtgroup *rdtgrp; + char *token, *event; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event mode is not enabled\n"); + rdtgroup_kn_unlock(of->kn); + return -EINVAL; + } + + while ((token = strsep(&buf, "\n")) != NULL) { + /* + * The write command follows the following format: + * "<Event>:<Domain ID>=<Assignment state>" + * Extract the event name first. 
+ */ + event = strsep(&token, ":"); + + ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token); + if (ret) + break; + } + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; } /** @@ -900,24 +1765,43 @@ int resctrl_mon_resource_init(void) if (ret) return ret; - l3_mon_evt_init(r); - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) + else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + if (r->mon.mbm_cntr_assignable) { + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); + r->mon.mbm_assign_on_mkdir = true; + resctrl_file_fflags_init("num_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("available_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); + resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | + RFTYPE_RES_CACHE); + resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + } + return 0; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 77d08229d855..0320360cd7a6 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -123,14 +123,8 @@ void rdt_staged_configs_clear(void) static bool resctrl_is_mbm_enabled(void) { - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); -} - -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); + return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) || + resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } /* @@ -196,7 +190,7 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { + resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -981,7 +975,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, return 0; } -static void *rdt_kn_parent_priv(struct kernfs_node *kn) +void *rdt_kn_parent_priv(struct kernfs_node *kn) { /* * The parent pointer is only valid within RCU section since it can be @@ -1141,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->num_rmid); + seq_printf(seq, "%d\n", r->mon.num_rmid); return 0; } 
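The mbm_L3_assignments write path above accepts lines of the form "<event>:<domain>=<state>" with ';'-separated domains, '*' for all domains, and 'e'/'_' for assign/unassign. A stand-alone user-space sketch of that syntax; it only prints the operations it would perform, and the event name, domain ids and helper names are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int apply(const char *event, const char *dom, char state)
{
        if (state != 'e' && state != '_')
                return -1;
        printf("%s: domain %s -> %s\n", event, dom,
               state == 'e' ? "assign counter" : "unassign counter");
        return 0;
}

static int parse_line(char *line)
{
        char *event = strsep(&line, ":");
        char *tok;

        if (!event || !line)
                return -1;

        while ((tok = strsep(&line, ";")) != NULL) {
                char *dom = strsep(&tok, "=");

                if (!dom || !tok || strlen(tok) != 1)
                        return -1;
                if (apply(event, dom, tok[0]))
                        return -1;
        }
        return 0;
}

int main(void)
{
        char line[] = "mbm_total_bytes:0=e;1=_;*=e";

        return parse_line(line) ? EXIT_FAILURE : EXIT_SUCCESS;
}

In the kernel the '*' case fans out to every monitoring domain and unknown domain ids are rejected with an error message in last_cmd_status; the sketch keeps only the tokenizing structure.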
@@ -1152,9 +1146,12 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = rdt_kn_parent_priv(of->kn); struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) + if (mevt->configurable && + !resctrl_arch_mbm_cntr_assign_enabled(r)) seq_printf(seq, "%s_config\n", mevt->name); } @@ -1735,9 +1732,9 @@ next: } /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { + if ((val & r->mon.mbm_cfg_mask) != val) { rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); + r->mon.mbm_cfg_mask); return -EINVAL; } @@ -1803,6 +1800,44 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, return ret ?: nbytes; } +/* + * resctrl_bmec_files_show() — Controls the visibility of BMEC-related resctrl + * files. When @show is true, the files are displayed; when false, the files + * are hidden. + * Don't treat kernfs_find_and_get failure as an error, since this function may + * be called regardless of whether BMEC is supported or the event is enabled. + */ +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show) +{ + struct kernfs_node *kn_config, *mon_kn = NULL; + char name[32]; + + if (!l3_mon_kn) { + sprintf(name, "%s_MON", r->name); + mon_kn = kernfs_find_and_get(kn_info, name); + if (!mon_kn) + return; + l3_mon_kn = mon_kn; + } + + kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config"); + if (kn_config) { + kernfs_show(kn_config, show); + kernfs_put(kn_config); + } + + kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config"); + if (kn_config) { + kernfs_show(kn_config, show); + kernfs_put(kn_config); + } + + /* Release the reference only if it was acquired */ + if (mon_kn) + kernfs_put(mon_kn); +} + /* rdtgroup information files for one cache resource. 
*/ static struct rftype res_common_files[] = { { @@ -1813,6 +1848,13 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_TOP_INFO, }, { + .name = "mbm_assign_on_mkdir", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_on_mkdir_show, + .write = resctrl_mbm_assign_on_mkdir_write, + }, + { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1827,6 +1869,12 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_MON_INFO, }, { + .name = "available_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_available_mbm_cntrs_show, + }, + { .name = "num_rmids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1841,6 +1889,12 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { + .name = "num_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_num_mbm_cntrs_show, + }, + { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1916,6 +1970,28 @@ static struct rftype res_common_files[] = { .write = mbm_local_bytes_config_write, }, { + .name = "event_filter", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = event_filter_show, + .write = event_filter_write, + }, + { + .name = "mbm_L3_assignments", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_L3_assignments_show, + .write = mbm_L3_assignments_write, + }, + { + .name = "mbm_assign_mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_mode_show, + .write = resctrl_mbm_assign_mode_write, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, + { .name = "cpus", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -2168,10 +2244,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } +static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn) +{ + struct kernfs_node *kn_subdir, *kn_subdir2; + struct mon_evt *mevt; + int ret; + + kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt); + if (IS_ERR(kn_subdir2)) { + ret = PTR_ERR(kn_subdir2); + goto out; + } + + ret = rdtgroup_kn_set_ugid(kn_subdir2); + if (ret) + goto out; + + ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG); + if (ret) + break; + } + +out: + return ret; +} + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; + struct rdt_resource *r; int ret; kn_subdir = kernfs_create_dir(kn_info, name, @@ -2184,8 +2298,25 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return ret; ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); + if (ret) + return ret; + + if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) { + r = priv; + if (r->mon.mbm_cntr_assignable) { + ret = resctrl_mkdir_event_configs(r, kn_subdir); + if (ret) + return ret; + /* + * Hide BMEC related files if mbm_event mode + * is enabled. 
+ */ + if (resctrl_arch_mbm_cntr_assign_enabled(r)) + resctrl_bmec_files_show(r, kn_subdir, false); + } + } + + kernfs_activate(kn_subdir); return ret; } @@ -2608,10 +2739,8 @@ static int rdt_get_tree(struct fs_context *fc) goto out_root; ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } + if (ret) + goto out_schemata_free; ret = closid_init(); if (ret) @@ -2637,6 +2766,8 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_info; + rdtgroup_assign_cntrs(&rdtgroup_default); + ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); if (ret < 0) @@ -2675,15 +2806,16 @@ out_mondata: if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(&rdtgroup_default); kernfs_remove(kn_mongrp); + } out_info: kernfs_remove(kn_info); out_closid_exit: closid_exit(); out_schemata_free: schemata_list_destroy(); -out_ctx: rdt_disable_ctx(); out_root: rdtgroup_destroy_root(); @@ -2822,6 +2954,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + rdtgroup_unassign_cntrs(sentry); free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); @@ -2862,6 +2995,8 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); @@ -2946,6 +3081,7 @@ static void resctrl_fs_teardown(void) return; rmdir_all_sub(); + rdtgroup_unassign_cntrs(&rdtgroup_default); mon_put_kn_priv(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; @@ -3057,10 +3193,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, struct mon_evt *mevt; int ret, domid; - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; domid = do_sum ? 
d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) @@ -3427,9 +3562,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) } rdtgrp->mon.rmid = ret; + rdtgroup_assign_cntrs(rdtgrp); + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); + rdtgroup_unassign_cntrs(rdtgrp); free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -3439,8 +3577,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) { - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(rgrp); free_rmid(rgrp->closid, rgrp->mon.rmid); + } } /* @@ -3716,6 +3856,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; + + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* @@ -3763,6 +3906,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); @@ -4022,9 +4167,14 @@ static void rdtgroup_setup_default(void) static void domain_destroy_mon_state(struct rdt_mon_domain *d) { + int idx; + + kfree(d->cntr_cfg); bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } } void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4050,7 +4200,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -4084,32 +4234,41 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; + size_t tsize = sizeof(*d->mbm_states[0]); + enum resctrl_event_id eventid; + int idx; - if (resctrl_arch_is_llc_occupancy_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_states[idx]) + goto cleanup; } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } + + if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) { + tsize = sizeof(*d->cntr_cfg); + d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL); + if (!d->cntr_cfg) + goto cleanup; } return 0; +cleanup: + bitmap_free(d->rmid_busy_llc); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } + + return -ENOMEM; } int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4144,7 +4303,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) RESCTRL_PICK_ANY_CPU); } - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4219,7 +4378,7 @@ void resctrl_offline_cpu(unsigned int cpu) cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (resctrl_arch_is_llc_occupancy_enabled() && + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu); diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h new file mode 100644 index 000000000000..ee3793e9b1a4 --- /dev/null +++ b/include/asm-generic/thread_info_tif.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_THREAD_INFO_TIF_H_ +#define _ASM_GENERIC_THREAD_INFO_TIF_H_ + +#include <vdso/bits.h> + +/* Bits 16-31 are reserved for architecture specific purposes */ + +#define TIF_NOTIFY_RESUME 0 // callback before returning to user +#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) + +#define TIF_SIGPENDING 1 // signal pending +#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) + +#define TIF_NOTIFY_SIGNAL 2 // signal notifications exist +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) + +#define TIF_MEMDIE 3 // is terminating due to OOM killer +#define _TIF_MEMDIE BIT(TIF_MEMDIE) + +#define TIF_NEED_RESCHED 4 // rescheduling necessary +#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) + +#ifdef HAVE_TIF_NEED_RESCHED_LAZY +# define TIF_NEED_RESCHED_LAZY 5 // Lazy rescheduling needed +# define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) +#endif + +#ifdef HAVE_TIF_POLLING_NRFLAG +# define TIF_POLLING_NRFLAG 
6 // idle is polling for TIF_NEED_RESCHED +# define _TIF_POLLING_NRFLAG BIT(TIF_POLLING_NRFLAG) +#endif + +#define TIF_USER_RETURN_NOTIFY 7 // notify kernel of userspace return +#define _TIF_USER_RETURN_NOTIFY BIT(TIF_USER_RETURN_NOTIFY) + +#define TIF_UPROBE 8 // breakpointed or singlestepping +#define _TIF_UPROBE BIT(TIF_UPROBE) + +#define TIF_PATCH_PENDING 9 // pending live patching update +#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) + +#ifdef HAVE_TIF_RESTORE_SIGMASK +# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() */ +# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) +#endif + +#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index 7fc0b560007d..5c6d9799f4e7 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -4,8 +4,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE - #ifndef __arch_get_vdso_u_time_data static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { @@ -20,8 +18,6 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo } #endif -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ - #ifndef __arch_update_vdso_clock static __always_inline void __arch_update_vdso_clock(struct vdso_clock *vc) { diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index ce6521ad04d1..2eda895f19f5 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -9,9 +9,6 @@ #include <linux/timecounter.h> #include <linux/types.h> -#define ARCH_TIMER_TYPE_CP15 BIT(0) -#define ARCH_TIMER_TYPE_MEM BIT(1) - #define ARCH_TIMER_CTRL_ENABLE (1 << 0) #define ARCH_TIMER_CTRL_IT_MASK (1 << 1) #define ARCH_TIMER_CTRL_IT_STAT (1 << 2) @@ -51,8 +48,6 @@ enum arch_timer_spi_nr { #define ARCH_TIMER_PHYS_ACCESS 0 #define ARCH_TIMER_VIRT_ACCESS 1 -#define ARCH_TIMER_MEM_PHYS_ACCESS 2 -#define ARCH_TIMER_MEM_VIRT_ACCESS 3 #define ARCH_TIMER_MEM_MAX_FRAMES 8 diff --git a/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h b/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h index f315d5a7f5ee..7dd04424afcc 100644 --- a/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h +++ b/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h @@ -20,4 +20,18 @@ #define ASPEED_AST2600_SCU_IC1_LPC_RESET_LO_TO_HI 0 #define ASPEED_AST2600_SCU_IC1_LPC_RESET_HI_TO_LO 1 +#define ASPEED_AST2700_SCU_IC0_PCIE_PERST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC0_PCIE_PERST_HI_TO_LO 2 + +#define ASPEED_AST2700_SCU_IC1_PCIE_RCRST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC1_PCIE_RCRST_HI_TO_LO 2 + +#define ASPEED_AST2700_SCU_IC2_PCIE_PERST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC2_PCIE_PERST_HI_TO_LO 2 +#define ASPEED_AST2700_SCU_IC2_LPC_RESET_LO_TO_HI 1 +#define ASPEED_AST2700_SCU_IC2_LPC_RESET_HI_TO_LO 0 + +#define ASPEED_AST2700_SCU_IC3_LPC_RESET_LO_TO_HI 1 +#define ASPEED_AST2700_SCU_IC3_LPC_RESET_HI_TO_LO 0 + #endif /* _DT_BINDINGS_INTERRUPT_CONTROLLER_ASPEED_SCU_IC_H_ */ diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 0bf7d33a1048..7fcec025c5e0 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -96,6 +96,14 @@ enum cc_attr { * enabled to run SEV-SNP guests. */ CC_ATTR_HOST_SEV_SNP, + + /** + * @CC_ATTR_SNP_SECURE_AVIC: Secure AVIC mode is active. + * + * The host kernel is running with the necessary features enabled + * to run SEV-SNP guests with full Secure AVIC capabilities. 
+ */ + CC_ATTR_SNP_SECURE_AVIC, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM diff --git a/drivers/cdx/controller/bitfield.h b/include/linux/cdx/bitfield.h index 567f8ec47582..567f8ec47582 100644 --- a/drivers/cdx/controller/bitfield.h +++ b/include/linux/cdx/bitfield.h diff --git a/include/linux/cdx/edac_cdx_pcol.h b/include/linux/cdx/edac_cdx_pcol.h new file mode 100644 index 000000000000..749db33bb482 --- /dev/null +++ b/include/linux/cdx/edac_cdx_pcol.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Driver for AMD network controllers and boards + * + * Copyright (C) 2021, Xilinx, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef MC_CDX_PCOL_H +#define MC_CDX_PCOL_H +#include <linux/cdx/mcdi.h> + +#define MC_CMD_EDAC_GET_DDR_CONFIG_OUT_WORD_LENGTH_LEN 4 +/* Number of registers for the DDR controller */ +#define MC_CMD_GET_DDR_CONFIG_OFST 4 +#define MC_CMD_GET_DDR_CONFIG_LEN 4 + +/***********************************/ +/* MC_CMD_EDAC_GET_DDR_CONFIG + * Provides detailed configuration for the DDR controller of the given index. + */ +#define MC_CMD_EDAC_GET_DDR_CONFIG 0x3 + +/* MC_CMD_EDAC_GET_DDR_CONFIG_IN msgrequest */ +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_OFST 0 +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_LEN 4 + +#endif /* MC_CDX_PCOL_H */ diff --git a/drivers/cdx/controller/mcdi.h b/include/linux/cdx/mcdi.h index 54a65e9760ae..74075305cba4 100644 --- a/drivers/cdx/controller/mcdi.h +++ b/include/linux/cdx/mcdi.h @@ -11,16 +11,7 @@ #include <linux/kref.h> #include <linux/rpmsg.h> -#include "bitfield.h" -#include "mc_cdx_pcol.h" - -#ifdef DEBUG -#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) -#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) -#else -#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) -#define CDX_WARN_ON_PARANOID(x) do {} while (0) -#endif +#include "linux/cdx/bitfield.h" /** * enum cdx_mcdi_mode - MCDI transaction mode @@ -36,8 +27,6 @@ enum cdx_mcdi_mode { #define MCDI_RPC_LONG_TIMEOU (60 * HZ) #define MCDI_RPC_POST_RST_TIME (10 * HZ) -#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) - /** * enum cdx_mcdi_cmd_state - State for an individual MCDI command * @MCDI_STATE_QUEUED: Command not started and is waiting to run. @@ -180,24 +169,12 @@ struct cdx_mcdi_data { u32 fn_flags; }; -static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) -{ - return cdx->mcdi ? 
&cdx->mcdi->iface : NULL; -} - -int cdx_mcdi_init(struct cdx_mcdi *cdx); void cdx_mcdi_finish(struct cdx_mcdi *cdx); - +int cdx_mcdi_init(struct cdx_mcdi *cdx); void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, const struct cdx_dword *inbuf, size_t inlen, struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); -int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, - const struct cdx_dword *inbuf, size_t inlen, - cdx_mcdi_async_completer *complete, - unsigned long cookie); -int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, - unsigned int timeout_jiffies); /* * We expect that 16- and 32-bit fields in MCDI requests and responses @@ -215,28 +192,8 @@ int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, #define _MCDI_DWORD(_buf, _field) \ ((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2)) -#define MCDI_BYTE(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ - *MCDI_PTR(_buf, _field)) -#define MCDI_WORD(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ - le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) #define MCDI_SET_DWORD(_buf, _field, _value) \ CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value) #define MCDI_DWORD(_buf, _field) \ CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD) -#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ - CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ - MC_CMD_ ## _name1, _value1) -#define MCDI_SET_QWORD(_buf, _field, _value) \ - do { \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ - CDX_DWORD, (u32)(_value)); \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ - CDX_DWORD, (u64)(_value) >> 32); \ - } while (0) -#define MCDI_QWORD(_buf, _field) \ - (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ - (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) - #endif /* CDX_MCDI_H */ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6b93a64115fe..93318fce31f3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -91,6 +91,12 @@ enum { * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * + * Alleviate the contention between fork, exec, exit operations and + * writing to cgroup.procs by taking a per threadgroup rwsem instead of + * the global cgroup_threadgroup_rwsem. Fork and other operations + * from threads in different thread groups no longer contend with + * writing to cgroup.procs. + * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from @@ -140,6 +146,17 @@ enum { __CFTYPE_ADDED = (1 << 18), }; +enum cgroup_attach_lock_mode { + /* Default */ + CGRP_ATTACH_LOCK_GLOBAL, + + /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ + CGRP_ATTACH_LOCK_NONE, + + /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */ + CGRP_ATTACH_LOCK_PER_THREADGROUP, +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can @@ -433,6 +450,23 @@ struct cgroup_freezer_state { * frozen, SIGSTOPped, and PTRACEd. 
*/ int nr_frozen_tasks; + + /* Freeze time data consistency protection */ + seqcount_t freeze_seq; + + /* + * Most recent time the cgroup was requested to freeze. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 freeze_start_nsec; + + /* + * Total duration the cgroup has spent freezing. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 frozen_nsec; }; struct cgroup { @@ -746,7 +780,6 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); - void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); @@ -822,6 +855,7 @@ struct cgroup_subsys { }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +extern bool cgroup_enable_per_threadgroup_rwsem; struct cgroup_of_peak { unsigned long value; @@ -833,11 +867,14 @@ struct cgroup_of_peak { * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes - * using a percpu_rw_semaphore. + * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when + * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition. */ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); + if (cgroup_enable_per_threadgroup_rwsem) + down_read(&tsk->signal->cgroup_threadgroup_rwsem); } /** @@ -848,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { + if (cgroup_enable_per_threadgroup_rwsem) + up_read(&tsk->signal->cgroup_threadgroup_rwsem); percpu_up_read(&cgroup_threadgroup_rwsem); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bab98357960d..7e292bdff2c1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -355,6 +355,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_online(struct cgroup_subsys_state *css) +{ + return css->flags & CSS_ONLINE; +} + static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1ef867bb8c44..2cf1bf65b225 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,14 +154,11 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) -{ - return ktime_sub(timer->node.expires, timer->base->get_time()); -} +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) +static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return timer->base->get_time(); + return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) @@ -200,8 +197,7 @@ __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { - return __hrtimer_expires_remaining_adjusted(timer, - timer->base->get_time()); + return __hrtimer_expires_remaining_adjusted(timer, hrtimer_cb_get_time(timer)); } #ifdef 
CONFIG_TIMERFD @@ -363,7 +359,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { - return hrtimer_forward(timer, timer->base->get_time(), interval); + return hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval); } /* Precise sleep: */ diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 84a5045f80f3..aa49ffa130e5 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -41,7 +41,6 @@ * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers - * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -51,7 +50,6 @@ struct hrtimer_clock_base { seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; - ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d6b606a81ef..c67e76fbcc07 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -669,6 +669,8 @@ extern int irq_chip_set_parent_state(struct irq_data *data, extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); +extern void irq_chip_shutdown_parent(struct irq_data *data); +extern unsigned int irq_chip_startup_parent(struct irq_data *data); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); @@ -976,10 +978,6 @@ static inline void irq_free_desc(unsigned int irq) irq_free_descs(irq, 1); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq); -#endif - /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 91b20788273d..0d1927da8055 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -61,7 +61,7 @@ extern void register_refined_jiffies(long clock_tick_rate); -/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ +/* TICK_USEC is the time between ticks in usec */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ diff --git a/include/linux/msi.h b/include/linux/msi.h index faac634ac230..d415dd15a0a9 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -564,6 +564,8 @@ enum { MSI_FLAG_PARENT_PM_DEV = (1 << 8), /* Support for parent mask/unmask */ MSI_FLAG_PCI_MSI_MASK_PARENT = (1 << 9), + /* Support for parent startup/shutdown */ + MSI_FLAG_PCI_MSI_STARTUP_PARENT = (1 << 10), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ec9d96025683..fd1d91017b99 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,7 +859,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; - atomic_t mmap_count; + refcount_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; @@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct 
pt_regs *regs); extern struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/include/linux/platform_data/dmtimer-omap.h b/include/linux/platform_data/dmtimer-omap.h index 95d852aef130..726d89143842 100644 --- a/include/linux/platform_data/dmtimer-omap.h +++ b/include/linux/platform_data/dmtimer-omap.h @@ -36,9 +36,13 @@ struct omap_dm_timer_ops { int (*set_pwm)(struct omap_dm_timer *timer, int def_on, int toggle, int trigger, int autoreload); int (*get_pwm_status)(struct omap_dm_timer *timer); + int (*set_cap)(struct omap_dm_timer *timer, + int autoreload, bool config_period); + int (*get_cap_status)(struct omap_dm_timer *timer); int (*set_prescaler)(struct omap_dm_timer *timer, int prescaler); unsigned int (*read_counter)(struct omap_dm_timer *timer); + unsigned int (*read_cap)(struct omap_dm_timer *timer, bool is_period); int (*write_counter)(struct omap_dm_timer *timer, unsigned int value); unsigned int (*read_status)(struct omap_dm_timer *timer); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 1fad1c8a4c76..102202185d7a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -372,7 +372,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, /* * Migrate-Disable and why it is undesired. * - * When a preempted task becomes elegible to run under the ideal model (IOW it + * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() @@ -387,7 +387,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be - * in a migrate_disable() section, reducing it's available bandwidth. + * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully @@ -399,7 +399,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, - * all these primitives employ migirate_disable() to restore this implicit + * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. The correct solution is getting @@ -407,7 +407,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need - * a schedulability theory that does not depend on abritrary migration. + * a schedulability theory that does not depend on arbitrary migration. * * * Notes on the implementation. 
@@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * work-conserving schedulers. * */ -extern void migrate_disable(void); -extern void migrate_enable(void); /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section @@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void) DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #ifdef CONFIG_PREEMPT_DYNAMIC diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 1504fb012c05..540abf7de048 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,8 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_SFS_GET_FW_VERSIONS, + PSP_SFS_UPDATE, PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..8f346c847ee5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -24,7 +24,7 @@ #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> -#include <linux/preempt.h> +#include <linux/sched.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 6fb4894b8cfd..a7d92718b653 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -157,27 +157,42 @@ struct rdt_ctrl_domain { }; /** + * struct mbm_cntr_cfg - Assignable counter configuration. + * @evtid: MBM event to which the counter is assigned. Only valid + * if @rdtgroup is not NULL. + * @rdtgrp: resctrl group assigned to the counter. NULL if the + * counter is free. + */ +struct mbm_cntr_cfg { + enum resctrl_event_id evtid; + struct rdtgroup *rdtgrp; +}; + +/** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth + * @mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct mbm_state + * indexed by RMID on x86 or combined CLOSID, RMID on Arm. * @mbm_over: worker to periodically read MBM h/w counters * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters + * @cntr_cfg: array of assignable counters' configuration (indexed + * by counter ID) */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; + struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS]; struct delayed_work mbm_over; struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; + struct mbm_cntr_cfg *cntr_cfg; }; /** @@ -256,39 +271,52 @@ enum resctrl_schema_fmt { }; /** + * struct resctrl_mon - Monitoring related data of a resctrl resource. + * @num_rmid: Number of RMIDs available. + * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth + * monitoring events can be configured. + * @num_mbm_cntrs: Number of assignable counters. 
+ * @mbm_cntr_assignable:Is system capable of supporting counter assignment? + * @mbm_assign_on_mkdir:True if counters should automatically be assigned to MBM + * events of monitor groups created via mkdir. + */ +struct resctrl_mon { + int num_rmid; + unsigned int mbm_cfg_mask; + int num_mbm_cntrs; + bool mbm_cntr_assignable; + bool mbm_assign_on_mkdir; +}; + +/** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine - * @num_rmid: Number of RMIDs available * @ctrl_scope: Scope of this resource for control functions * @mon_scope: Scope of this resource for monitor functions * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. + * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @evt_list: List of monitoring events - * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth - * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { int rid; bool alloc_capable; bool mon_capable; - int num_rmid; enum resctrl_scope ctrl_scope; enum resctrl_scope mon_scope; struct resctrl_cache cache; struct resctrl_membw membw; + struct resctrl_mon mon; struct list_head ctrl_domains; struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - struct list_head evt_list; - unsigned int mbm_cfg_mask; bool cdp_capable; }; @@ -372,8 +400,29 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid); + +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) +{ + return (eventid >= QOS_L3_MBM_TOTAL_EVENT_ID && + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id eventid); + +/* Iterate over all memory bandwidth events */ +#define for_each_mbm_event_id(eventid) \ + for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID; eventid++) + +/* Iterate over memory bandwidth arrays in domain structures */ +#define for_each_mbm_idx(idx) \ + for (idx = 0; idx < QOS_NUM_L3_MBM_EVENTS; idx++) + /** * resctrl_arch_mon_event_config_write() - Write the config for an event. * @config_info: struct resctrl_mon_config_info describing the resource, domain @@ -416,6 +465,26 @@ static inline u32 resctrl_get_config_index(u32 closid, bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +/** + * resctrl_arch_mbm_cntr_assign_enabled() - Check if MBM counter assignment + * mode is enabled. + * @r: Pointer to the resource structure. + * + * Return: + * true if the assignment mode is enabled, false otherwise. + */ +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); + +/** + * resctrl_arch_mbm_cntr_assign_set() - Configure the MBM counter assignment mode. + * @r: Pointer to the resource structure. 
+ * @enable: Set to true to enable, false to disable the assignment mode. + * + * Return: + * 0 on success, < 0 on error. + */ +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. @@ -528,6 +597,63 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * */ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); +/** + * resctrl_arch_config_cntr() - Configure the counter with its new RMID + * and event details. + * @r: Resource structure. + * @d: The domain in which counter with ID @cntr_id should be configured. + * @evtid: Monitoring event type (e.g., QOS_L3_MBM_TOTAL_EVENT_ID + * or QOS_L3_MBM_LOCAL_EVENT_ID). + * @rmid: RMID. + * @closid: CLOSID. + * @cntr_id: Counter ID to configure. + * @assign: True to assign the counter or update an existing assignment, + * false to unassign the counter. + * + * This can be called from any CPU. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign); + +/** + * resctrl_arch_cntr_read() - Read the event data corresponding to the counter ID + * assigned to the RMID, event pair for this resource + * and domain. + * @r: Resource that the counter should be read from. + * @d: Domain that the counter should be read from. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to read. + * @eventid: The MBM event to which @cntr_id is assigned. + * @val: Result of the counter read in bytes. + * + * Called on a CPU that belongs to domain @d when "mbm_event" mode is enabled. + * Called from a non-migrateable process context via smp_call_on_cpu() unless all + * CPUs are nohz_full, in which case it is called via IPI (smp_call_function_any()). + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val); + +/** + * resctrl_arch_reset_cntr() - Reset any private state associated with counter ID. + * @r: The domain's resource. + * @d: The counter ID's domain. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to reset. + * @eventid: The MBM event to which @cntr_id is assigned. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index a25fb9c4070d..acfe07860b34 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,11 +34,18 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) -/* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. - */ +/* Number of memory transactions that an MBM event can be configured with */ +#define NUM_MBM_TRANSACTIONS 7 + +/* Event IDs */ enum resctrl_event_id { + /* Must match value of first event below */ + QOS_FIRST_EVENT = 0x01, + + /* + * These values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. 
+ */ QOS_L3_OCCUP_EVENT_ID = 0x01, QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, @@ -47,4 +54,7 @@ enum resctrl_event_id { QOS_NUM_EVENTS, }; +#define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1) +#define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID) + #endif /* __LINUX_RESCTRL_TYPES_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index a96fd345aa38..69553e7c14c1 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,6 +7,12 @@ #include <linux/preempt.h> #include <linux/sched.h> +#ifdef CONFIG_MEMBARRIER +# define RSEQ_EVENT_GUARD irq +#else +# define RSEQ_EVENT_GUARD preempt +#endif + /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. @@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig, static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); rseq_handle_notify_resume(ksig, regs); } diff --git a/include/linux/sched.h b/include/linux/sched.h index e4ce0a76831e..cbb7340c5866 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,9 @@ #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> +#ifndef COMPILE_OFFSETS +#include <generated/rq-offsets.h> +#endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -881,6 +884,11 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif @@ -2310,4 +2318,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); + +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +/* + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. + */ +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) +#else +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) + +static inline void __migrate_enable(void) +{ + struct task_struct *p = current; + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. 
+ */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; +} + +static inline void __migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif + p->migration_disabled++; + return; + } + + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; +} +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ + +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. + */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static inline void migrate_disable(void) +{ + __migrate_disable(); +} + +static inline void migrate_enable(void) +{ + __migrate_enable(); +} +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ + +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ + +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + #endif diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1ef1edbaaf79..7d6449982822 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -226,6 +226,10 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + struct rw_semaphore cgroup_threadgroup_rwsem; +#endif + /* * Thread is the potential origin of an oom condition; kill first on * oom diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 5263746b63e8..bbcfdf12aa6e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -30,33 +30,24 @@ struct sd_flag_debug { }; extern const struct sd_flag_debug sd_flag_debug[]; +struct sched_domain_topology_level; + #ifdef CONFIG_SCHED_SMT -static inline int cpu_smt_flags(void) -{ - return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; -} +extern int cpu_smt_flags(void); +extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_CLUSTER -static inline int cpu_cluster_flags(void) -{ - return SD_CLUSTER | SD_SHARE_LLC; -} +extern int cpu_cluster_flags(void); +extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_MC -static inline int cpu_core_flags(void) -{ - return SD_SHARE_LLC; -} +extern int cpu_core_flags(void); +extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu); #endif -#ifdef CONFIG_NUMA -static inline int cpu_numa_flags(void) -{ - return SD_NUMA; -} -#endif +extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu); extern int arch_asym_cpu_priority(int cpu); @@ -172,7 +163,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); typedef int (*sched_domain_flags_f)(void); struct sd_data 
{ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 77f45e5d4413..66c06fcdfe19 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/topology.h b/include/linux/topology.h index 33b7fda97d39..6575af39fd10 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -260,7 +260,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu) #endif -static inline const struct cpumask *cpu_cpu_mask(int cpu) +static inline const struct cpumask *cpu_node_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 915303a82d84..ee3d36eda45d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include <linux/wait.h> #include <linux/timer.h> #include <linux/seqlock.h> +#include <linux/mutex.h> struct uprobe; struct vm_area_struct; @@ -185,8 +186,14 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, void *data); + extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -194,7 +201,11 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, + bool is_register); +extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -224,8 +235,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long 
bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h new file mode 100644 index 000000000000..94e51670383c --- /dev/null +++ b/include/uapi/linux/psp-sfs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Userspace interface for AMD Seamless Firmware Servicing (SFS) + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra <ashish.kalra@amd.com> + */ + +#ifndef __PSP_SFS_USER_H__ +#define __PSP_SFS_USER_H__ + +#include <linux/types.h> + +/** + * SFS: AMD Seamless Firmware Support (SFS) interface + */ + +#define PAYLOAD_NAME_SIZE 64 +#define TEE_EXT_CMD_BUFFER_SIZE 4096 + +/** + * struct sfs_user_get_fw_versions - get current level of base firmware (output). + * @blob: current level of base firmware for ASP and patch levels (input/output). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_get_fw_versions { + __u8 blob[TEE_EXT_CMD_BUFFER_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * struct sfs_user_update_package - update SFS package (input). + * @payload_name: name of SFS package to load, verify and execute (input). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_update_package { + char payload_name[PAYLOAD_NAME_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * Seamless Firmware Support (SFS) IOC + * + * possible return codes for all SFS IOCTLs: + * 0: success + * -EINVAL: invalid input + * -E2BIG: excess data passed + * -EFAULT: failed to copy to/from userspace + * -EBUSY: mailbox in recovery or in use + * -ENODEV: driver not bound with PSP device + * -EACCES: request isn't authorized + * -EINVAL: invalid parameter + * -ETIMEDOUT: request timed out + * -EAGAIN: invalid request for state machine + * -ENOENT: not implemented + * -ENFILE: overflow + * -EPERM: invalid signature + * -EIO: PSP I/O error + */ +#define SFS_IOC_TYPE 'S' + +/** + * SFSIOCFWVERS - returns blob containing FW versions + * ASP provides the current level of Base Firmware for the ASP + * and the other microprocessors as well as current patch + * level(s). + */ +#define SFSIOCFWVERS _IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions) + +/** + * SFSIOCUPDATEPKG - updates package/payload + * ASP loads, verifies and executes the SFS package. + * By default, the SFS package/payload is loaded from + * /lib/firmware/amd, but alternative firmware loading + * path can be specified using kernel parameter + * firmware_class.path or the firmware loading path + * can be customized using sysfs file: + * /sys/module/firmware_class/parameters/path. 
+ */ +#define SFSIOCUPDATEPKG _IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package) + +#endif /* __PSP_SFS_USER_H__ */ diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 02533038640e..23c39b96190f 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -31,7 +31,7 @@ struct arch_vdso_time_data {}; #if defined(CONFIG_ARCH_HAS_VDSO_ARCH_DATA) #include <asm/vdso/arch_data.h> -#elif defined(CONFIG_GENERIC_VDSO_DATA_STORE) +#else struct vdso_arch_data { /* Needed for the generic code, never actually used at runtime */ char __unused; @@ -164,7 +164,6 @@ struct vdso_rng_data { * With the hidden visibility, the compiler simply generates a PC-relative * relocation, and this is what we need. */ -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE extern struct vdso_time_data vdso_u_time_data __attribute__((visibility("hidden"))); extern struct vdso_rng_data vdso_u_rng_data __attribute__((visibility("hidden"))); extern struct vdso_arch_data vdso_u_arch_data __attribute__((visibility("hidden"))); @@ -185,8 +184,6 @@ enum vdso_pages { VDSO_NR_PAGES }; -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ - /* * The generic vDSO implementation requires that gettimeofday.h * provides: @@ -196,11 +193,7 @@ enum vdso_pages { * - clock_gettime_fallback(): fallback for clock_gettime. * - clock_getres_fallback(): fallback for clock_getres. */ -#ifdef ENABLE_COMPAT_VDSO -#include <asm/vdso/compat_gettimeofday.h> -#else #include <asm/vdso/gettimeofday.h> -#endif /* ENABLE_COMPAT_VDSO */ #else /* !__ASSEMBLY__ */ diff --git a/include/vdso/gettime.h b/include/vdso/gettime.h index c50d152e7b3e..9ac161866653 100644 --- a/include/vdso/gettime.h +++ b/include/vdso/gettime.h @@ -5,6 +5,7 @@ #include <linux/types.h> struct __kernel_timespec; +struct __kernel_old_timeval; struct timezone; #if !defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) diff --git a/include/vdso/jiffies.h b/include/vdso/jiffies.h index 2f9d596c8b29..8ca04a141412 100644 --- a/include/vdso/jiffies.h +++ b/include/vdso/jiffies.h @@ -5,7 +5,7 @@ #include <asm/param.h> /* for HZ */ #include <vdso/time64.h> -/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ +/* TICK_NSEC is the time between ticks in nsec */ #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ) #endif /* __VDSO_JIFFIES_H */ diff --git a/init/Kconfig b/init/Kconfig index c2b0c9df679a..f3b13463ec26 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1363,7 +1363,7 @@ config UTS_NS config TIME_NS bool "TIME namespace" - depends on GENERIC_VDSO_TIME_NS + depends on GENERIC_GETTIMEOFDAY default y help In this namespace boottime and monotonic clocks can be set. diff --git a/init/init_task.c b/init/init_task.c index e557f622bd90..a55e2189206f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -27,6 +27,9 @@ static struct signal_struct init_signals = { }, .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, +#ifdef CONFIG_CGROUPS + .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem), +#endif .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 54ea59ff8fbe..da326800c1c9 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -103,6 +103,19 @@ config PREEMPT_RT Select this if you are building a kernel for systems which require real-time guarantees. 
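As a rough illustration of how userspace might drive the psp-sfs.h uapi header added above, here is a hedged C sketch that issues SFSIOCFWVERS to fetch the current firmware-version blob. The device node path (/dev/sfs) is an assumption, since the header does not name the character device, and the build presumes kernel headers that ship include/uapi/linux/psp-sfs.h from this series.

```c
/* Hedged sketch: query SFS firmware versions via SFSIOCFWVERS.
 * Assumes a character device at /dev/sfs (hypothetical path) and
 * installed kernel headers providing <linux/psp-sfs.h>.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/psp-sfs.h>

int main(void)
{
	struct sfs_user_get_fw_versions vers;
	int fd, ret;

	fd = open("/dev/sfs", O_RDWR);	/* device node name is an assumption */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&vers, 0, sizeof(vers));
	ret = ioctl(fd, SFSIOCFWVERS, &vers);	/* fills blob[] plus the status words */
	if (ret < 0) {
		perror("SFSIOCFWVERS");
		close(fd);
		return 1;
	}

	printf("sfs_status=0x%x sfs_extended_status=0x%x\n",
	       vers.sfs_status, vers.sfs_extended_status);
	/* vers.blob holds the TEE_EXT_CMD_BUFFER_SIZE-byte version blob;
	 * its internal layout is PSP-defined and not decoded here. */

	close(fd);
	return 0;
}
```

The update path would be analogous: fill struct sfs_user_update_package::payload_name with a package name resolvable via the firmware loader path and issue SFSIOCUPDATEPKG, then inspect sfs_status/sfs_extended_status.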
+config PREEMPT_RT_NEEDS_BH_LOCK + bool "Enforce softirq synchronisation on PREEMPT_RT" + depends on PREEMPT_RT + help + Enforce synchronisation across the softirqs context. On PREEMPT_RT + the softirq is preemptible. This enforces the same per-CPU BLK + semantic non-PREEMPT_RT builds have. This should not be needed + because per-CPU locks were added to avoid the per-CPU BKL. + + This switch provides the old behaviour for testing reasons. Select + this if you suspect an error with preemptible softirq and want test + the old synchronized behaviour. + config PREEMPT_COUNT bool diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa..ec3a57a5fba1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false); if (unlikely(!trace)) @@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9fb1f957a093..8340cecd1b35 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23859,6 +23859,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP +BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b14e61c64a34..22051b4f1ccb 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(bool lock_threadgroup); -void cgroup_attach_unlock(bool lock_threadgroup); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task, bool locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2a4a387f867a..a9e029b570c8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -10,6 +10,7 @@ #include <linux/sched/task.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/vmalloc.h> #include <linux/delayacct.h> #include <linux/pid_namespace.h> @@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); 
for_each_root(root) { struct cgroup *from_cgrp; @@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; @@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return ret; } @@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc) if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, ctx->release_agent); + strscpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } @@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void) * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); + WQ_PERCPU, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a0d5d62f1483..da4eaee52178 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -125,7 +125,7 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. * * A cgroup destruction should enqueue work sequentially to: @@ -240,6 +240,14 @@ static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); +/* + * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, + * read protected by either. + * + * Can only be turned on, but not turned off. + */ +bool cgroup_enable_per_threadgroup_rwsem __read_mostly; + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.__ns_ref = REFCOUNT_INIT(2), @@ -1327,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; - /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + /* + * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition. 
+ * favordynmods can flip while task is between + * cgroup_threadgroup_change_begin() and end(), so down_write global + * cgroup_threadgroup_rwsem to synchronize them. + * + * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding + * cgroup_threadgroup_rwsem doesn't exlude tasks between + * cgroup_thread_group_change_begin() and end() and thus it's unsafe to + * turn off. As the scenario is unlikely, simply disallow disabling once + * enabled and print out a warning. + */ + percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { + cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { + if (cgroup_enable_per_threadgroup_rwsem) + pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } + percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) @@ -2484,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether acquire and acquire which rwsem + * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() @@ -2504,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. + * + * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead + * on dynamic cgroup modifications. see the comment above + * CGRP_ROOT_FAVOR_DYNMODS definition. + * + * tsk is not NULL only when writing to cgroup.procs. 
*/ -void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { cpus_read_lock(); - if (lock_threadgroup) + + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + down_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether release and release which rwsem + * @tsk: thread group to lock */ -void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { - if (lock_threadgroup) + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + up_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } + cpus_read_unlock(); } @@ -2969,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* look up all src csets */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ @@ -2993,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; @@ -3001,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - /* - * If we migrate a single thread, we don't care about threadgroup - * stability. If the thread is `current`, it won't exit(2) under our - * hands or change PID through exec(2). We exclude - * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write - * callers by cgroup_mutex. - * Therefore, we can skip the global lock. - */ - lockdep_assert_held(&cgroup_mutex); - *threadgroup_locked = pid || threadgroup; - cgroup_attach_lock(*threadgroup_locked); - +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } } else { tsk = current; @@ -3035,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } - get_task_struct(tsk); - goto out_unlock_rcu; + rcu_read_unlock(); + + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers + * by cgroup_mutex. Therefore, we can skip the global lock. 
+ */ + lockdep_assert_held(&cgroup_mutex); + + if (pid || threadgroup) { + if (cgroup_enable_per_threadgroup_rwsem) + *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; + else + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + } else { + *lock_mode = CGRP_ATTACH_LOCK_NONE; + } + + cgroup_attach_lock(*lock_mode, tsk); + + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * A race with de_thread from another thread's exec() + * may strip us of our leadership. If this happens, + * throw this task away and try again. + */ + cgroup_attach_unlock(*lock_mode, tsk); + put_task_struct(tsk); + goto retry_find_task; + } + } + + return tsk; -out_unlock_threadgroup: - cgroup_attach_unlock(*threadgroup_locked); - *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { - struct cgroup_subsys *ss; - int ssid; + cgroup_attach_unlock(lock_mode, task); /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - - cgroup_attach_unlock(threadgroup_locked); - - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3113,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; @@ -3144,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - cgroup_attach_lock(has_tasks); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3165,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(has_tasks); + cgroup_attach_unlock(lock_mode, NULL); return ret; } @@ -3788,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. 
*/ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + do_div(freeze_time, NSEC_PER_USEC); + seq_printf(seq, "frozen_usec %llu\n", freeze_time); + + return 0; +} + #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem @@ -5267,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool threadgroup_locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5299,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, threadgroup_locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -5381,6 +5479,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, + { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, @@ -5789,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_init(&cgrp->freezer.freeze_seq); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5797,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } @@ -6352,13 +6457,13 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. 
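cgroup_core_local_stat_show() above emits a single flat key, frozen_usec, into the new cgroup.stat.local file registered for every non-root cgroup. A minimal reader could look like the sketch below; the /sys/fs/cgroup mount point for the cgroup2 hierarchy is an assumption of the example.

```c
/* Hedged sketch: read frozen_usec from a cgroup's cgroup.stat.local.
 * Assumes the cgroup2 hierarchy is mounted at /sys/fs/cgroup.
 * Usage: ./frozen_usec <cgroup-relative-path>
 */
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

int main(int argc, char **argv)
{
	char path[512];
	char key[64];
	uint64_t val;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <cgroup-relative-path>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/sys/fs/cgroup/%s/cgroup.stat.local", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}

	/* Flat-keyed file: one "key value" pair per line. */
	while (fscanf(f, "%63s %" SCNu64, key, &val) == 2) {
		if (!strcmp(key, "frozen_usec"))
			printf("cumulative frozen time: %" PRIu64 " us\n", val);
	}

	fclose(f);
	return 0;
}
```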
*/ - cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); BUG_ON(!cgroup_offline_wq); - cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); + cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); BUG_ON(!cgroup_release_wq); - cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); + cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); BUG_ON(!cgroup_free_wq); return 0; } diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 383963e28ac6..337608f408ce 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -38,7 +38,6 @@ enum prs_errcode { /* bits in struct cpuset flags field */ typedef enum { - CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) @@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cpuset_common_seq_show(struct seq_file *sf, void *v); +void cpuset_full_lock(void); +void cpuset_full_unlock(void); /* * cpuset-v1.c diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index b69a7db67090..12e76774c75b 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } @@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 27adb04df675..52468d2c178a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -40,6 +40,7 @@ #include <linux/sched/isolation.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/task_work.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -131,11 +132,6 @@ static bool force_sd_rebuild; #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 -static inline bool is_prs_invalid(int prs_state) -{ - return prs_state < 0; -} - /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions. 
@@ -159,16 +155,21 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -static inline int is_partition_valid(const struct cpuset *cs) +static inline bool is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } -static inline int is_partition_invalid(const struct cpuset *cs) +static inline bool is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } +static inline bool cs_is_member(const struct cpuset *cs) +{ + return cs->partition_root_state == PRS_MEMBER; +} + /* * Callers should hold callback_lock to modify partition_root_state. */ @@ -207,7 +208,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ static struct cpuset top_cpuset = { - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, @@ -250,6 +251,12 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); +/** + * cpuset_lock - Acquire the global cpuset mutex + * + * This locks the global cpuset mutex to prevent modifications to cpuset + * hierarchy and configurations. This helper is not enough to make modification. + */ void cpuset_lock(void) { mutex_lock(&cpuset_mutex); @@ -260,6 +267,24 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +/** + * cpuset_full_lock - Acquire full protection for cpuset modification + * + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex + * to safely modify cpuset data. + */ +void cpuset_full_lock(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +void cpuset_full_unlock(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) @@ -411,94 +436,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) } /** - * alloc_cpumasks - allocate three cpumasks for cpuset - * @cs: the cpuset that have cpumasks to be allocated. - * @tmp: the tmpmasks structure pointer + * alloc_cpumasks - Allocate an array of cpumask variables + * @pmasks: Pointer to array of cpumask_var_t pointers + * @size: Number of cpumasks to allocate * Return: 0 if successful, -ENOMEM otherwise. * - * Only one of the two input arguments should be non-NULL. + * Allocates @size cpumasks and initializes them to empty. Returns 0 on + * success, -ENOMEM on allocation failure. On failure, any previously + * allocated cpumasks are freed. 
*/ -static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { - cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; + int i; - if (cs) { - pmask1 = &cs->cpus_allowed; - pmask2 = &cs->effective_cpus; - pmask3 = &cs->effective_xcpus; - pmask4 = &cs->exclusive_cpus; - } else { - pmask1 = &tmp->new_cpus; - pmask2 = &tmp->addmask; - pmask3 = &tmp->delmask; - pmask4 = NULL; + for (i = 0; i < size; i++) { + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { + while (--i >= 0) + free_cpumask_var(*pmasks[i]); + return -ENOMEM; + } } - - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) - goto free_one; - - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) - goto free_two; - - if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) - goto free_three; - - return 0; +} + +/** + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. + * @tmp: Pointer to tmpmasks structure to populate + * Return: 0 on success, -ENOMEM on allocation failure + */ +static inline int alloc_tmpmasks(struct tmpmasks *tmp) +{ + /* + * Array of pointers to the three cpumask_var_t fields in tmpmasks. + * Note: Array size must match actual number of masks (3) + */ + cpumask_var_t *pmask[3] = { + &tmp->new_cpus, + &tmp->addmask, + &tmp->delmask + }; -free_three: - free_cpumask_var(*pmask3); -free_two: - free_cpumask_var(*pmask2); -free_one: - free_cpumask_var(*pmask1); - return -ENOMEM; + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** - * free_cpumasks - free cpumasks in a tmpmasks structure - * @cs: the cpuset that have cpumasks to be free. + * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ -static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline void free_tmpmasks(struct tmpmasks *tmp) { - if (cs) { - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->effective_xcpus); - free_cpumask_var(cs->exclusive_cpus); - } - if (tmp) { - free_cpumask_var(tmp->new_cpus); - free_cpumask_var(tmp->addmask); - free_cpumask_var(tmp->delmask); - } + if (!tmp) + return; + + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); } /** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) + * + * Creates a new cpuset by either: + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) + * + * Return: Pointer to newly allocated cpuset on success, NULL on failure */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + /* Allocate base structure */ + trial = cs ? 
kmemdup(cs, sizeof(*cs), GFP_KERNEL) : + kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; - if (alloc_cpumasks(trial, NULL)) { + /* Setup cpumask pointer array */ + cpumask_var_t *pmask[4] = { + &trial->cpus_allowed, + &trial->effective_cpus, + &trial->effective_xcpus, + &trial->exclusive_cpus + }; + + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); - cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); - cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + /* Copy masks if duplicating */ + if (cs) { + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + } + return trial; } @@ -508,7 +543,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static inline void free_cpuset(struct cpuset *cs) { - free_cpumasks(cs, NULL); + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); kfree(cs); } @@ -540,6 +578,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) return true; } +/** + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * Conflict detection rules: + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive + * 2. exclusive_cpus masks cannot intersect between cpusets + * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + */ +static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + /* If either cpuset is exclusive, check if they are mutually exclusive */ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return !cpusets_are_exclusive(cs1, cs2); + + /* Exclusive_cpus cannot intersect */ + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + return true; + + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ + if (!cpumask_empty(cs1->cpus_allowed) && + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) + return true; + + if (!cpumask_empty(cs2->cpus_allowed) && + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + return true; + + return false; +} + +static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); + return false; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -621,38 +700,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - bool txset, cxset; /* Are exclusive_cpus set? 
*/ - if (c == cur) continue; - - txset = !cpumask_empty(trial->exclusive_cpus); - cxset = !cpumask_empty(c->exclusive_cpus); - if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || - (txset && cxset)) { - if (!cpusets_are_exclusive(trial, c)) - goto out; - } else if (txset || cxset) { - struct cpumask *xcpus, *acpus; - - /* - * When just one of the exclusive_cpus's is set, - * cpus_allowed of the other cpuset, if set, cannot be - * a subset of it or none of those CPUs will be - * available if these exclusive CPUs are activated. - */ - if (txset) { - xcpus = trial->exclusive_cpus; - acpus = c->cpus_allowed; - } else { - xcpus = c->exclusive_cpus; - acpus = trial->cpus_allowed; - } - if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) - goto out; - } - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) + if (cpus_excl_conflict(trial, c)) + goto out; + if (mems_excl_conflict(trial, c)) goto out; } @@ -1363,38 +1415,25 @@ bool cpuset_cpu_is_isolated(int cpu) } EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); -/* - * compute_effective_exclusive_cpumask - compute effective exclusive CPUs - * @cs: cpuset - * @xcpus: effective exclusive CPUs value to be set - * @real_cs: the real cpuset (can be NULL) - * Return: 0 if there is no sibling conflict, > 0 otherwise +/** + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets + * @parent: Parent cpuset containing all siblings + * @cs: Current cpuset (will be skipped) + * @excpus: exclusive effective CPU mask to modify * - * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to - * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus - * as well. The provision of real_cs means that a cpumask is being changed and - * the given cs is a trial one. + * This function ensures the given @excpus mask doesn't include any CPUs that + * are exclusively allocated to sibling cpusets. It walks through all siblings + * of @cs under @parent and removes their exclusive CPUs from @excpus. 
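The conflict rules spelled out in the cpus_excl_conflict() comment above reduce to three bitmask checks. The toy program below restates them over plain 64-bit masks purely as an illustration: uint64_t stands in for cpumask_t, and the "mutually exclusive" test for CS_CPU_EXCLUSIVE cpusets is simplified here to a raw intersection check, so this is not the kernel's implementation.

```c
/* Hedged illustration of the three exclusivity rules documented for
 * cpus_excl_conflict(), using uint64_t bitmasks in place of cpumask_t.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_cpuset {
	uint64_t cpus_allowed;
	uint64_t exclusive_cpus;
	bool     cpu_exclusive;	/* CS_CPU_EXCLUSIVE analogue */
};

static bool subset(uint64_t a, uint64_t b)     { return (a & ~b) == 0; }
static bool intersects(uint64_t a, uint64_t b) { return (a & b) != 0; }

static bool toy_cpus_excl_conflict(const struct toy_cpuset *a,
				   const struct toy_cpuset *b)
{
	/* 1. If either cpuset is CPU-exclusive, their CPUs must not overlap
	 *    (simplified stand-in for cpusets_are_exclusive()). */
	if (a->cpu_exclusive || b->cpu_exclusive)
		return intersects(a->cpus_allowed, b->cpus_allowed);

	/* 2. exclusive_cpus masks of siblings may not intersect. */
	if (intersects(a->exclusive_cpus, b->exclusive_cpus))
		return true;

	/* 3. One sibling's cpus_allowed may not be swallowed whole by the
	 *    other's exclusive_cpus, or it would be left with nothing. */
	if (a->cpus_allowed && subset(a->cpus_allowed, b->exclusive_cpus))
		return true;
	if (b->cpus_allowed && subset(b->cpus_allowed, a->exclusive_cpus))
		return true;

	return false;
}

int main(void)
{
	struct toy_cpuset a = { .cpus_allowed = 0x0f, .exclusive_cpus = 0x03 };
	struct toy_cpuset b = { .cpus_allowed = 0x03, .exclusive_cpus = 0x00 };

	/* b's allowed CPUs (0-1) are a subset of a's exclusive CPUs: conflict. */
	printf("conflict: %s\n", toy_cpus_excl_conflict(&a, &b) ? "yes" : "no");
	return 0;
}
```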
*/ -static int compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus, - struct cpuset *real_cs) +static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, + struct cpumask *excpus) { struct cgroup_subsys_state *css; - struct cpuset *parent = parent_cs(cs); struct cpuset *sibling; int retval = 0; - if (!xcpus) - xcpus = cs->effective_xcpus; - - cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); - - if (!real_cs) { - if (!cpumask_empty(cs->exclusive_cpus)) - return 0; - } else { - cs = real_cs; - } + if (cpumask_empty(excpus)) + return retval; /* * Exclude exclusive CPUs from siblings @@ -1404,20 +1443,66 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs, if (sibling == cs) continue; - if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { - cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { + cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); retval++; continue; } - if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { - cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + if (cpumask_intersects(excpus, sibling->effective_xcpus)) { + cpumask_andnot(excpus, excpus, sibling->effective_xcpus); retval++; } } rcu_read_unlock(); + return retval; } +/* + * compute_excpus - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: 0 if there is no sibling conflict, > 0 otherwise + * + * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets + * and exclude their exclusive_cpus or effective_xcpus as well. + */ +static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) +{ + struct cpuset *parent = parent_cs(cs); + + cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); + + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +/* + * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset + * @trialcs: The trial cpuset containing the proposed new configuration + * @cs: The original cpuset that the trial configuration is based on + * Return: 0 if successful with no sibling conflict, >0 if a conflict is found + * + * Computes the effective_xcpus for a trial configuration. @cs is provided to represent + * the real cs. + */ +static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(trialcs); + struct cpumask *excpus = trialcs->effective_xcpus; + + /* trialcs is member, cpuset.cpus has no impact to excpus */ + if (cs_is_member(cs)) + cpumask_and(excpus, trialcs->exclusive_cpus, + parent->effective_xcpus); + else + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + static inline bool is_remote_partition(struct cpuset *cs) { return !list_empty(&cs->remote_sibling); @@ -1459,7 +1544,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * Note that creating a remote partition with any local partition root * above it or remote partition root underneath it is not allowed. 
*/ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); + compute_excpus(cs, tmp->new_cpus); WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) @@ -1508,7 +1593,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) cs->partition_root_state = PRS_MEMBER; /* effective_xcpus may need to be changed */ - compute_effective_exclusive_cpumask(cs, NULL, NULL); + compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); @@ -1677,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) + if (is_partition_invalid(cs)) return 0; /* @@ -1709,13 +1794,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* - * Need to call compute_effective_exclusive_cpumask() in case + * Need to call compute_excpus() in case * exclusive_cpus not set. Sibling conflict should only happen * if exclusive_cpus isn't set. */ xcpus = tmp->delmask; - if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + if (compute_excpus(cs, xcpus)) WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; /* * Enabling partition root is not allowed if its @@ -1727,11 +1813,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; - /* - * A parent can be left with no CPU as long as there is no - * task directly associated with the parent partition. - */ - if (nocpu) + if (tasks_nocpu_error(parent, cs, xcpus)) return PERR_NOCPUS; /* @@ -1748,7 +1830,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, deleting = true; subparts_delta++; - new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus @@ -1788,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * For invalid partition: * delmask = newmask & parent->effective_xcpus */ - if (is_prs_invalid(old_prs)) { + if (is_partition_invalid(cs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); @@ -1837,7 +1918,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. */ - WARN_ON_ONCE(!is_partition_valid(parent)); if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) @@ -1996,7 +2076,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); + compute_excpus(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -2075,7 +2155,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * its value is being processed. 
*/ if (remote && (cp != cs)) { - compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + compute_excpus(cp, tmp->new_cpus); if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2177,7 +2257,7 @@ get_css: cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_effective_exclusive_cpumask(cp, NULL, NULL); + compute_excpus(cp, cp->effective_xcpus); /* * Make sure effective_xcpus is properly set for a valid @@ -2284,82 +2364,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, rcu_read_unlock(); } -/** - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it - * @cs: the cpuset to consider - * @trialcs: trial cpuset - * @buf: buffer of cpu numbers written to this cpuset - */ -static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) +static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) { int retval; - struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; - bool force = false; - int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + retval = cpulist_parse(buf, out_mask); + if (retval < 0) + return retval; + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) + return -EINVAL; - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. - */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - if (cpumask_empty(trialcs->exclusive_cpus)) - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; + return 0; +} - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; +/** + * validate_partition - Validate a cpuset partition configuration + * @cs: The cpuset to validate + * @trialcs: The trial cpuset containing proposed configuration changes + * + * If any validation check fails, the appropriate error code is set in the + * cpuset's prs_err field. + * + * Return: PRS error code (0 if valid, non-zero error code if invalid) + */ +static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) +{ + struct cpuset *parent = parent_cs(cs); - /* - * When exclusive_cpus isn't explicitly set, it is constrained - * by cpus_allowed and parent's effective_xcpus. Otherwise, - * trialcs->effective_xcpus is used as a temporary cpumask - * for checking validity of the partition root. 
- */ - trialcs->partition_root_state = PRS_MEMBER; - if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL, cs); - } + if (cs_is_member(trialcs)) + return PERR_NONE; - /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) - return 0; + if (cpumask_empty(trialcs->effective_xcpus)) + return PERR_INVCPUS; - if (alloc_cpumasks(NULL, &tmp)) - return -ENOMEM; + if (prstate_housekeeping_conflict(trialcs->partition_root_state, + trialcs->effective_xcpus)) + return PERR_HKEEPING; - if (old_prs) { - if (is_partition_valid(cs) && - cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } - } + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) + return PERR_NOCPUS; - /* - * Check all the descendants in update_cpumasks_hier() if - * effective_xcpus is to be changed. - */ - force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + return PERR_NONE; +} + +static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + int retval; + struct cpuset *parent = parent_cs(cs); retval = validate_change(cs, trialcs); @@ -2374,7 +2426,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * partition. However, any conflicting sibling partitions * have to be marked as invalid too. */ - invalidate = true; + trialcs->prs_err = PERR_NOTEXCL; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { struct cpumask *xcpus = user_xcpus(trialcs); @@ -2382,36 +2434,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); rcu_read_lock(); } } rcu_read_unlock(); retval = 0; } + return retval; +} - if (retval < 0) - goto out_free; +/** + * partition_cpus_change - Handle partition state changes due to CPU mask updates + * @cs: The target cpuset being modified + * @trialcs: The trial cpuset containing proposed configuration changes + * @tmp: Temporary masks for intermediate calculations + * + * This function handles partition state transitions triggered by CPU mask changes. + * CPU modifications may cause a partition to be disabled or require state updates. 
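partition_cpus_change() and the helpers around it are ultimately driven by ordinary writes to the cpuset control files. The sketch below shows, end to end, how userspace might restrict a child cgroup's CPUs and request partition-root state; the file names follow the cgroup v2 cpuset interface, while the /sys/fs/cgroup mount point, the "demo" group name, and the CPU list 2-3 are assumptions, and prerequisites such as enabling the cpuset controller in the parent's cgroup.subtree_control are glossed over.

```c
/* Hedged sketch: drive the cpuset update paths from userspace by writing
 * the cgroup v2 cpuset files. Assumes cgroup2 at /sys/fs/cgroup, a writable
 * parent with the cpuset controller enabled, and that CPUs 2-3 are usable.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	const char *grp = "/sys/fs/cgroup/demo";	/* example path, an assumption */
	char path[256];

	if (mkdir(grp, 0755) && errno != EEXIST) {
		perror(grp);
		return 1;
	}

	/* Restrict the group to CPUs 2-3; this write reaches update_cpumask(). */
	snprintf(path, sizeof(path), "%s/cpuset.cpus", grp);
	if (write_file(path, "2-3"))
		return 1;

	/* Request partition-root state; handled via update_prstate(). The
	 * kernel reports the outcome through cpuset.cpus.partition. */
	snprintf(path, sizeof(path), "%s/cpuset.cpus.partition", grp);
	if (write_file(path, "root"))
		return 1;

	puts("partition request submitted; read cpuset.cpus.partition to verify");
	return 0;
}
```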
+ */ +static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + enum prs_errcode prs_err; - if (is_partition_valid(cs) || - (is_partition_invalid(cs) && !invalidate)) { - struct cpumask *xcpus = trialcs->effective_xcpus; + if (cs_is_member(cs)) + return; - if (cpumask_empty(xcpus) && is_partition_invalid(cs)) - xcpus = trialcs->cpus_allowed; + prs_err = validate_partition(cs, trialcs); + if (prs_err) + trialcs->prs_err = cs->prs_err = prs_err; - /* - * Call remote_cpus_update() to handle valid remote partition - */ - if (is_remote_partition(cs)) - remote_cpus_update(cs, NULL, xcpus, &tmp); - else if (invalidate) + if (is_remote_partition(cs)) { + if (trialcs->prs_err) + remote_partition_disable(cs, tmp); + else + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, tmp); + } else { + if (trialcs->prs_err) update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); + NULL, tmp); else update_parent_effective_cpumask(cs, partcmd_update, - xcpus, &tmp); + trialcs->effective_xcpus, tmp); } +} + +/** + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; + + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + return 0; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + compute_trialcs_excpus(trialcs, cs); + trialcs->prs_err = PERR_NONE; + + retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + if (retval < 0) + goto out_free; + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + + partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); @@ -2427,7 +2535,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); out_free: - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return retval; } @@ -2444,33 +2552,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; bool force = false; int old_prs = cs->partition_root_state; - if (!*buf) { - cpumask_clear(trialcs->exclusive_cpus); - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->exclusive_cpus); - if (retval < 0) - return retval; - } + retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (*buf) { - trialcs->partition_root_state = PRS_MEMBER; - /* - * Reject the change if there is exclusive CPUs conflict with - * the siblings. - */ - if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) - return -EINVAL; - } + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. 
+ */ + if (compute_trialcs_excpus(trialcs, cs)) + return -EINVAL; /* * Check all the descendants in update_cpumasks_hier() if @@ -2482,35 +2580,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval) return retval; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_tmpmasks(&tmp)) return -ENOMEM; - if (old_prs) { - if (cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } + trialcs->prs_err = PERR_NONE; + partition_cpus_change(cs, trialcs, &tmp); - if (is_remote_partition(cs)) { - if (invalidate) - remote_partition_disable(cs, &tmp); - else - remote_cpus_update(cs, trialcs->exclusive_cpus, - trialcs->effective_xcpus, &tmp); - } else if (invalidate) { - update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); - } else { - update_parent_effective_cpumask(cs, partcmd_update, - trialcs->effective_xcpus, &tmp); - } - } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); @@ -2530,7 +2605,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return 0; } @@ -2582,9 +2657,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -static void cpuset_post_attach(void) +static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); + kfree(head); +} + +static void schedule_flush_migrate_mm(void) +{ + struct callback_head *flush_cb; + + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + if (!flush_cb) + return; + + init_task_work(flush_cb, flush_migrate_mm_task_workfn); + + if (task_work_add(current, flush_cb, TWA_RESUME)) + kfree(flush_cb); } /* @@ -2750,32 +2840,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, int retval; /* - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. + * The validate_change() call ensures that cpusets with tasks have memory. 
*/ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; + retval = nodelist_parse(buf, trialcs->mems_allowed); + if (retval < 0) + goto done; - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } + if (!nodes_subset(trialcs->mems_allowed, + top_cpuset.mems_allowed)) { + retval = -EINVAL; + goto done; } if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { @@ -2826,7 +2901,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int spread_flag_changed; int err; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; @@ -2884,10 +2959,10 @@ static int update_prstate(struct cpuset *cs, int new_prs) /* * Treat a previously invalid partition root as if it is a "member". */ - if (new_prs && is_prs_invalid(old_prs)) + if (new_prs && is_partition_invalid(cs)) old_prs = PRS_MEMBER; - if (alloc_cpumasks(NULL, &tmpmask)) + if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; err = update_partition_exclusive_flag(cs, new_prs); @@ -2983,7 +3058,7 @@ out: notify_partition_change(cs, old_prs); if (force_sd_rebuild) rebuild_sched_domains_locked(); - free_cpumasks(NULL, &tmpmask); + free_tmpmasks(&tmpmask); return 0; } @@ -3141,6 +3216,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; + bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); @@ -3191,15 +3267,18 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) + if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - else + queue_task_work = true; + } else mmput(mm); } } out: + if (queue_task_work) + schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { @@ -3223,13 +3302,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, struct cpuset *trialcs; int retval = -ENODEV; + /* root is read-only */ + if (cs == &top_cpuset) + return -EACCES; + buf = strstrip(buf); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; @@ -3254,9 +3336,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, if (force_sd_rebuild) rebuild_sched_domains_locked(); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - flush_workqueue(cpuset_migrate_mm_wq); + cpuset_full_unlock(); + if (of_cft(of)->private == FILE_MEMLIST) + schedule_flush_migrate_mm(); return retval ?: nbytes; } @@ -3358,12 +3440,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return retval ?: nbytes; } @@ -3462,15 +3542,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) return &top_cpuset.css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); - if (alloc_cpumasks(cs, NULL)) { - kfree(cs); - return 
ERR_PTR(-ENOMEM); - } - __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; @@ -3493,10 +3568,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - - set_bit(CS_ONLINE, &cs->flags); + cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3548,8 +3620,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return 0; } @@ -3564,17 +3635,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); } /* @@ -3586,16 +3652,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - + cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -3724,7 +3785,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .post_attach = cpuset_post_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, @@ -3928,7 +3988,7 @@ static void cpuset_handle_hotplug(void) bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; lockdep_assert_cpus_held(); @@ -4008,7 +4068,7 @@ static void cpuset_handle_hotplug(void) if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); - free_cpumasks(NULL, ptmp); + free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) @@ -4073,7 +4133,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = task_cs(tsk); if (cs != &top_cpuset) @@ -4095,7 +4154,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) cpumask_copy(pmask, possible_mask); } - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } @@ -4168,9 +4226,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return mask; @@ -4265,10 +4321,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return allowed; diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 80aa3f027ac3..81ea38dd6f9d 100644 --- a/kernel/cgroup/debug.c +++ 
b/kernel/cgroup/debug.c @@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) return -ENODEV; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); @@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, css, css->id); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); cgroup_kn_unlock(of->kn); return 0; @@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) return -ENOMEM; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name_buf); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index bf1690a167dd..6c18854bff34 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. */ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) /* * Do change actual state: freeze or unfreeze. 
*/ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6c83ad674d01..808c0d7a31fa 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr } struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; + /* crosstask is not supported for user stacks */ + if (crosstask && user && !kernel) + return NULL; + entry = get_callchain_entry(&rctx); if (!entry) return NULL; - ctx.entry = entry; - ctx.max_stack = max_stack; - ctx.nr = entry->nr = init_nr; - ctx.contexts = 0; - ctx.contexts_maxed = false; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = 0; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) @@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, perf_callchain_kernel(&ctx, regs); } - if (user) { + if (user && !crosstask) { if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - if (crosstask) + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) goto exit_put; + regs = task_pt_regs(current); + } - if (add_mark) - perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - start_entry_idx = entry->nr; - perf_callchain_user(&ctx, regs); - fixup_uretprobe_trampoline_entries(entry, start_entry_idx); - } + start_entry_idx = entry->nr; + perf_callchain_user(&ctx, regs); + fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } exit_put: diff --git a/kernel/events/core.c b/kernel/events/core.c index 820127536e62..fb1eae762044 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3974,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, */ static inline bool event_update_userpage(struct perf_event *event) { - if (likely(!atomic_read(&event->mmap_count))) + if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); @@ -6710,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); if (mapped) mapped(event, vma->vm_mm); @@ -6749,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) * to avoid complications. 
*/ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6769,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&rb->aux_mutex); } - if (atomic_dec_and_test(&rb->mmap_count)) + if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); @@ -6933,230 +6933,242 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) return err; } -static int perf_mmap(struct file *file, struct vm_area_struct *vma) +static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) { - struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; + unsigned long user_locked, user_lock_limit, locked, lock_limit; struct user_struct *user = current_user(); - struct mutex *aux_mutex = NULL; - struct perf_buffer *rb = NULL; - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra = 0, extra = 0; - int ret, flags = 0; - mapped_f mapped; + + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + /* Increase the limit linearly with more CPUs */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm); /* - * Don't allow mmap() of inherited per-task counters. This would - * create a performance issue due to all children writing to the - * same rb. + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += *user_extra; - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; + if (user_locked > user_lock_limit) { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ + *extra = user_locked - user_lock_limit; + *user_extra -= *extra; + } - ret = security_perf_event_read(event); - if (ret) - return ret; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; - vma_size = vma->vm_end - vma->vm_start; - nr_pages = vma_size / PAGE_SIZE; + return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); +} - if (nr_pages > INT_MAX) - return -ENOMEM; +static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) +{ + struct user_struct *user = current_user(); - if (vma_size != PAGE_SIZE * nr_pages) - return -EINVAL; + atomic_long_add(user_extra, &user->locked_vm); + atomic64_add(extra, &vma->vm_mm->pinned_vm); +} - user_extra = nr_pages; +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + struct perf_buffer *rb; + int rb_flags = 0; - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; + nr_pages -= 1; /* - * This relies on __pmu_detach_event() taking mmap_mutex after marking - * the event REVOKED. Either we observe the state, or __pmu_detach_event() - * will detach the rb created here. 
+ * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. */ - if (event->state <= PERF_EVENT_STATE_REVOKED) { - ret = -ENODEV; - goto unlock; - } - - if (vma->vm_pgoff == 0) { - nr_pages -= 1; - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - goto unlock; - - WARN_ON_ONCE(event->ctx->parent_ctx); + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) - goto unlock; + WARN_ON_ONCE(event->ctx->parent_ctx); - if (atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Success -- managed to mmap() the same buffer - * multiple times. - */ - ret = 0; - /* We need the rb to map pages. */ - rb = event->rb; - goto unlock; - } + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + return -EINVAL; + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the - * event and continue as if !event->rb + * Success -- managed to mmap() the same buffer + * multiple times. */ - ring_buffer_attach(event, NULL); + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + return 0; } - } else { /* - * AUX area mapping: if rb->aux_nr_pages != 0, it's already - * mapped, all subsequent mappings should have the same size - * and offset. Must be above the normal perf buffer. + * Raced against perf_mmap_close()'s + * refcount_dec_and_mutex_lock() remove the + * event and continue as if !event->rb */ - u64 aux_offset, aux_size; + ring_buffer_attach(event, NULL); + } - rb = event->rb; - if (!rb) - goto aux_unlock; + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) + return -EPERM; - aux_mutex = &rb->aux_mutex; - mutex_lock(aux_mutex); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - aux_offset = READ_ONCE(rb->user_page->aux_offset); - aux_size = READ_ONCE(rb->user_page->aux_size); + rb = rb_alloc(nr_pages, + event->attr.watermark ? 
event->attr.wakeup_watermark : 0, + event->cpu, rb_flags); - if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto aux_unlock; + if (!rb) + return -ENOMEM; - if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto aux_unlock; + refcount_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - /* already mapped with a different offset */ - if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto aux_unlock; + ring_buffer_attach(event, rb); - if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) - goto aux_unlock; + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); - /* already mapped with a different size */ - if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto aux_unlock; + perf_mmap_account(vma, user_extra, extra); + refcount_set(&event->mmap_count, 1); - if (!is_power_of_2(nr_pages)) - goto aux_unlock; + return 0; +} - if (!atomic_inc_not_zero(&rb->mmap_count)) - goto aux_unlock; +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + u64 aux_offset, aux_size; + struct perf_buffer *rb; + int ret, rb_flags = 0; - if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); - ret = 0; - goto unlock; - } - } + rb = event->rb; + if (!rb) + return -EINVAL; - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + guard(mutex)(&rb->aux_mutex); /* - * Increase the limit linearly with more CPUs: + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. */ - user_lock_limit *= num_online_cpus(); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); - user_locked = atomic_long_read(&user->locked_vm); + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + return -EINVAL; - /* - * sysctl_perf_event_mlock may have changed, so that - * user->locked_vm > user_lock_limit - */ - if (user_locked > user_lock_limit) - user_locked = user_lock_limit; - user_locked += user_extra; + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + return -EINVAL; - if (user_locked > user_lock_limit) { - /* - * charge locked_vm until it hits user_lock_limit; - * charge the rest from pinned_vm - */ - extra = user_locked - user_lock_limit; - user_extra -= extra; - } + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + return -EINVAL; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; + if (aux_size != nr_pages * PAGE_SIZE) + return -EINVAL; - if ((locked > lock_limit) && perf_is_paranoid() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + return -EINVAL; - WARN_ON(!rb && event->rb); + if (!is_power_of_2(nr_pages)) + return -EINVAL; - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (!refcount_inc_not_zero(&rb->mmap_count)) + return -EINVAL; - if (!rb) { - rb = rb_alloc(nr_pages, - event->attr.watermark ? 
event->attr.wakeup_watermark : 0, - event->cpu, flags); + if (rb_has_aux(rb)) { + refcount_inc(&rb->aux_mmap_count); - if (!rb) { - ret = -ENOMEM; - goto unlock; + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + refcount_dec(&rb->mmap_count); + return -EPERM; } - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; + WARN_ON(!rb && event->rb); - ring_buffer_attach(event, rb); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - perf_event_update_time(event); - perf_event_init_userpage(event); - perf_event_update_userpage(event); - ret = 0; - } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (!ret) { - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; + event->attr.aux_watermark, rb_flags); + if (ret) { + refcount_dec(&rb->mmap_count); + return ret; } + + refcount_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; } -unlock: - if (!ret) { - atomic_long_add(user_extra, &user->locked_vm); - atomic64_add(extra, &vma->vm_mm->pinned_vm); - - atomic_inc(&event->mmap_count); - } else if (rb) { - /* AUX allocation failed */ - atomic_dec(&rb->mmap_count); - } -aux_unlock: - if (aux_mutex) - mutex_unlock(aux_mutex); - mutex_unlock(&event->mmap_mutex); + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + return 0; +} + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long vma_size, nr_pages; + mapped_f mapped; + int ret; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same rb. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + ret = security_perf_event_read(event); if (ret) return ret; + vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + scoped_guard (mutex, &event->mmap_mutex) { + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else + ret = perf_mmap_aux(vma, event, nr_pages); + if (ret) + return ret; + } + /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. @@ -7174,7 +7186,7 @@ aux_unlock: * full cleanup in this case and therefore does not invoke * vmops::close(). */ - ret = map_range(rb, vma); + ret = map_range(event->rb, vma); if (ret) perf_mmap_close(vma); @@ -7440,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (!(current->flags & PF_KTHREAD)) { + } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -8080,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. 
*/ - if (current->mm != NULL) { + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { struct page *p; pagefault_disable(); @@ -8192,7 +8204,8 @@ struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; + bool user = !event->attr.exclude_callchain_user && + !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; @@ -8204,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, 0, kernel, user, + callchain = get_perf_callchain(regs, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } @@ -13249,7 +13262,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { @@ -13262,7 +13275,7 @@ set: goto unlock; /* did we race against perf_mmap_close() */ - if (!atomic_read(&rb->mmap_count)) { + if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 249288d82b8d..d9cc57083091 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,7 +35,7 @@ struct perf_buffer { spinlock_t event_lock; struct list_head event_list; - atomic_t mmap_count; + refcount_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; @@ -47,7 +47,7 @@ struct perf_buffer { unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; - atomic_t aux_mmap_count; + refcount_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index aa9a759e824f..20a905023736 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). 
*/ - if (!atomic_read(&rb->aux_mmap_count)) + if (!refcount_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6d3034ec418d..c8bfff023c6e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { @@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, return identical; } -static int __uprobe_write_opcode(struct vm_area_struct *vma, +static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long opcode_vaddr, uprobe_opcode_t opcode) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(&opcode); + const unsigned long vaddr = insn_vaddr & PAGE_MASK; bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); - copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and @@ -482,23 +483,32 @@ remap: * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * - * Called with mm->mmap_lock held for read or write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. 
*/ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode) + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - int ret, is_register, ref_ctr_updated = 0; + int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; - is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really - * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. */ if (is_register) @@ -521,14 +531,14 @@ retry: goto out; folio = page_folio(page); - ret = verify_opcode(page, opcode_vaddr, &opcode); + ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; } /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); @@ -560,7 +570,7 @@ retry: /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } @@ -580,7 +590,7 @@ retry: out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && ref_ctr_updated) + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? 
-1 : 1); /* try collapse pmd for compound page */ @@ -602,7 +612,7 @@ out: int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** @@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ @@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; } @@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } @@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) @@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. 
*/ @@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; @@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ @@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2741,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; @@ -2752,6 +2779,23 @@ out: rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. diff --git a/kernel/fork.c b/kernel/fork.c index f24f4c71d002..cffa6157a55a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1014,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } @@ -1688,6 +1689,10 @@ static int copy_signal(u64 clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->cgroup_threadgroup_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 4b6da9116aa6..880c9bf2f315 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -39,6 +39,56 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, return 0; } +static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat) +{ +#ifdef CONFIG_COMPAT + if (compat) + return p->compat_robust_list; +#endif + return p->robust_list; +} + +static void __user *futex_get_robust_list_common(int pid, bool compat) +{ + struct task_struct *p = current; + void __user *head; + int ret; + + scoped_guard(rcu) { + if (pid) { + p = find_task_by_vpid(pid); + if (!p) + return (void __user *)ERR_PTR(-ESRCH); + } + get_task_struct(p); + } + + /* + * Hold exec_update_lock to serialize with concurrent exec() + * so ptrace_may_access() is checked against stable credentials + */ + ret = down_read_killable(&p->signal->exec_update_lock); + if (ret) + goto err_put; + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) + goto err_unlock; + + head = futex_task_robust_list(p, compat); + + up_read(&p->signal->exec_update_lock); + put_task_struct(p); + + return head; + +err_unlock: + up_read(&p->signal->exec_update_lock); +err_put: + put_task_struct(p); + return (void __user 
*)ERR_PTR(ret); +} + /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] @@ -49,36 +99,14 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { - struct robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; + struct robust_list_head __user *head = futex_get_robust_list_common(pid, false); - head = p->robust_list; - rcu_read_unlock(); + if (IS_ERR(head)) + return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, @@ -455,36 +483,14 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { - struct compat_robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; + struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true); - head = p->compat_robust_list; - rcu_read_unlock(); + if (IS_ERR(head)) + return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1da5e9d9da71..1b4254d19a73 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -6,10 +6,6 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool -# Legacy support, required for itanic -config GENERIC_IRQ_LEGACY - bool - # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool @@ -147,7 +143,9 @@ config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD config IRQ_KUNIT_TEST bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS depends on KUNIT=y + depends on SPARSE_IRQ default KUNIT_ALL_TESTS + select IRQ_DOMAIN imply SMP help This option enables KUnit tests for the IRQ subsystem API. These are diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0d0276378c70..3ffa0d80ddd1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1260,6 +1260,43 @@ int irq_chip_get_parent_state(struct irq_data *data, EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); /** + * irq_chip_shutdown_parent - Shutdown the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_shutdown() callback of the parent if available or falls + * back to irq_chip_disable_parent(). + */ +void irq_chip_shutdown_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_shutdown) + parent->chip->irq_shutdown(parent); + else + irq_chip_disable_parent(data); +} +EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent); + +/** + * irq_chip_startup_parent - Startup the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_startup() callback of the parent if available or falls + * back to irq_chip_enable_parent(). 
+ */ +unsigned int irq_chip_startup_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_startup) + return parent->chip->irq_startup(parent); + + irq_chip_enable_parent(data); + return 0; +} +EXPORT_SYMBOL_GPL(irq_chip_startup_parent); + +/** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) * @data: Pointer to interrupt specific data diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index eb16a58e0322..b41188698622 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -30,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data) return this->irq == match->irq && this->dev_id == match->dev_id; } -/** - * devm_request_threaded_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @thread_fn: function to be called in a threaded interrupt context. NULL - * for devices which handle everything in @handler - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device, dev_name(dev) if NULL - * @dev_id: A cookie passed back to the handler function - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_threaded_irq(). IRQs requested with this function will be - * automatically freed on driver detach. - * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. - */ -int devm_request_threaded_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, irq_handler_t thread_fn, - unsigned long irqflags, const char *devname, - void *dev_id) +static int devm_request_result(struct device *dev, int rc, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + const char *devname) +{ + if (rc >= 0) + return rc; + + return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n", + irq, handler, thread_fn, devname ? : ""); +} + +static int __devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long irqflags, + const char *devname, void *dev_id) { struct irq_devres *dr; int rc; @@ -78,28 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_threaded_irq); /** - * devm_request_any_context_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device, dev_name(dev) if NULL - * @dev_id: A cookie passed back to the handler function + * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging + * @dev: Device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the interrupt occurs + * @thread_fn: Function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_any_context_irq(). 
IRQs requested with this function will be - * automatically freed on driver detach. + * Except for the extra @dev argument, this function takes the same + * arguments and performs the same function as request_threaded_irq(). + * Interrupts requested with this function will be automatically freed on + * driver detach. + * + * If an interrupt allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + * + * When the request fails, an error message is printed with contextual + * information (device name, interrupt number, handler functions and + * error code). Don't add extra error messages at the call sites. * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. + * Return: 0 on success or a negative error number. */ -int devm_request_any_context_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) +{ + int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn, + irqflags, devname, dev_id); + + return devm_request_result(dev, rc, irq, handler, thread_fn, devname); +} +EXPORT_SYMBOL(devm_request_threaded_irq); + +static int __devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) { struct irq_devres *dr; int rc; @@ -124,6 +137,40 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq, return rc; } + +/** + * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging + * @dev: Device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the interrupt occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the same + * arguments and performs the same function as request_any_context_irq(). + * Interrupts requested with this function will be automatically freed on + * driver detach. + * + * If an interrupt allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + * + * When the request fails, an error message is printed with contextual + * information (device name, interrupt number, handler functions and + * error code). Don't add extra error messages at the call sites. + * + * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error + * number. 
+ */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags, + devname, dev_id); + + return devm_request_result(dev, rc, irq, handler, NULL, devname); +} EXPORT_SYMBOL(devm_request_any_context_irq); /** diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9489f93b3db3..e103451243a0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,6 +136,44 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } +static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled); +static u64 irqhandler_duration_threshold_ns __ro_after_init; + +static int __init irqhandler_duration_check_setup(char *arg) +{ + unsigned long val; + int ret; + + ret = kstrtoul(arg, 0, &val); + if (ret) { + pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret); + return 0; + } + + if (!val) { + pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n"); + return 0; + } + + irqhandler_duration_threshold_ns = val * 1000; + static_branch_enable(&irqhandler_duration_check_enabled); + + return 1; +} +__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup); + +static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq, + const struct irqaction *action) +{ + u64 delta_ns = local_clock() - ts_start; + + if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) { + pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n", + smp_processor_id(), irq, action->handler, + div_u64(delta_ns, NSEC_PER_USEC)); + } +} + irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; @@ -155,7 +193,16 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) lockdep_hardirq_threaded(); trace_irq_handler_entry(irq, action); - res = action->handler(irq, action->dev_id); + + if (static_branch_unlikely(&irqhandler_duration_check_enabled)) { + u64 ts_start = local_clock(); + + res = action->handler(irq, action->dev_id); + irqhandler_duration_check(ts_start, irq, action); + } else { + res = action->handler(irq, action->dev_id); + } + trace_irq_handler_exit(irq, action, res); if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c index a75abebed7f2..e2d31914b3c4 100644 --- a/kernel/irq/irq_test.c +++ b/kernel/irq/irq_test.c @@ -41,21 +41,37 @@ static struct irq_chip fake_irq_chip = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -static void irq_disable_depth_test(struct kunit *test) +static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd) { struct irq_desc *desc; - int virq, ret; + int virq; - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd); KUNIT_ASSERT_GE(test, virq, 0); - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. 
*/ + irq_settings_clr_norequest(desc); + + return virq; +} + +static void irq_disable_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_test_setup_fake_irq(test, NULL); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); @@ -73,16 +89,13 @@ static void irq_free_disabled_test(struct kunit *test) struct irq_desc *desc; int virq, ret; - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, NULL); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); @@ -93,7 +106,7 @@ static void irq_free_disabled_test(struct kunit *test) KUNIT_EXPECT_GE(test, desc->depth, 1); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); free_irq(virq, NULL); @@ -112,10 +125,7 @@ static void irq_shutdown_depth_test(struct kunit *test) if (!IS_ENABLED(CONFIG_SMP)) kunit_skip(test, "requires CONFIG_SMP for managed shutdown"); - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, &affinity); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); @@ -124,7 +134,7 @@ static void irq_shutdown_depth_test(struct kunit *test) KUNIT_ASSERT_PTR_NE(test, data, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); @@ -169,13 +179,12 @@ static void irq_cpuhotplug_test(struct kunit *test) kunit_skip(test, "requires more than 1 CPU for CPU hotplug"); if (!cpu_is_hotpluggable(1)) kunit_skip(test, "CPU 1 must be hotpluggable"); + if (!cpu_online(1)) + kunit_skip(test, "CPU 1 must be online"); cpumask_copy(&affinity.mask, cpumask_of(1)); - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, &affinity); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); @@ -184,7 +193,7 @@ static void irq_cpuhotplug_test(struct kunit *test) KUNIT_ASSERT_PTR_NE(test, data, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); @@ -196,13 +205,9 @@ static void irq_cpuhotplug_test(struct kunit *test) KUNIT_EXPECT_EQ(test, desc->depth, 1); KUNIT_EXPECT_EQ(test, remove_cpu(1), 0); - KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); - KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); KUNIT_EXPECT_GE(test, desc->depth, 1); KUNIT_EXPECT_EQ(test, add_cpu(1), 0); - KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); - KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); KUNIT_EXPECT_EQ(test, desc->depth, 1); enable_irq(virq); diff --git 
a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b64c57b44c20..db714d3014b5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -653,13 +653,6 @@ void irq_mark_irq(unsigned int irq) irq_insert_desc(irq, irq_desc + irq); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq) -{ - free_desc(irq); -} -#endif - #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) diff --git a/kernel/rseq.c b/kernel/rseq.c index b7a1ec327e81..2452b7366b00 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) /* * Load and clear event mask atomically with respect to - * scheduler preemption. + * scheduler preemption and membarrier IPIs. */ - preempt_disable(); - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) { + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + } return !!event_mask; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 523a97f28bfc..198d2dd45f59 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,8 @@ * Copyright (C) 1991-2002 Linus Torvalds * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ +#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE +#include <linux/sched.h> #include <linux/highmem.h> #include <linux/hrtimer_api.h> #include <linux/ktime_api.h> @@ -917,7 +919,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); + rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta); if (rq == this_rq()) __hrtick_restart(rq); @@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) __do_set_cpus_allowed(p, &ac); } -void migrate_disable(void) -{ - struct task_struct *p = current; - - if (p->migration_disabled) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - *Warn about overflow half-way through the range. - */ - WARN_ON_ONCE((s16)p->migration_disabled < 0); -#endif - p->migration_disabled++; - return; - } - - guard(preempt)(); - this_rq()->nr_pinned++; - p->migration_disabled = 1; -} -EXPORT_SYMBOL_GPL(migrate_disable); - -void migrate_enable(void) +void ___migrate_enable(void) { struct task_struct *p = current; struct affinity_context ac = { @@ -2410,35 +2391,19 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Check both overflow from migrate_disable() and superfluous - * migrate_enable(). - */ - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) - return; -#endif + __set_cpus_allowed_ptr(p, &ac); +} +EXPORT_SYMBOL_GPL(___migrate_enable); - if (p->migration_disabled > 1) { - p->migration_disabled--; - return; - } +void migrate_disable(void) +{ + __migrate_disable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); - /* - * Ensure stop_task runs either before or after this, and that - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). - */ - guard(preempt)(); - if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &ac); - /* - * Mustn't clear migration_disabled() until cpus_ptr points back at the - * regular cpus_mask, otherwise things that race (eg. - * select_fallback_rq) get confused. 
- */ - barrier(); - p->migration_disabled = 0; - this_rq()->nr_pinned--; +void migrate_enable(void) +{ + __migrate_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -4490,6 +4455,9 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 72c1f72463c7..615411a0a881 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2551,6 +2551,25 @@ static int find_later_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); + + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); + + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); + + return p; +} + /* Locks the rq it finds */ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) { @@ -2578,12 +2597,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { - if (unlikely(task_rq(task) != rq || + /* + * double_lock_balance had to release rq->lock, in the + * meantime, task may no longer be fit to be migrated. + * Check the following to ensure that the task is + * still suitable for migration: + * 1. It is possible the task was scheduled, + * migrate_disabled was set and then got preempted, + * so we must check the task migration disable + * flag. + * 2. The CPU picked is in the task's affinity. + * 3. For throttled task (dl_task_offline_migration), + * check the following: + * - the task is not on the rq anymore (it was + * migrated) + * - the task is not on CPU anymore + * - the task is still a dl task + * - the task is not queued on the rq anymore + * 4. For the non-throttled task (push_dl_task), the + * check to ensure that this task is still at the + * head of the pushable tasks list is enough. 
+ */ + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !dl_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + (task->dl.dl_throttled && + (task_rq(task) != rq || + task_on_cpu(rq, task) || + !dl_task(task) || + !task_on_rq_queued(task))) || + (!task->dl.dl_throttled && + task != pick_next_pushable_dl_task(rq)))) { + double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -2606,25 +2650,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) return later_rq; } -static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_dl_tasks(rq)) - return NULL; - - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - - WARN_ON_ONCE(rq->cpu != task_cpu(p)); - WARN_ON_ONCE(task_current(rq, p)); - WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - - WARN_ON_ONCE(!task_on_rq_queued(p)); - WARN_ON_ONCE(!dl_task(p)); - - return p; -} - /* * See if the non running -deadline tasks on this rq * can be sent to some other CPU where they can preempt diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 426287930497..3a89f949e307 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se) if (!gcfs_rq || !gcfs_rq->load.weight) return; - if (throttled_hierarchy(gcfs_rq)) - return; - shares = calc_group_shares(gcfs_rq); if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); @@ -5291,18 +5288,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); - } else { + list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); -#endif + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } +#endif } } @@ -5341,8 +5336,6 @@ static void set_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable--; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5363,8 +5356,6 @@ static void clear_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable++; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5392,7 +5383,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * DELAY_DEQUEUE relies on spurious wakeups, special task * states must not suffer spurious wakeups, excempt them. 
*/ - if (flags & DEQUEUE_SPECIAL) + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) delay = false; WARN_ON_ONCE(delay && se->sched_delayed); @@ -5450,8 +5441,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_queued == 0) + if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); +#ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { + struct rq *rq = rq_of(cfs_rq); + + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; + } +#endif + } return true; } @@ -5725,74 +5726,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return cfs_bandwidth_used() && cfs_rq->throttle_count; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); +} + +static inline bool task_is_throttled(struct task_struct *p) +{ + return cfs_bandwidth_used() && p->throttled; +} + +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); +static void throttle_cfs_rq_work(struct callback_head *work) +{ + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); + struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; + + WARN_ON_ONCE(p != current); + p->sched_throttle_work.next = &p->sched_throttle_work; + + /* + * If task is exiting, then there won't be a return to userspace, so we + * don't have to bother with any of this. + */ + if ((p->flags & PF_EXITING)) + return; + + scoped_guard(task_rq_lock, p) { + se = &p->se; + cfs_rq = cfs_rq_of(se); + + /* Raced, forget */ + if (p->sched_class != &fair_sched_class) + return; + + /* + * If not in limbo, then either replenish has happened or this + * task got migrated out of the throttled cfs_rq, move along. + */ + if (!cfs_rq->throttle_count) + return; + rq = scope.rq; + update_rq_clock(rq); + WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + /* + * Must not set throttled before dequeue or dequeue will + * mistakenly regard this task as an already throttled one. + */ + p->throttled = true; + resched_curr(rq); + } +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + /* - * Ensure that neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. + * Task is throttled and someone wants to dequeue it again: + * it could be sched/core when core needs to do things like + * task affinity change, task group change, task sched class + * change etc. and in these cases, DEQUEUE_SLEEP is not set; + * or the task is blocked after throttled due to freezer etc. + * and in these cases, DEQUEUE_SLEEP is set. 
*/ -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static void detach_task_cfs_rq(struct task_struct *p); +static void dequeue_throttled_task(struct task_struct *p, int flags) +{ + WARN_ON_ONCE(p->se.on_rq); + list_del_init(&p->throttle_node); + + /* task blocked after throttled */ + if (flags & DEQUEUE_SLEEP) { + p->throttled = false; + return; + } + + /* + * task is migrating off its old cfs_rq, detach + * the task's load from its old cfs_rq. + */ + if (task_on_rq_migrating(p)) + detach_task_cfs_rq(p); +} + +static bool enqueue_throttled_task(struct task_struct *p) { - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); - src_cfs_rq = tg->cfs_rq[src_cpu]; - dest_cfs_rq = tg->cfs_rq[dest_cpu]; + /* @p should have gone through dequeue_throttled_task() first */ + WARN_ON_ONCE(!list_empty(&p->throttle_node)); + + /* + * If the throttled task @p is enqueued to a throttled cfs_rq, + * take the fast path by directly putting the task on the + * target cfs_rq's limbo list. + * + * Do not do that when @p is current because the following race can + * cause @p's group_node to be incorectly re-insterted in its rq's + * cfs_tasks list, despite being throttled: + * + * cpuX cpuY + * p ret2user + * throttle_cfs_rq_work() sched_move_task(p) + * LOCK task_rq_lock + * dequeue_task_fair(p) + * UNLOCK task_rq_lock + * LOCK task_rq_lock + * task_current_donor(p) == true + * task_on_rq_queued(p) == true + * dequeue_task(p) + * put_prev_task(p) + * sched_change_group() + * enqueue_task(p) -> p's new cfs_rq + * is throttled, go + * fast path and skip + * actual enqueue + * set_next_task(p) + * list_move(&se->group_node, &rq->cfs_tasks); // bug + * schedule() + * + * In the above race case, @p current cfs_rq is in the same rq as + * its previous cfs_rq because sched_move_task() only moves a task + * to a different group from the same rq, so we can use its current + * cfs_rq to derive rq and test if the task is current. + */ + if (throttled_hierarchy(cfs_rq) && + !task_current_donor(rq_of(cfs_rq), p)) { + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + return true; + } - return throttled_hierarchy(src_cfs_rq) || - throttled_hierarchy(dest_cfs_rq); + /* we can't take the fast path, do an actual enqueue*/ + p->throttled = false; + return false; } +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct task_struct *p, *tmp; - cfs_rq->throttle_count--; - if (!cfs_rq->throttle_count) { + if (--cfs_rq->throttle_count) + return 0; + + if (cfs_rq->pelt_clock_throttled) { cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; - if (cfs_rq->throttled_clock_self) { - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + cfs_rq->throttled_clock_self = 0; - cfs_rq->throttled_clock_self = 0; + if (WARN_ON_ONCE((s64)delta < 0)) + delta = 0; - if (WARN_ON_ONCE((s64)delta < 0)) - delta = 0; + cfs_rq->throttled_clock_self_time += delta; + } - cfs_rq->throttled_clock_self_time += delta; - } + /* Re-enqueue the tasks that have been throttled at this level. 
*/ + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_del_init(&p->throttle_node); + p->throttled = false; + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); } + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + return 0; } +static inline bool task_has_throttle_work(struct task_struct *p) +{ + return p->sched_throttle_work.next != &p->sched_throttle_work; +} + +static inline void task_throttle_setup_work(struct task_struct *p) +{ + if (task_has_throttle_work(p)) + return; + + /* + * Kthreads and exiting tasks don't return to userspace, so adding the + * work is pointless + */ + if ((p->flags & (PF_EXITING | PF_KTHREAD))) + return; + + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); +} + +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); - list_del_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttle_count++) + return 0; - WARN_ON_ONCE(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ + if (!cfs_rq->nr_queued) { + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; } - cfs_rq->throttle_count++; + WARN_ON_ONCE(cfs_rq->throttled_clock_self); + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -5800,8 +5980,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta, dequeue = 1; + int dequeue = 1; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5824,76 +6003,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!dequeue) return false; /* Throttle no longer required. */ - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - int flags; - - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - /* - * Abuse SPECIAL to avoid delayed dequeue in this instance. - * This avoids teaching dequeue_entities() about throttled - * entities and keeps things relatively simple. 
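/*
 * For illustration: tg_throttle_down()/tg_unthrottle_up() above rely on
 * throttle_count nesting so that only the outermost 0 -> 1 and 1 -> 0
 * transitions stop or restart the PELT clock and touch the leaf list.
 * A reduced sketch of that counting idiom; 'struct demo_node' and the
 * demo_* functions are hypothetical, not scheduler API.
 */
struct demo_node {
	int throttle_count;
};

static void demo_throttle(struct demo_node *n)
{
	if (n->throttle_count++)
		return;	/* an ancestor already throttled this node */
	/* 0 -> 1 transition: stop the clock exactly once here */
}

static void demo_unthrottle(struct demo_node *n)
{
	if (--n->throttle_count)
		return;	/* still throttled by an ancestor */
	/* 1 -> 0 transition: restart the clock exactly once here */
}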
- */ - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; - if (se->sched_delayed) - flags |= DEQUEUE_DELAYED; - dequeue_entity(qcfs_rq, se, flags); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - - if (qcfs_rq->load.weight) { - /* Avoid re-evaluating load for this entity: */ - se = parent_entity(se); - break; - } - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - } - - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, queued_delta); -done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; WARN_ON_ONCE(cfs_rq->throttled_clock); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5901,9 +6021,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta; - long rq_h_nr_queued = rq->cfs.h_nr_queued; + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + + /* + * It's possible we are called with !runtime_remaining due to things + * like user changed quota setting(see tg_set_cfs_bandwidth()) or async + * unthrottled us with a positive runtime_remaining but other still + * running entities consumed those runtime before we reached here. + * + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining + * because any enqueue in tg_unthrottle_up() will immediately trigger a + * throttle, which is not supposed to happen on unthrottle path. + */ + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) + return; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5933,62 +6064,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (list_add_leaf_cfs_rq(cfs_rq_of(se))) break; } - goto unthrottle_throttle; - } - - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - /* Handle any unfinished DELAY_DEQUEUE business first. 
*/ - if (se->sched_delayed) { - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; - - dequeue_entity(qcfs_rq, se, flags); - } else if (se->on_rq) - break; - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - update_load_avg(qcfs_rq, se, UPDATE_TG); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; } - /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) - dl_server_start(&rq->fair_server); - - /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, queued_delta); - -unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -6472,6 +6549,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6639,19 +6717,28 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void task_throttle_setup_work(struct task_struct *p) {} +static bool task_is_throttled(struct task_struct *p) { return false; } +static void dequeue_throttled_task(struct task_struct *p, int flags) {} +static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; } -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { return 0; } @@ -6831,6 +6918,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; + if (task_is_throttled(p) && enqueue_throttled_task(p)) + return; + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. 
@@ -6883,10 +6973,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; - flags = ENQUEUE_WAKEUP; } @@ -6908,10 +6994,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; } if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { @@ -6941,7 +7023,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) check_update_overutilized_status(rq); -enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -6963,6 +7044,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) bool was_sched_idle = sched_idle_rq(rq); bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; + bool task_throttled = flags & DEQUEUE_THROTTLE; struct task_struct *p = NULL; int h_nr_idle = 0; int h_nr_queued = 0; @@ -6996,9 +7078,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -7010,7 +7091,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + if (task_sleep && se) set_next_buddy(se); break; } @@ -7037,9 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } sub_nr_running(rq, h_nr_queued); @@ -7073,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { + if (task_is_throttled(p)) { + dequeue_throttled_task(p, flags); + return true; + } + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); @@ -8660,7 +8745,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. 
*/ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + if (task_is_throttled(p)) return; if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { @@ -8741,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; struct cfs_rq *cfs_rq; + struct task_struct *p; + bool throttled; again: cfs_rq = &rq->cfs; if (!cfs_rq->nr_queued) return NULL; + throttled = false; + do { /* Might not have done put_prev_entity() */ if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; + throttled |= check_cfs_rq_runtime(cfs_rq); se = pick_next_entity(rq, cfs_rq); if (!se) @@ -8761,7 +8849,10 @@ again: cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + if (unlikely(throttled)) + task_throttle_setup_work(p); + return p; } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); @@ -8923,8 +9014,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - /* throttled hierarchies are not runnable */ - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + /* !se->on_rq also covers throttled task */ + if (!se->on_rq) return false; /* Tell the scheduler that we'd really like se to run next. */ @@ -9283,7 +9374,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or - * 2) throttled_lb_pair, or + * 2) target cfs_rq is in throttled hierarchy, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or * 5) are cache-hot on their current CPU, or @@ -9292,7 +9383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (lb_throttled_hierarchy(p, env->dst_cpu)) return 0; /* @@ -13076,10 +13167,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - return; - - if (!throttled_hierarchy(cfs_rq)) + /* + * If a task gets attached to this cfs_rq and before being queued, + * it gets migrated to another CPU due to reasons like affinity + * change, make sure this cfs_rq stays on leaf cfs_rq list to have + * that removed load decayed or it can cause faireness problem. 
+ */ + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ @@ -13090,10 +13184,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); - if (cfs_rq_throttled(cfs_rq)) - break; - - if (!throttled_hierarchy(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 62c3fa543c0f..f921302dc40f 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { u64 throttled; - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) throttled = U64_MAX; else throttled = cfs_rq->throttled_clock_pelt_time; @@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c new file mode 100644 index 000000000000..a23747bbe25b --- /dev/null +++ b/kernel/sched/rq-offsets.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#define COMPILE_OFFSETS +#include <linux/kbuild.h> +#include <linux/types.h> +#include "sched.h" + +int main(void) +{ + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned)); + + return 0; +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 69458b9b46dc..1f5d07067f60 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -760,10 +760,12 @@ struct cfs_rq { u64 throttled_clock_pelt_time; u64 throttled_clock_self; u64 throttled_clock_self_time; - int throttled; + bool throttled:1; + bool pelt_clock_throttled:1; int throttle_count; struct list_head throttled_list; struct list_head throttled_csd_list; + struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -2367,6 +2369,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SPECIAL 0x10 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_THROTTLE 0x800 #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2683,6 +2686,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_entity(struct sched_dl_entity *dl_se); +extern void init_cfs_throttle_work(struct task_struct *p); + #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6e2f54169e66..444bdfdab731 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd) enum numa_topology_type sched_numa_topology_type; static int sched_domains_numa_levels; -static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; @@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl, int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... 
- */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); + sd_weight = cpumask_weight(tl->mask(tl, cpu)); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); @@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl, }; sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); @@ -1732,22 +1724,63 @@ sd_init(struct sched_domain_topology_level *tl, return sd; } +#ifdef CONFIG_SCHED_SMT +int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; +} + +const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_CLUSTER +int cpu_cluster_flags(void) +{ + return SD_CLUSTER | SD_SHARE_LLC; +} + +const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_MC +int cpu_core_flags(void) +{ + return SD_SHARE_LLC; +} + +const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} +#endif + +const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + /* * Topology list, bottom-up. */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -1768,10 +1801,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) } #ifdef CONFIG_NUMA +static int cpu_numa_flags(void) +{ + return SD_NUMA; +} -static const struct cpumask *sd_numa_mask(int cpu) +static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; } static void sched_numa_warn(const char *str) @@ -2413,7 +2450,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) * breaks the linking done for an earlier span. 
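/*
 * For illustration: with the tl->mask(tl, cpu) signature introduced above,
 * an architecture overriding the default topology provides mask callbacks
 * that take the level pointer instead of relying on the removed
 * sched_domains_curr_level global.  A hedged sketch, assuming SDTL_INIT(),
 * cpu_core_flags() and tl_pkg_mask() are visible to architecture code as
 * in the hunk above; 'demo_*' names are hypothetical.
 */
static const struct cpumask *
demo_llc_mask(struct sched_domain_topology_level *tl, int cpu)
{
	return cpu_coregroup_mask(cpu);
}

static struct sched_domain_topology_level demo_topology[] = {
	SDTL_INIT(demo_llc_mask, cpu_core_flags, MC),
	SDTL_INIT(tl_pkg_mask, NULL, PKG),
	{ NULL, },
};

static void __init demo_set_topology(void)
{
	set_sched_topology(demo_topology);
}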
*/ for_each_cpu(cpu, cpu_map) { - const struct cpumask *tl_cpu_mask = tl->mask(cpu); + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); int id; /* lowest bit set in this mask is used as a unique id */ @@ -2421,7 +2458,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) if (cpumask_test_cpu(id, id_seen)) { /* First CPU has already been seen, ensure identical spans */ - if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) return false; } else { /* First CPU hasn't been seen before, ensure it's a completely new span */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3bbfba30a777..25f62867a16d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -741,6 +741,26 @@ out: } #ifdef SECCOMP_ARCH_NATIVE +static bool seccomp_uprobe_exception(struct seccomp_data *sd) +{ +#if defined __NR_uretprobe || defined __NR_uprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + { +#ifdef __NR_uretprobe + if (sd->nr == __NR_uretprobe) + return true; +#endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif + return false; +} + /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs @@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe -#ifdef SECCOMP_ARCH_COMPAT - if (sd->arch == SECCOMP_ARCH_NATIVE) -#endif - if (sd->nr == __NR_uretprobe) - return true; -#endif + if (seccomp_uprobe_exception(sd)) + return true; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1043,6 +1058,9 @@ static const int mode1_syscalls[] = { #ifdef __NR_uretprobe __NR_uretprobe, #endif +#ifdef __NR_uprobe + __NR_uprobe, +#endif -1, /* negative terminated */ }; diff --git a/kernel/smp.c b/kernel/smp.c index 56f83aa58ec8..02f52291fae4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -884,16 +884,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait - * (atomically) until function has completed on other CPUs. If - * %SCF_RUN_LOCAL is set, the function will also be run locally - * if the local CPU is set in the @cpumask. - * - * If @wait is true, then returns once @func has returned. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. + * + * @func is not called on the local CPU even if @mask contains it. Consider + * using on_each_cpu_cond_mask() instead if this is not desirable. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) diff --git a/kernel/softirq.c b/kernel/softirq.c index 513b1945987c..77198911b8dd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -165,7 +165,11 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) /* First entry of a task into a BH disabled section? 
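/*
 * For illustration of the smp_call_function_many() comment fix above: the
 * function never runs @func on the local CPU, even when it is set in @mask,
 * so a caller that also wants local execution can use
 * on_each_cpu_cond_mask() (a NULL @cond_func should mean "every CPU in
 * @mask").  'demo_ipi_fn' and 'demo_kick_cpus' are hypothetical.
 */
#include <linux/smp.h>

static void demo_ipi_fn(void *info)
{
	/* Runs on each targeted CPU, in IPI (hardirq) context. */
}

static void demo_kick_cpus(const struct cpumask *mask)
{
	/* Remote CPUs in @mask only; the local CPU is skipped. */
	preempt_disable();
	smp_call_function_many(mask, demo_ipi_fn, NULL, true);
	preempt_enable();

	/* Remote CPUs and, if set in @mask, the local CPU as well. */
	on_each_cpu_cond_mask(NULL, demo_ipi_fn, NULL, true, mask);
}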
*/ if (!current->softirq_disable_cnt) { if (preemptible()) { - local_lock(&softirq_ctrl.lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) + local_lock(&softirq_ctrl.lock); + else + migrate_disable(); + /* Required to meet the RCU bottomhalf requirements. */ rcu_read_lock(); } else { @@ -177,17 +181,34 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) * Track the per CPU softirq disabled state. On RT this is per CPU * state to allow preemption of bottom half disabled sections. */ - newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt); - /* - * Reflect the result in the task state to prevent recursion on the - * local lock and to make softirq_count() & al work. - */ - current->softirq_disable_cnt = newcnt; + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + newcnt = this_cpu_add_return(softirq_ctrl.cnt, cnt); + /* + * Reflect the result in the task state to prevent recursion on the + * local lock and to make softirq_count() & al work. + */ + current->softirq_disable_cnt = newcnt; - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { - raw_local_irq_save(flags); - lockdep_softirqs_off(ip); - raw_local_irq_restore(flags); + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } + } else { + bool sirq_dis = false; + + if (!current->softirq_disable_cnt) + sirq_dis = true; + + this_cpu_add(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt += cnt; + WARN_ON_ONCE(current->softirq_disable_cnt < 0); + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_dis) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } } } EXPORT_SYMBOL(__local_bh_disable_ip); @@ -195,23 +216,42 @@ EXPORT_SYMBOL(__local_bh_disable_ip); static void __local_bh_enable(unsigned int cnt, bool unlock) { unsigned long flags; + bool sirq_en = false; int newcnt; - DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != - this_cpu_read(softirq_ctrl.cnt)); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != + this_cpu_read(softirq_ctrl.cnt)); + if (softirq_count() == cnt) + sirq_en = true; + } else { + if (current->softirq_disable_cnt == cnt) + sirq_en = true; + } - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_en) { raw_local_irq_save(flags); lockdep_softirqs_on(_RET_IP_); raw_local_irq_restore(flags); } - newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); - current->softirq_disable_cnt = newcnt; + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + newcnt = this_cpu_sub_return(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt = newcnt; - if (!newcnt && unlock) { - rcu_read_unlock(); - local_unlock(&softirq_ctrl.lock); + if (!newcnt && unlock) { + rcu_read_unlock(); + local_unlock(&softirq_ctrl.lock); + } + } else { + current->softirq_disable_cnt -= cnt; + this_cpu_sub(softirq_ctrl.cnt, cnt); + if (unlock && !current->softirq_disable_cnt) { + migrate_enable(); + rcu_read_unlock(); + } else { + WARN_ON_ONCE(current->softirq_disable_cnt < 0); + } } } @@ -228,7 +268,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) lock_map_release(&bh_lock_map); local_irq_save(flags); - curcnt = __this_cpu_read(softirq_ctrl.cnt); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) + curcnt = this_cpu_read(softirq_ctrl.cnt); + else + curcnt = current->softirq_disable_cnt; /* * If this is not reenabling soft interrupts, no point in trying to 
@@ -805,6 +848,58 @@ static bool tasklet_clear_sched(struct tasklet_struct *t) return false; } +#ifdef CONFIG_PREEMPT_RT +struct tasklet_sync_callback { + spinlock_t cb_lock; + atomic_t cb_waiters; +}; + +static DEFINE_PER_CPU(struct tasklet_sync_callback, tasklet_sync_callback) = { + .cb_lock = __SPIN_LOCK_UNLOCKED(tasklet_sync_callback.cb_lock), + .cb_waiters = ATOMIC_INIT(0), +}; + +static void tasklet_lock_callback(void) +{ + spin_lock(this_cpu_ptr(&tasklet_sync_callback.cb_lock)); +} + +static void tasklet_unlock_callback(void) +{ + spin_unlock(this_cpu_ptr(&tasklet_sync_callback.cb_lock)); +} + +static void tasklet_callback_cancel_wait_running(void) +{ + struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback); + + atomic_inc(&sync_cb->cb_waiters); + spin_lock(&sync_cb->cb_lock); + atomic_dec(&sync_cb->cb_waiters); + spin_unlock(&sync_cb->cb_lock); +} + +static void tasklet_callback_sync_wait_running(void) +{ + struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback); + + if (atomic_read(&sync_cb->cb_waiters)) { + spin_unlock(&sync_cb->cb_lock); + spin_lock(&sync_cb->cb_lock); + } +} + +#else /* !CONFIG_PREEMPT_RT: */ + +static void tasklet_lock_callback(void) { } +static void tasklet_unlock_callback(void) { } +static void tasklet_callback_sync_wait_running(void) { } + +#ifdef CONFIG_SMP +static void tasklet_callback_cancel_wait_running(void) { } +#endif +#endif /* !CONFIG_PREEMPT_RT */ + static void tasklet_action_common(struct tasklet_head *tl_head, unsigned int softirq_nr) { @@ -816,6 +911,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, tl_head->tail = &tl_head->head; local_irq_enable(); + tasklet_lock_callback(); while (list) { struct tasklet_struct *t = list; @@ -835,6 +931,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, } } tasklet_unlock(t); + tasklet_callback_sync_wait_running(); continue; } tasklet_unlock(t); @@ -847,6 +944,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, __raise_softirq_irqoff(softirq_nr); local_irq_enable(); } + tasklet_unlock_callback(); } static __latent_entropy void tasklet_action(void) @@ -897,12 +995,9 @@ void tasklet_unlock_spin_wait(struct tasklet_struct *t) /* * Prevent a live lock when current preempted soft * interrupt processing or prevents ksoftirqd from - * running. If the tasklet runs on a different CPU - * then this has no effect other than doing the BH - * disable/enable dance for nothing. + * running. 
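/*
 * For illustration: the PREEMPT_RT tasklet_sync_callback hunks above use a
 * "waiter counter plus lock cycling" idiom so a canceling context can wait
 * for the callback currently running under the per-CPU cb_lock, instead of
 * the old local_bh_disable()/local_bh_enable() dance.  A reduced sketch of
 * the same idiom; the 'demo_*' names are hypothetical, not kernel API.
 */
#include <linux/spinlock.h>
#include <linux/atomic.h>

static DEFINE_SPINLOCK(demo_cb_lock);
static atomic_t demo_cb_waiters = ATOMIC_INIT(0);

static void demo_cancel_wait_running(void)
{
	/*
	 * Announce the waiter, then briefly take the lock to serialize
	 * against the callback running under demo_cb_lock.
	 */
	atomic_inc(&demo_cb_waiters);
	spin_lock(&demo_cb_lock);
	atomic_dec(&demo_cb_waiters);
	spin_unlock(&demo_cb_lock);
}

static void demo_sync_point(void)
{
	/*
	 * Called by the callback loop with demo_cb_lock held: hand the lock
	 * to any waiter, then re-acquire it and continue.
	 */
	if (atomic_read(&demo_cb_waiters)) {
		spin_unlock(&demo_cb_lock);
		spin_lock(&demo_cb_lock);
	}
}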
*/ - local_bh_disable(); - local_bh_enable(); + tasklet_callback_cancel_wait_running(); } else { cpu_relax(); } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c..bf5d05c635ff 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e6e9b85d4db5..f7d52d9543cc 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o ifeq ($(CONFIG_SMP),y) obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o endif -obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o +obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 577f0e6842d4..069d93bfb0c7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -35,7 +35,7 @@ /** * struct alarm_base - Alarm timer bases - * @lock: Lock for syncrhonized access to the base + * @lock: Lock for synchronized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base * @get_timespec: Function to read the namespace time correlating to the base diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f3e831f62906..a59bc75ab7c5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -633,7 +633,7 @@ void tick_offline_cpu(unsigned int cpu) raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); - tick_shutdown(cpu); + tick_shutdown(); /* * Unregister the clock event devices which were diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0aef0e349e49..a1890a073196 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -144,7 +144,7 @@ static u64 suspend_start; * Default for maximum permissible skew when cs->uncertainty_margin is * not specified, and the lower bound even when cs->uncertainty_margin * is specified. This is also the default that is used when registering - * clocks with unspecifed cs->uncertainty_margin, so this macro is used + * clocks with unspecified cs->uncertainty_margin, so this macro is used * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. 
*/ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) @@ -407,9 +407,8 @@ void clocksource_verify_percpu(struct clocksource *cs) if (!cpumask_empty(&cpus_behind)) pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_behind), testcpu, cs->name); - if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) - pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); + pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); } EXPORT_SYMBOL_GPL(clocksource_verify_percpu); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e8c479329282..88aa062b8a55 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,6 +59,7 @@ #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); /* * The timer bases: @@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, }, .csd = CSD_INIT(retrigger_next_event, NULL) @@ -208,7 +201,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_ /* * The offline local CPU can't be the default target if the * next remote target event is after this timer. Keep the - * elected new base. An IPI will we issued to reprogram + * elected new base. An IPI will be issued to reprogram * it as a last resort. */ if (!hrtimer_base_is_online(this_cpu_base)) @@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); @@ -1574,10 +1567,10 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { switch (clock_id) { - case CLOCK_REALTIME: - return HRTIMER_BASE_REALTIME; case CLOCK_MONOTONIC: return HRTIMER_BASE_MONOTONIC; + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; case CLOCK_BOOTTIME: return HRTIMER_BASE_BOOTTIME; case CLOCK_TAI: @@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) } } +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. 
Using MONOTONIC\n", clock_id); + return ktime_get(); + } +} + +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) +{ + return __hrtimer_cb_get_time(timer->base->clockid); +} +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); + static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 876d389b2e21..7c6110e964e7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk) struct hrtimer *tmr = &tsk->signal->real_timer; if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); + hrtimer_forward_now(tmr, tsk->signal->it_real_incr); hrtimer_restart(tmr); } } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 8b582174b1f9..aa3120104a51 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -299,8 +299,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); hrtimer_restart(timer); } @@ -535,7 +534,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } /* - * After succesful copy out, the timer ID is visible to user space + * After successful copy out, the timer ID is visible to user space * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create @@ -825,7 +824,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) - expires = ktime_add_safe(expires, timer->base->get_time()); + expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); if (!sigev_none) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cc15fe293719..cc1afec306b3 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) return HRTIMER_RESTART; } -void __init -sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pS as sched_clock source\n", read); } +EXPORT_SYMBOL_GPL(sched_clock_register); void __init generic_sched_clock_init(void) { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 9a3859443c04..7e33d3f2e889 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -411,24 +411,18 @@ int tick_cpu_dying(unsigned int dying_cpu) } /* - * Shutdown an event device on a given cpu: + * Shutdown an event device on the outgoing CPU: * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. - * We just set the mode and remove it from the lists. + * Called by the dying CPU during teardown, with clockevents_lock held + * and interrupts disabled. 
*/ -void tick_shutdown(unsigned int cpu) +void tick_shutdown(void) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! - */ - clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index faac36de35b9..4e4f7bbe2a64 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); extern void tick_offline_cpu(unsigned int cpu); -extern void tick_shutdown(unsigned int cpu); +extern void tick_shutdown(void); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b03d0ada6469..488e47e96e93 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); diff --git a/lib/test_objpool.c b/lib/test_objpool.c index 8f688187fa87..6a34a7582fdb 100644 --- a/lib/test_objpool.c +++ b/lib/test_objpool.c @@ -164,7 +164,7 @@ static enum hrtimer_restart ot_hrtimer_handler(struct hrtimer *hrt) /* do bulk-testings for objects pop/push */ item->worker(item, 1); - hrtimer_forward(hrt, hrt->base->get_time(), item->hrtcycle); + hrtimer_forward_now(hrt, item->hrtcycle); return HRTIMER_RESTART; } diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 45df764b49ad..db87ba34ef19 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -12,24 +12,6 @@ config GENERIC_GETTIMEOFDAY Each architecture that enables this feature has to provide the fallback implementation. -config GENERIC_VDSO_32 - bool - depends on GENERIC_GETTIMEOFDAY && !64BIT - help - This config option helps to avoid possible performance issues - in 32 bit only architectures. - -config GENERIC_COMPAT_VDSO - bool - help - This config option enables the compat VDSO layer. - -config GENERIC_VDSO_TIME_NS - bool - help - Selected by architectures which support time namespaces in the - VDSO - config GENERIC_VDSO_OVERFLOW_PROTECT bool help @@ -37,14 +19,9 @@ config GENERIC_VDSO_OVERFLOW_PROTECT time getter functions for the price of an extra conditional in the hotpath. -endif - config VDSO_GETRANDOM bool help Selected by architectures that support vDSO getrandom(). -config GENERIC_VDSO_DATA_STORE - bool - help - Selected by architectures that use the generic vDSO data store. 
+endif diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index aedd40aaa950..405f743253d7 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_GENERIC_VDSO_DATA_STORE) += datastore.o +obj-$(CONFIG_HAVE_GENERIC_VDSO) += datastore.o diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index 3693c6caf2c4..a565c30c71a0 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -11,14 +11,14 @@ /* * The vDSO data page. */ -#ifdef CONFIG_HAVE_GENERIC_VDSO +#ifdef CONFIG_GENERIC_GETTIMEOFDAY static union { struct vdso_time_data data; u8 page[PAGE_SIZE]; } vdso_time_data_store __page_aligned_data; struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data; static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE); -#endif /* CONFIG_HAVE_GENERIC_VDSO */ +#endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #ifdef CONFIG_VDSO_GETRANDOM static union { @@ -46,7 +46,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, switch (vmf->pgoff) { case VDSO_TIME_PAGE_OFFSET: - if (!IS_ENABLED(CONFIG_HAVE_GENERIC_VDSO)) + if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) return VM_FAULT_SIGBUS; pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); if (timens_page) { diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 02ea19f67164..95df0153f05a 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -108,15 +108,11 @@ bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock return true; } -#ifdef CONFIG_TIME_NS - -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE static __always_inline const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) { return (void *)vd + PAGE_SIZE; } -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ static __always_inline bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, @@ -149,20 +145,6 @@ bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock * return true; } -#else -static __always_inline -const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) -{ - return NULL; -} - -static __always_inline -bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) -{ - return false; -} -#endif static __always_inline bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, @@ -204,7 +186,6 @@ bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, return true; } -#ifdef CONFIG_TIME_NS static __always_inline bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, clockid_t clk, struct __kernel_timespec *ts) @@ -233,14 +214,6 @@ bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock return true; } -#else -static __always_inline -bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) -{ - return false; -} -#endif static __always_inline bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, diff --git a/rust/helpers/atomic.c b/rust/helpers/atomic.c new file mode 100644 index 000000000000..cf06b7ef9a1c --- /dev/null +++ b/rust/helpers/atomic.c @@ -0,0 +1,1040 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Generated by scripts/atomic/gen-rust-atomic-helpers.sh +// DO NOT MODIFY THIS FILE DIRECTLY + +/* + * This file provides helpers for the various atomic functions for Rust. 
+ */ +#ifndef _RUST_ATOMIC_API_H +#define _RUST_ATOMIC_API_H + +#include <linux/atomic.h> + +// TODO: Remove this after INLINE_HELPERS support is added. +#ifndef __rust_helper +#define __rust_helper +#endif + +__rust_helper int +rust_helper_atomic_read(const atomic_t *v) +{ + return atomic_read(v); +} + +__rust_helper int +rust_helper_atomic_read_acquire(const atomic_t *v) +{ + return atomic_read_acquire(v); +} + +__rust_helper void +rust_helper_atomic_set(atomic_t *v, int i) +{ + atomic_set(v, i); +} + +__rust_helper void +rust_helper_atomic_set_release(atomic_t *v, int i) +{ + atomic_set_release(v, i); +} + +__rust_helper void +rust_helper_atomic_add(int i, atomic_t *v) +{ + atomic_add(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return(int i, atomic_t *v) +{ + return atomic_add_return(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_acquire(int i, atomic_t *v) +{ + return atomic_add_return_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_release(int i, atomic_t *v) +{ + return atomic_add_return_release(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_relaxed(int i, atomic_t *v) +{ + return atomic_add_return_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add(int i, atomic_t *v) +{ + return atomic_fetch_add(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_acquire(int i, atomic_t *v) +{ + return atomic_fetch_add_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_release(int i, atomic_t *v) +{ + return atomic_fetch_add_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_add_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_sub(int i, atomic_t *v) +{ + atomic_sub(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return(int i, atomic_t *v) +{ + return atomic_sub_return(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_acquire(int i, atomic_t *v) +{ + return atomic_sub_return_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_release(int i, atomic_t *v) +{ + return atomic_sub_return_release(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_relaxed(int i, atomic_t *v) +{ + return atomic_sub_return_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub(int i, atomic_t *v) +{ + return atomic_fetch_sub(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub_acquire(int i, atomic_t *v) +{ + return atomic_fetch_sub_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub_release(int i, atomic_t *v) +{ + return atomic_fetch_sub_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_sub_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_inc(atomic_t *v) +{ + atomic_inc(v); +} + +__rust_helper int +rust_helper_atomic_inc_return(atomic_t *v) +{ + return atomic_inc_return(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_acquire(atomic_t *v) +{ + return atomic_inc_return_acquire(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_release(atomic_t *v) +{ + return atomic_inc_return_release(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_relaxed(atomic_t *v) +{ + return atomic_inc_return_relaxed(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc(atomic_t *v) +{ + return atomic_fetch_inc(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc_acquire(atomic_t *v) +{ + return atomic_fetch_inc_acquire(v); +} + +__rust_helper int 
+rust_helper_atomic_fetch_inc_release(atomic_t *v) +{ + return atomic_fetch_inc_release(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc_relaxed(atomic_t *v) +{ + return atomic_fetch_inc_relaxed(v); +} + +__rust_helper void +rust_helper_atomic_dec(atomic_t *v) +{ + atomic_dec(v); +} + +__rust_helper int +rust_helper_atomic_dec_return(atomic_t *v) +{ + return atomic_dec_return(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_acquire(atomic_t *v) +{ + return atomic_dec_return_acquire(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_release(atomic_t *v) +{ + return atomic_dec_return_release(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_relaxed(atomic_t *v) +{ + return atomic_dec_return_relaxed(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec(atomic_t *v) +{ + return atomic_fetch_dec(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_acquire(atomic_t *v) +{ + return atomic_fetch_dec_acquire(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_release(atomic_t *v) +{ + return atomic_fetch_dec_release(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_relaxed(atomic_t *v) +{ + return atomic_fetch_dec_relaxed(v); +} + +__rust_helper void +rust_helper_atomic_and(int i, atomic_t *v) +{ + atomic_and(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and(int i, atomic_t *v) +{ + return atomic_fetch_and(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_acquire(int i, atomic_t *v) +{ + return atomic_fetch_and_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_release(int i, atomic_t *v) +{ + return atomic_fetch_and_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_and_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_andnot(int i, atomic_t *v) +{ + atomic_andnot(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot(int i, atomic_t *v) +{ + return atomic_fetch_andnot(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_acquire(int i, atomic_t *v) +{ + return atomic_fetch_andnot_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_release(int i, atomic_t *v) +{ + return atomic_fetch_andnot_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_andnot_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_or(int i, atomic_t *v) +{ + atomic_or(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or(int i, atomic_t *v) +{ + return atomic_fetch_or(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_acquire(int i, atomic_t *v) +{ + return atomic_fetch_or_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_release(int i, atomic_t *v) +{ + return atomic_fetch_or_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_or_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_xor(int i, atomic_t *v) +{ + atomic_xor(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor(int i, atomic_t *v) +{ + return atomic_fetch_xor(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_acquire(int i, atomic_t *v) +{ + return atomic_fetch_xor_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_release(int i, atomic_t *v) +{ + return atomic_fetch_xor_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_xor_relaxed(i, v); +} + +__rust_helper 
int +rust_helper_atomic_xchg(atomic_t *v, int new) +{ + return atomic_xchg(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_acquire(atomic_t *v, int new) +{ + return atomic_xchg_acquire(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_release(atomic_t *v, int new) +{ + return atomic_xchg_release(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_relaxed(atomic_t *v, int new) +{ + return atomic_xchg_relaxed(v, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_acquire(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_release(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_release(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_acquire(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_release(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_sub_and_test(int i, atomic_t *v) +{ + return atomic_sub_and_test(i, v); +} + +__rust_helper bool +rust_helper_atomic_dec_and_test(atomic_t *v) +{ + return atomic_dec_and_test(v); +} + +__rust_helper bool +rust_helper_atomic_inc_and_test(atomic_t *v) +{ + return atomic_inc_and_test(v); +} + +__rust_helper bool +rust_helper_atomic_add_negative(int i, atomic_t *v) +{ + return atomic_add_negative(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_acquire(int i, atomic_t *v) +{ + return atomic_add_negative_acquire(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_release(int i, atomic_t *v) +{ + return atomic_add_negative_release(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_relaxed(int i, atomic_t *v) +{ + return atomic_add_negative_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_unless(atomic_t *v, int a, int u) +{ + return atomic_fetch_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic_add_unless(atomic_t *v, int a, int u) +{ + return atomic_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic_inc_not_zero(atomic_t *v) +{ + return atomic_inc_not_zero(v); +} + +__rust_helper bool +rust_helper_atomic_inc_unless_negative(atomic_t *v) +{ + return atomic_inc_unless_negative(v); +} + +__rust_helper bool +rust_helper_atomic_dec_unless_positive(atomic_t *v) +{ + return atomic_dec_unless_positive(v); +} + +__rust_helper int +rust_helper_atomic_dec_if_positive(atomic_t *v) +{ + return atomic_dec_if_positive(v); +} + +__rust_helper s64 +rust_helper_atomic64_read(const atomic64_t *v) +{ + return atomic64_read(v); +} + +__rust_helper s64 +rust_helper_atomic64_read_acquire(const atomic64_t *v) +{ + return atomic64_read_acquire(v); +} + +__rust_helper void +rust_helper_atomic64_set(atomic64_t *v, s64 i) +{ + atomic64_set(v, i); +} + +__rust_helper void +rust_helper_atomic64_set_release(atomic64_t *v, s64 i) +{ + 
atomic64_set_release(v, i); +} + +__rust_helper void +rust_helper_atomic64_add(s64 i, atomic64_t *v) +{ + atomic64_add(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return(s64 i, atomic64_t *v) +{ + return atomic64_add_return(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_acquire(s64 i, atomic64_t *v) +{ + return atomic64_add_return_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_release(s64 i, atomic64_t *v) +{ + return atomic64_add_return_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_add_return_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_sub(s64 i, atomic64_t *v) +{ + atomic64_sub(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return(s64 i, atomic64_t *v) +{ + return atomic64_sub_return(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_acquire(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_release(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_inc(atomic64_t *v) +{ + atomic64_inc(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return(atomic64_t *v) +{ + return atomic64_inc_return(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_acquire(atomic64_t *v) +{ + return atomic64_inc_return_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_release(atomic64_t *v) +{ + return atomic64_inc_return_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_relaxed(atomic64_t *v) +{ + return atomic64_inc_return_relaxed(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc(atomic64_t *v) +{ + return atomic64_fetch_inc(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_acquire(atomic64_t *v) +{ + return atomic64_fetch_inc_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_release(atomic64_t *v) +{ + return atomic64_fetch_inc_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_relaxed(atomic64_t *v) +{ + return atomic64_fetch_inc_relaxed(v); +} + +__rust_helper void +rust_helper_atomic64_dec(atomic64_t *v) +{ + atomic64_dec(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return(atomic64_t *v) +{ + return atomic64_dec_return(v); +} + +__rust_helper s64 
+rust_helper_atomic64_dec_return_acquire(atomic64_t *v) +{ + return atomic64_dec_return_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return_release(atomic64_t *v) +{ + return atomic64_dec_return_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return_relaxed(atomic64_t *v) +{ + return atomic64_dec_return_relaxed(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec(atomic64_t *v) +{ + return atomic64_fetch_dec(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_acquire(atomic64_t *v) +{ + return atomic64_fetch_dec_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_release(atomic64_t *v) +{ + return atomic64_fetch_dec_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_relaxed(atomic64_t *v) +{ + return atomic64_fetch_dec_relaxed(v); +} + +__rust_helper void +rust_helper_atomic64_and(s64 i, atomic64_t *v) +{ + atomic64_and(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_andnot(s64 i, atomic64_t *v) +{ + atomic64_andnot(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_or(s64 i, atomic64_t *v) +{ + atomic64_or(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_xor(s64 i, atomic64_t *v) +{ + atomic64_xor(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_xchg(atomic64_t *v, s64 new) +{ + return atomic64_xchg(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_xchg_acquire(atomic64_t *v, s64 new) +{ + return atomic64_xchg_acquire(v, new); +} + +__rust_helper s64 
+rust_helper_atomic64_xchg_release(atomic64_t *v, s64 new) +{ + return atomic64_xchg_release(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_xchg_relaxed(atomic64_t *v, s64 new) +{ + return atomic64_xchg_relaxed(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_acquire(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_release(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_acquire(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_release(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_sub_and_test(s64 i, atomic64_t *v) +{ + return atomic64_sub_and_test(i, v); +} + +__rust_helper bool +rust_helper_atomic64_dec_and_test(atomic64_t *v) +{ + return atomic64_dec_and_test(v); +} + +__rust_helper bool +rust_helper_atomic64_inc_and_test(atomic64_t *v) +{ + return atomic64_inc_and_test(v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative(s64 i, atomic64_t *v) +{ + return atomic64_add_negative(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_acquire(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_release(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) +{ + return atomic64_fetch_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) +{ + return atomic64_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic64_inc_not_zero(atomic64_t *v) +{ + return atomic64_inc_not_zero(v); +} + +__rust_helper bool +rust_helper_atomic64_inc_unless_negative(atomic64_t *v) +{ + return atomic64_inc_unless_negative(v); +} + +__rust_helper bool +rust_helper_atomic64_dec_unless_positive(atomic64_t *v) +{ + return atomic64_dec_unless_positive(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_if_positive(atomic64_t *v) +{ + return atomic64_dec_if_positive(v); +} + +#endif /* _RUST_ATOMIC_API_H */ +// 615a0e0c98b5973a47fe4fa65e92935051ca00ed diff --git a/rust/helpers/barrier.c b/rust/helpers/barrier.c new file mode 100644 index 000000000000..cdf28ce8e511 --- /dev/null +++ b/rust/helpers/barrier.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/barrier.h> + +void rust_helper_smp_mb(void) +{ + smp_mb(); +} + +void rust_helper_smp_wmb(void) +{ + smp_wmb(); +} + +void rust_helper_smp_rmb(void) +{ + smp_rmb(); +} diff --git 
a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41d..85ad14b81925 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -7,7 +7,9 @@ * Sorted alphabetically. */ +#include "atomic.c" #include "auxiliary.c" +#include "barrier.c" #include "blk.c" #include "bug.c" #include "build_assert.c" diff --git a/rust/helpers/refcount.c b/rust/helpers/refcount.c index d6adbd2e45a1..d175898ad7b8 100644 --- a/rust/helpers/refcount.c +++ b/rust/helpers/refcount.c @@ -7,11 +7,21 @@ refcount_t rust_helper_REFCOUNT_INIT(int n) return (refcount_t)REFCOUNT_INIT(n); } +void rust_helper_refcount_set(refcount_t *r, int n) +{ + refcount_set(r, n); +} + void rust_helper_refcount_inc(refcount_t *r) { refcount_inc(r); } +void rust_helper_refcount_dec(refcount_t *r) +{ + refcount_dec(r); +} + bool rust_helper_refcount_dec_and_test(refcount_t *r) { return refcount_dec_and_test(r); diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs index c2b98f507bcb..c0f95a9419c4 100644 --- a/rust/kernel/block/mq/operations.rs +++ b/rust/kernel/block/mq/operations.rs @@ -10,9 +10,10 @@ use crate::{ block::mq::Request, error::{from_result, Result}, prelude::*, + sync::Refcount, types::ARef, }; -use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering}; +use core::marker::PhantomData; /// Implement this trait to interface blk-mq as block devices. /// @@ -78,7 +79,7 @@ impl<T: Operations> OperationsVTable<T> { let request = unsafe { &*(*bd).rq.cast::<Request<T>>() }; // One refcount for the ARef, one for being in flight - request.wrapper_ref().refcount().store(2, Ordering::Relaxed); + request.wrapper_ref().refcount().set(2); // SAFETY: // - We own a refcount that we took above. We pass that to `ARef`. @@ -187,7 +188,7 @@ impl<T: Operations> OperationsVTable<T> { // SAFETY: The refcount field is allocated but not initialized, so // it is valid for writes. - unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(AtomicU64::new(0)) }; + unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(Refcount::new(0)) }; Ok(0) }) diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs index fefd394f064a..f62a376dc313 100644 --- a/rust/kernel/block/mq/request.rs +++ b/rust/kernel/block/mq/request.rs @@ -8,13 +8,10 @@ use crate::{ bindings, block::mq::Operations, error::Result, + sync::{atomic::Relaxed, Refcount}, types::{ARef, AlwaysRefCounted, Opaque}, }; -use core::{ - marker::PhantomData, - ptr::NonNull, - sync::atomic::{AtomicU64, Ordering}, -}; +use core::{marker::PhantomData, ptr::NonNull}; /// A wrapper around a blk-mq [`struct request`]. This represents an IO request. /// @@ -37,6 +34,9 @@ use core::{ /// We need to track 3 and 4 to ensure that it is safe to end the request and hand /// back ownership to the block layer. /// +/// Note that the driver can still obtain new `ARef` even if there is no `ARef`s in existence by +/// using `tag_to_rq`, hence the need to distinguish B and C. +/// /// The states are tracked through the private `refcount` field of /// `RequestDataWrapper`. This structure lives in the private data area of the C /// [`struct request`]. 
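The refcount.c helpers above back the new `kernel::sync::Refcount` type that the blk-mq conversion in this file switches to. A minimal doctest-style sketch of that API follows; the `Slot` type and its method names are hypothetical, while `Refcount::new()`, `set()`, `inc()`, `dec_and_test()` and `as_atomic()` are the operations exercised by the hunks in this series:

    use kernel::sync::{atomic::Relaxed, Refcount};

    /// Hypothetical wrapper, loosely mirroring how `RequestDataWrapper`
    /// tracks ownership with a private refcount.
    struct Slot {
        refcount: Refcount,
    }

    impl Slot {
        fn new() -> Self {
            // One logical owner to start with.
            Self { refcount: Refcount::new(1) }
        }

        fn get(&self) {
            // Saturating increment; relies on refcount_t's built-in
            // warn-on-misuse semantics instead of the hand-rolled
            // overflow/zero checks removed further down in request.rs.
            self.refcount.inc();
        }

        fn put(&self) -> bool {
            // True once the count drops to zero, as with refcount_dec_and_test().
            self.refcount.dec_and_test()
        }

        fn try_reclaim(&self) -> bool {
            // Escape hatch in the style of `Request::try_set_end()`: a raw
            // cmpxchg on the underlying atomic, which `Refcount` itself does
            // not expose.
            self.refcount.as_atomic().cmpxchg(2, 0, Relaxed).is_ok()
        }
    }

Keeping the cmpxchg escape hatch on the underlying atomic, rather than widening the `Refcount` API, matches the reasoning given in the try_set_end() hunk below.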
@@ -98,13 +98,16 @@ impl<T: Operations> Request<T> { /// /// [`struct request`]: srctree/include/linux/blk-mq.h fn try_set_end(this: ARef<Self>) -> Result<*mut bindings::request, ARef<Self>> { - // We can race with `TagSet::tag_to_rq` - if let Err(_old) = this.wrapper_ref().refcount().compare_exchange( - 2, - 0, - Ordering::Relaxed, - Ordering::Relaxed, - ) { + // To hand back the ownership, we need the current refcount to be 2. + // Since we can race with `TagSet::tag_to_rq`, this needs to atomically reduce + // refcount to 0. `Refcount` does not provide a way to do this, so use the underlying + // atomics directly. + if let Err(_old) = this + .wrapper_ref() + .refcount() + .as_atomic() + .cmpxchg(2, 0, Relaxed) + { return Err(this); } @@ -173,13 +176,13 @@ pub(crate) struct RequestDataWrapper { /// - 0: The request is owned by C block layer. /// - 1: The request is owned by Rust abstractions but there are no [`ARef`] references to it. /// - 2+: There are [`ARef`] references to the request. - refcount: AtomicU64, + refcount: Refcount, } impl RequestDataWrapper { /// Return a reference to the refcount of the request that is embedding /// `self`. - pub(crate) fn refcount(&self) -> &AtomicU64 { + pub(crate) fn refcount(&self) -> &Refcount { &self.refcount } @@ -189,7 +192,7 @@ impl RequestDataWrapper { /// # Safety /// /// - `this` must point to a live allocation of at least the size of `Self`. - pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 { + pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut Refcount { // SAFETY: Because of the safety requirements of this function, the // field projection is safe. unsafe { &raw mut (*this).refcount } @@ -205,47 +208,13 @@ unsafe impl<T: Operations> Send for Request<T> {} // mutate `self` are internally synchronized` unsafe impl<T: Operations> Sync for Request<T> {} -/// Store the result of `op(target.load())` in target, returning new value of -/// target. -fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 { - let old = target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| Some(op(x))); - - // SAFETY: Because the operation passed to `fetch_update` above always - // return `Some`, `old` will always be `Ok`. - let old = unsafe { old.unwrap_unchecked() }; - - op(old) -} - -/// Store the result of `op(target.load)` in `target` if `target.load() != -/// pred`, returning [`true`] if the target was updated. -fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool { - target - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| { - if x == pred { - None - } else { - Some(op(x)) - } - }) - .is_ok() -} - // SAFETY: All instances of `Request<T>` are reference counted. This // implementation of `AlwaysRefCounted` ensure that increments to the ref count // keeps the object alive in memory at least until a matching reference count // decrement is executed. 
unsafe impl<T: Operations> AlwaysRefCounted for Request<T> { fn inc_ref(&self) { - let refcount = &self.wrapper_ref().refcount(); - - #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))] - let updated = atomic_relaxed_op_unless(refcount, |x| x + 1, 0); - - #[cfg(CONFIG_DEBUG_MISC)] - if !updated { - panic!("Request refcount zero on clone") - } + self.wrapper_ref().refcount().inc(); } unsafe fn dec_ref(obj: core::ptr::NonNull<Self>) { @@ -257,10 +226,10 @@ unsafe impl<T: Operations> AlwaysRefCounted for Request<T> { let refcount = unsafe { &*RequestDataWrapper::refcount_ptr(wrapper_ptr) }; #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))] - let new_refcount = atomic_relaxed_op_return(refcount, |x| x - 1); + let is_zero = refcount.dec_and_test(); #[cfg(CONFIG_DEBUG_MISC)] - if new_refcount == 0 { + if is_zero { panic!("Request reached refcount zero in Rust abstractions"); } } diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 00f9b558a3ad..cf5b638a097d 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -11,12 +11,15 @@ use pin_init; mod arc; pub mod aref; +pub mod atomic; +pub mod barrier; pub mod completion; mod condvar; pub mod lock; mod locked_by; pub mod poll; pub mod rcu; +mod refcount; pub use arc::{Arc, ArcBorrow, UniqueArc}; pub use completion::Completion; @@ -25,6 +28,7 @@ pub use lock::global::{global_lock, GlobalGuard, GlobalLock, GlobalLockBackend, pub use lock::mutex::{new_mutex, Mutex, MutexGuard}; pub use lock::spinlock::{new_spinlock, SpinLock, SpinLockGuard}; pub use locked_by::LockedBy; +pub use refcount::Refcount; /// Represents a lockdep class. It's a wrapper around C's `lock_class_key`. #[repr(transparent)] diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs index 63a66761d0c7..9298993ea7d8 100644 --- a/rust/kernel/sync/arc.rs +++ b/rust/kernel/sync/arc.rs @@ -8,7 +8,7 @@ //! threads. //! //! It is different from the standard library's [`Arc`] in a few ways: -//! 1. It is backed by the kernel's `refcount_t` type. +//! 1. It is backed by the kernel's [`Refcount`] type. //! 2. It does not support weak references, which allows it to be half the size. //! 3. It saturates the reference count instead of aborting when it goes over a threshold. //! 4. It does not provide a `get_mut` method, so the ref counted object is pinned. @@ -18,11 +18,11 @@ use crate::{ alloc::{AllocError, Flags, KBox}, - bindings, ffi::c_void, init::InPlaceInit, + sync::Refcount, try_init, - types::{ForeignOwnable, Opaque}, + types::ForeignOwnable, }; use core::{ alloc::Layout, @@ -145,7 +145,7 @@ pub struct Arc<T: ?Sized> { #[pin_data] #[repr(C)] struct ArcInner<T: ?Sized> { - refcount: Opaque<bindings::refcount_t>, + refcount: Refcount, data: T, } @@ -157,7 +157,7 @@ impl<T: ?Sized> ArcInner<T> { /// `ptr` must have been returned by a previous call to [`Arc::into_raw`], and the `Arc` must /// not yet have been destroyed. unsafe fn container_of(ptr: *const T) -> NonNull<ArcInner<T>> { - let refcount_layout = Layout::new::<bindings::refcount_t>(); + let refcount_layout = Layout::new::<Refcount>(); // SAFETY: The caller guarantees that the pointer is valid. let val_layout = Layout::for_value(unsafe { &*ptr }); // SAFETY: We're computing the layout of a real struct that existed when compiling this @@ -229,8 +229,7 @@ impl<T> Arc<T> { pub fn new(contents: T, flags: Flags) -> Result<Self, AllocError> { // INVARIANT: The refcount is initialised to a non-zero value. let value = ArcInner { - // SAFETY: There are no safety requirements for this FFI call. 
- refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }), + refcount: Refcount::new(1), data: contents, }; @@ -321,7 +320,7 @@ impl<T: ?Sized> Arc<T> { /// use kernel::sync::{Arc, UniqueArc}; /// /// let arc = Arc::new(42, GFP_KERNEL)?; - /// let unique_arc = arc.into_unique_or_drop(); + /// let unique_arc = Arc::into_unique_or_drop(arc); /// /// // The above conversion should succeed since refcount of `arc` is 1. /// assert!(unique_arc.is_some()); @@ -337,35 +336,30 @@ impl<T: ?Sized> Arc<T> { /// let arc = Arc::new(42, GFP_KERNEL)?; /// let another = arc.clone(); /// - /// let unique_arc = arc.into_unique_or_drop(); + /// let unique_arc = Arc::into_unique_or_drop(arc); /// /// // The above conversion should fail since refcount of `arc` is >1. /// assert!(unique_arc.is_none()); /// /// # Ok::<(), Error>(()) /// ``` - pub fn into_unique_or_drop(self) -> Option<Pin<UniqueArc<T>>> { + pub fn into_unique_or_drop(this: Self) -> Option<Pin<UniqueArc<T>>> { // We will manually manage the refcount in this method, so we disable the destructor. - let me = ManuallyDrop::new(self); + let this = ManuallyDrop::new(this); // SAFETY: We own a refcount, so the pointer is still valid. - let refcount = unsafe { me.ptr.as_ref() }.refcount.get(); + let refcount = unsafe { &this.ptr.as_ref().refcount }; // If the refcount reaches a non-zero value, then we have destroyed this `Arc` and will // return without further touching the `Arc`. If the refcount reaches zero, then there are // no other arcs, and we can create a `UniqueArc`. - // - // SAFETY: We own a refcount, so the pointer is not dangling. - let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) }; - if is_zero { - // SAFETY: We have exclusive access to the arc, so we can perform unsynchronized - // accesses to the refcount. - unsafe { core::ptr::write(refcount, bindings::REFCOUNT_INIT(1)) }; + if refcount.dec_and_test() { + refcount.set(1); // INVARIANT: We own the only refcount to this arc, so we may create a `UniqueArc`. We // must pin the `UniqueArc` because the values was previously in an `Arc`, and they pin // their values. Some(Pin::from(UniqueArc { - inner: ManuallyDrop::into_inner(me), + inner: ManuallyDrop::into_inner(this), })) } else { None @@ -456,14 +450,10 @@ impl<T: ?Sized> Borrow<T> for Arc<T> { impl<T: ?Sized> Clone for Arc<T> { fn clone(&self) -> Self { - // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is - // safe to dereference it. - let refcount = unsafe { self.ptr.as_ref() }.refcount.get(); - - // INVARIANT: C `refcount_inc` saturates the refcount, so it cannot overflow to zero. + // INVARIANT: `Refcount` saturates the refcount, so it cannot overflow to zero. // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is // safe to increment the refcount. - unsafe { bindings::refcount_inc(refcount) }; + unsafe { self.ptr.as_ref() }.refcount.inc(); // SAFETY: We just incremented the refcount. This increment is now owned by the new `Arc`. unsafe { Self::from_inner(self.ptr) } @@ -472,16 +462,10 @@ impl<T: ?Sized> Clone for Arc<T> { impl<T: ?Sized> Drop for Arc<T> { fn drop(&mut self) { - // SAFETY: By the type invariant, there is necessarily a reference to the object. We cannot - // touch `refcount` after it's decremented to a non-zero value because another thread/CPU - // may concurrently decrement it to zero and free it. It is ok to have a raw pointer to - // freed/invalid memory as long as it is never dereferenced. 
- let refcount = unsafe { self.ptr.as_ref() }.refcount.get(); - // INVARIANT: If the refcount reaches zero, there are no other instances of `Arc`, and // this instance is being dropped, so the broken invariant is not observable. - // SAFETY: Also by the type invariant, we are allowed to decrement the refcount. - let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) }; + // SAFETY: By the type invariant, there is necessarily a reference to the object. + let is_zero = unsafe { self.ptr.as_ref() }.refcount.dec_and_test(); if is_zero { // The count reached zero, we must free the memory. // @@ -775,8 +759,7 @@ impl<T> UniqueArc<T> { // INVARIANT: The refcount is initialised to a non-zero value. let inner = KBox::try_init::<AllocError>( try_init!(ArcInner { - // SAFETY: There are no safety requirements for this FFI call. - refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }), + refcount: Refcount::new(1), data <- pin_init::uninit::<T, AllocError>(), }? AllocError), flags, diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs new file mode 100644 index 000000000000..016a6bcaf080 --- /dev/null +++ b/rust/kernel/sync/atomic.rs @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Atomic primitives. +//! +//! These primitives have the same semantics as their C counterparts: and the precise definitions of +//! semantics can be found at [`LKMM`]. Note that Linux Kernel Memory (Consistency) Model is the +//! only model for Rust code in kernel, and Rust's own atomics should be avoided. +//! +//! # Data races +//! +//! [`LKMM`] atomics have different rules regarding data races: +//! +//! - A normal write from C side is treated as an atomic write if +//! CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=y. +//! - Mixed-size atomic accesses don't cause data races. +//! +//! [`LKMM`]: srctree/tools/memory-model/ + +mod internal; +pub mod ordering; +mod predefine; + +pub use internal::AtomicImpl; +pub use ordering::{Acquire, Full, Relaxed, Release}; + +use crate::build_error; +use internal::{AtomicArithmeticOps, AtomicBasicOps, AtomicExchangeOps, AtomicRepr}; +use ordering::OrderingType; + +/// A memory location which can be safely modified from multiple execution contexts. +/// +/// This has the same size, alignment and bit validity as the underlying type `T`. And it disables +/// niche optimization for the same reason as [`UnsafeCell`]. +/// +/// The atomic operations are implemented in a way that is fully compatible with the [Linux Kernel +/// Memory (Consistency) Model][LKMM], hence they should be modeled as the corresponding +/// [`LKMM`][LKMM] atomic primitives. With the help of [`Atomic::from_ptr()`] and +/// [`Atomic::as_ptr()`], this provides a way to interact with [C-side atomic operations] +/// (including those without the `atomic` prefix, e.g. `READ_ONCE()`, `WRITE_ONCE()`, +/// `smp_load_acquire()` and `smp_store_release()`). +/// +/// # Invariants +/// +/// `self.0` is a valid `T`. +/// +/// [`UnsafeCell`]: core::cell::UnsafeCell +/// [LKMM]: srctree/tools/memory-model/ +/// [C-side atomic operations]: srctree/Documentation/atomic_t.txt +#[repr(transparent)] +pub struct Atomic<T: AtomicType>(AtomicRepr<T::Repr>); + +// SAFETY: `Atomic<T>` is safe to share among execution contexts because all accesses are atomic. +unsafe impl<T: AtomicType> Sync for Atomic<T> {} + +/// Types that support basic atomic operations. 
+/// +/// # Round-trip transmutability +/// +/// `T` is round-trip transmutable to `U` if and only if both of these properties hold: +/// +/// - Any valid bit pattern for `T` is also a valid bit pattern for `U`. +/// - Transmuting (e.g. using [`transmute()`]) a value of type `T` to `U` and then to `T` again +/// yields a value that is in all aspects equivalent to the original value. +/// +/// # Safety +/// +/// - [`Self`] must have the same size and alignment as [`Self::Repr`]. +/// - [`Self`] must be [round-trip transmutable] to [`Self::Repr`]. +/// +/// Note that this is more relaxed than requiring the bi-directional transmutability (i.e. +/// [`transmute()`] is always sound between `U` and `T`) because of the support for atomic +/// variables over unit-only enums, see [Examples]. +/// +/// # Limitations +/// +/// Because C primitives are used to implement the atomic operations, and a C function requires a +/// valid object of a type to operate on (i.e. no `MaybeUninit<_>`), hence at the Rust <-> C +/// surface, only types with all the bits initialized can be passed. As a result, types like `(u8, +/// u16)` (padding bytes are uninitialized) are currently not supported. +/// +/// # Examples +/// +/// A unit-only enum that implements [`AtomicType`]: +/// +/// ``` +/// use kernel::sync::atomic::{AtomicType, Atomic, Relaxed}; +/// +/// #[derive(Clone, Copy, PartialEq, Eq)] +/// #[repr(i32)] +/// enum State { +/// Uninit = 0, +/// Working = 1, +/// Done = 2, +/// }; +/// +/// // SAFETY: `State` and `i32` has the same size and alignment, and it's round-trip +/// // transmutable to `i32`. +/// unsafe impl AtomicType for State { +/// type Repr = i32; +/// } +/// +/// let s = Atomic::new(State::Uninit); +/// +/// assert_eq!(State::Uninit, s.load(Relaxed)); +/// ``` +/// [`transmute()`]: core::mem::transmute +/// [round-trip transmutable]: AtomicType#round-trip-transmutability +/// [Examples]: AtomicType#examples +pub unsafe trait AtomicType: Sized + Send + Copy { + /// The backing atomic implementation type. + type Repr: AtomicImpl; +} + +/// Types that support atomic add operations. +/// +/// # Safety +/// +// TODO: Properly defines `wrapping_add` in the following comment. +/// `wrapping_add` any value of type `Self::Repr::Delta` obtained by [`Self::rhs_into_delta()`] to +/// any value of type `Self::Repr` obtained through transmuting a value of type `Self` to must +/// yield a value with a bit pattern also valid for `Self`. +pub unsafe trait AtomicAdd<Rhs = Self>: AtomicType { + /// Converts `Rhs` into the `Delta` type of the atomic implementation. + fn rhs_into_delta(rhs: Rhs) -> <Self::Repr as AtomicImpl>::Delta; +} + +#[inline(always)] +const fn into_repr<T: AtomicType>(v: T) -> T::Repr { + // SAFETY: Per the safety requirement of `AtomicType`, `T` is round-trip transmutable to + // `T::Repr`, therefore the transmute operation is sound. + unsafe { core::mem::transmute_copy(&v) } +} + +/// # Safety +/// +/// `r` must be a valid bit pattern of `T`. +#[inline(always)] +const unsafe fn from_repr<T: AtomicType>(r: T::Repr) -> T { + // SAFETY: Per the safety requirement of the function, the transmute operation is sound. + unsafe { core::mem::transmute_copy(&r) } +} + +impl<T: AtomicType> Atomic<T> { + /// Creates a new atomic `T`. + pub const fn new(v: T) -> Self { + // INVARIANT: Per the safety requirement of `AtomicType`, `into_repr(v)` is a valid `T`. + Self(AtomicRepr::new(into_repr(v))) + } + + /// Creates a reference to an atomic `T` from a pointer of `T`. 
+ /// + /// This usually is used when communicating with C side or manipulating a C struct, see + /// examples below. + /// + /// # Safety + /// + /// - `ptr` is aligned to `align_of::<T>()`. + /// - `ptr` is valid for reads and writes for `'a`. + /// - For the duration of `'a`, other accesses to `*ptr` must not cause data races (defined + /// by [`LKMM`]) against atomic operations on the returned reference. Note that if all other + /// accesses are atomic, then this safety requirement is trivially fulfilled. + /// + /// [`LKMM`]: srctree/tools/memory-model + /// + /// # Examples + /// + /// Using [`Atomic::from_ptr()`] combined with [`Atomic::load()`] or [`Atomic::store()`] can + /// achieve the same functionality as `READ_ONCE()`/`smp_load_acquire()` or + /// `WRITE_ONCE()`/`smp_store_release()` in C side: + /// + /// ``` + /// # use kernel::types::Opaque; + /// use kernel::sync::atomic::{Atomic, Relaxed, Release}; + /// + /// // Assume there is a C struct `foo`. + /// mod cbindings { + /// #[repr(C)] + /// pub(crate) struct foo { + /// pub(crate) a: i32, + /// pub(crate) b: i32 + /// } + /// } + /// + /// let tmp = Opaque::new(cbindings::foo { a: 1, b: 2 }); + /// + /// // struct foo *foo_ptr = ..; + /// let foo_ptr = tmp.get(); + /// + /// // SAFETY: `foo_ptr` is valid, and `.a` is in bounds. + /// let foo_a_ptr = unsafe { &raw mut (*foo_ptr).a }; + /// + /// // a = READ_ONCE(foo_ptr->a); + /// // + /// // SAFETY: `foo_a_ptr` is valid for read, and all other accesses on it is atomic, so no + /// // data race. + /// let a = unsafe { Atomic::from_ptr(foo_a_ptr) }.load(Relaxed); + /// # assert_eq!(a, 1); + /// + /// // smp_store_release(&foo_ptr->a, 2); + /// // + /// // SAFETY: `foo_a_ptr` is valid for writes, and all other accesses on it is atomic, so + /// // no data race. + /// unsafe { Atomic::from_ptr(foo_a_ptr) }.store(2, Release); + /// ``` + pub unsafe fn from_ptr<'a>(ptr: *mut T) -> &'a Self + where + T: Sync, + { + // CAST: `T` and `Atomic<T>` have the same size, alignment and bit validity. + // SAFETY: Per function safety requirement, `ptr` is a valid pointer and the object will + // live long enough. It's safe to return a `&Atomic<T>` because function safety requirement + // guarantees other accesses won't cause data races. + unsafe { &*ptr.cast::<Self>() } + } + + /// Returns a pointer to the underlying atomic `T`. + /// + /// Note that use of the return pointer must not cause data races defined by [`LKMM`]. + /// + /// # Guarantees + /// + /// The returned pointer is valid and properly aligned (i.e. aligned to [`align_of::<T>()`]). + /// + /// [`LKMM`]: srctree/tools/memory-model + /// [`align_of::<T>()`]: core::mem::align_of + pub const fn as_ptr(&self) -> *mut T { + // GUARANTEE: Per the function guarantee of `AtomicRepr::as_ptr()`, the `self.0.as_ptr()` + // must be a valid and properly aligned pointer for `T::Repr`, and per the safety guarantee + // of `AtomicType`, it's a valid and properly aligned pointer of `T`. + self.0.as_ptr().cast() + } + + /// Returns a mutable reference to the underlying atomic `T`. + /// + /// This is safe because the mutable reference of the atomic `T` guarantees exclusive access. + pub fn get_mut(&mut self) -> &mut T { + // CAST: `T` and `T::Repr` has the same size and alignment per the safety requirement of + // `AtomicType`, and per the type invariants `self.0` is a valid `T`, therefore the casting + // result is a valid pointer of `T`. 
+ // SAFETY: The pointer is valid per the CAST comment above, and the mutable reference + // guarantees exclusive access. + unsafe { &mut *self.0.as_ptr().cast() } + } +} + +impl<T: AtomicType> Atomic<T> +where + T::Repr: AtomicBasicOps, +{ + /// Loads the value from the atomic `T`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42i32); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// let x = Atomic::new(42i64); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_read", "atomic64_read"))] + #[inline(always)] + pub fn load<Ordering: ordering::AcquireOrRelaxed>(&self, _: Ordering) -> T { + let v = { + match Ordering::TYPE { + OrderingType::Relaxed => T::Repr::atomic_read(&self.0), + OrderingType::Acquire => T::Repr::atomic_read_acquire(&self.0), + _ => build_error!("Wrong ordering"), + } + }; + + // SAFETY: `v` comes from reading `self.0`, which is a valid `T` per the type invariants. + unsafe { from_repr(v) } + } + + /// Stores a value to the atomic `T`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42i32); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// x.store(43, Relaxed); + /// + /// assert_eq!(43, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_set", "atomic64_set"))] + #[inline(always)] + pub fn store<Ordering: ordering::ReleaseOrRelaxed>(&self, v: T, _: Ordering) { + let v = into_repr(v); + + // INVARIANT: `v` is a valid `T`, and is stored to `self.0` by `atomic_set*()`. + match Ordering::TYPE { + OrderingType::Relaxed => T::Repr::atomic_set(&self.0, v), + OrderingType::Release => T::Repr::atomic_set_release(&self.0, v), + _ => build_error!("Wrong ordering"), + } + } +} + +impl<T: AtomicType> Atomic<T> +where + T::Repr: AtomicExchangeOps, +{ + /// Atomic exchange. + /// + /// Atomically updates `*self` to `v` and returns the old value of `*self`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Acquire, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.xchg(52, Acquire)); + /// assert_eq!(52, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_xchg", "atomic64_xchg", "swap"))] + #[inline(always)] + pub fn xchg<Ordering: ordering::Ordering>(&self, v: T, _: Ordering) -> T { + let v = into_repr(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_xchg*()` because `v` is transmutable to + // `T`. + let ret = { + match Ordering::TYPE { + OrderingType::Full => T::Repr::atomic_xchg(&self.0, v), + OrderingType::Acquire => T::Repr::atomic_xchg_acquire(&self.0, v), + OrderingType::Release => T::Repr::atomic_xchg_release(&self.0, v), + OrderingType::Relaxed => T::Repr::atomic_xchg_relaxed(&self.0, v), + } + }; + + // SAFETY: `ret` comes from reading `*self`, which is a valid `T` per type invariants. + unsafe { from_repr(ret) } + } + + /// Atomic compare and exchange. + /// + /// If `*self` == `old`, atomically updates `*self` to `new`. Otherwise, `*self` is not + /// modified. + /// + /// Compare: The comparison is done via the byte level comparison between `*self` and `old`. + /// + /// Ordering: When succeeds, provides the corresponding ordering as the `Ordering` type + /// parameter indicates, and a failed one doesn't provide any ordering, the load part of a + /// failed cmpxchg is a [`Relaxed`] load. 
+ /// + /// Returns `Ok(value)` if cmpxchg succeeds, and `value` is guaranteed to be equal to `old`, + /// otherwise returns `Err(value)`, and `value` is the current value of `*self`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Full, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// // Checks whether cmpxchg succeeded. + /// let success = x.cmpxchg(52, 64, Relaxed).is_ok(); + /// # assert!(!success); + /// + /// // Checks whether cmpxchg failed. + /// let failure = x.cmpxchg(52, 64, Relaxed).is_err(); + /// # assert!(failure); + /// + /// // Uses the old value if failed, probably re-try cmpxchg. + /// match x.cmpxchg(52, 64, Relaxed) { + /// Ok(_) => { }, + /// Err(old) => { + /// // do something with `old`. + /// # assert_eq!(old, 42); + /// } + /// } + /// + /// // Uses the latest value regardlessly, same as atomic_cmpxchg() in C. + /// let latest = x.cmpxchg(42, 64, Full).unwrap_or_else(|old| old); + /// # assert_eq!(42, latest); + /// assert_eq!(64, x.load(Relaxed)); + /// ``` + /// + /// [`Relaxed`]: ordering::Relaxed + #[doc(alias( + "atomic_cmpxchg", + "atomic64_cmpxchg", + "atomic_try_cmpxchg", + "atomic64_try_cmpxchg", + "compare_exchange" + ))] + #[inline(always)] + pub fn cmpxchg<Ordering: ordering::Ordering>( + &self, + mut old: T, + new: T, + o: Ordering, + ) -> Result<T, T> { + // Note on code generation: + // + // try_cmpxchg() is used to implement cmpxchg(), and if the helper functions are inlined, + // the compiler is able to figure out that branch is not needed if the users don't care + // about whether the operation succeeds or not. One exception is on x86, due to commit + // 44fe84459faf ("locking/atomic: Fix atomic_try_cmpxchg() semantics"), the + // atomic_try_cmpxchg() on x86 has a branch even if the caller doesn't care about the + // success of cmpxchg and only wants to use the old value. For example, for code like: + // + // let latest = x.cmpxchg(42, 64, Full).unwrap_or_else(|old| old); + // + // It will still generate code: + // + // movl $0x40, %ecx + // movl $0x34, %eax + // lock + // cmpxchgl %ecx, 0x4(%rsp) + // jne 1f + // 2: + // ... + // 1: movl %eax, %ecx + // jmp 2b + // + // This might be "fixed" by introducing a try_cmpxchg_exclusive() that knows the "*old" + // location in the C function is always safe to write. + if self.try_cmpxchg(&mut old, new, o) { + Ok(old) + } else { + Err(old) + } + } + + /// Atomic compare and exchange and returns whether the operation succeeds. + /// + /// If `*self` == `old`, atomically updates `*self` to `new`. Otherwise, `*self` is not + /// modified, `*old` is updated to the current value of `*self`. + /// + /// "Compare" and "Ordering" part are the same as [`Atomic::cmpxchg()`]. + /// + /// Returns `true` means the cmpxchg succeeds otherwise returns `false`. + #[inline(always)] + fn try_cmpxchg<Ordering: ordering::Ordering>(&self, old: &mut T, new: T, _: Ordering) -> bool { + let mut tmp = into_repr(*old); + let new = into_repr(new); + + // INVARIANT: `self.0` is a valid `T` after `atomic_try_cmpxchg*()` because `new` is + // transmutable to `T`. 
+ let ret = { + match Ordering::TYPE { + OrderingType::Full => T::Repr::atomic_try_cmpxchg(&self.0, &mut tmp, new), + OrderingType::Acquire => { + T::Repr::atomic_try_cmpxchg_acquire(&self.0, &mut tmp, new) + } + OrderingType::Release => { + T::Repr::atomic_try_cmpxchg_release(&self.0, &mut tmp, new) + } + OrderingType::Relaxed => { + T::Repr::atomic_try_cmpxchg_relaxed(&self.0, &mut tmp, new) + } + } + }; + + // SAFETY: `tmp` comes from reading `*self`, which is a valid `T` per type invariants. + *old = unsafe { from_repr(tmp) }; + + ret + } +} + +impl<T: AtomicType> Atomic<T> +where + T::Repr: AtomicArithmeticOps, +{ + /// Atomic add. + /// + /// Atomically updates `*self` to `(*self).wrapping_add(v)`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// x.add(12, Relaxed); + /// + /// assert_eq!(54, x.load(Relaxed)); + /// ``` + #[inline(always)] + pub fn add<Rhs>(&self, v: Rhs, _: ordering::Relaxed) + where + T: AtomicAdd<Rhs>, + { + let v = T::rhs_into_delta(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_add()` due to safety requirement of + // `AtomicAdd`. + T::Repr::atomic_add(&self.0, v); + } + + /// Atomic fetch and add. + /// + /// Atomically updates `*self` to `(*self).wrapping_add(v)`, and returns the value of `*self` + /// before the update. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Acquire, Full, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// assert_eq!(54, { x.fetch_add(12, Acquire); x.load(Relaxed) }); + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// assert_eq!(54, { x.fetch_add(12, Full); x.load(Relaxed) } ); + /// ``` + #[inline(always)] + pub fn fetch_add<Rhs, Ordering: ordering::Ordering>(&self, v: Rhs, _: Ordering) -> T + where + T: AtomicAdd<Rhs>, + { + let v = T::rhs_into_delta(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_fetch_add*()` due to safety requirement + // of `AtomicAdd`. + let ret = { + match Ordering::TYPE { + OrderingType::Full => T::Repr::atomic_fetch_add(&self.0, v), + OrderingType::Acquire => T::Repr::atomic_fetch_add_acquire(&self.0, v), + OrderingType::Release => T::Repr::atomic_fetch_add_release(&self.0, v), + OrderingType::Relaxed => T::Repr::atomic_fetch_add_relaxed(&self.0, v), + } + }; + + // SAFETY: `ret` comes from reading `self.0`, which is a valid `T` per type invariants. + unsafe { from_repr(ret) } + } +} diff --git a/rust/kernel/sync/atomic/internal.rs b/rust/kernel/sync/atomic/internal.rs new file mode 100644 index 000000000000..6fdd8e59f45b --- /dev/null +++ b/rust/kernel/sync/atomic/internal.rs @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Atomic internal implementations. +//! +//! Provides 1:1 mapping to the C atomic operations. + +use crate::bindings; +use crate::macros::paste; +use core::cell::UnsafeCell; + +mod private { + /// Sealed trait marker to disable customized impls on atomic implementation traits. + pub trait Sealed {} +} + +// `i32` and `i64` are only supported atomic implementations. +impl private::Sealed for i32 {} +impl private::Sealed for i64 {} + +/// A marker trait for types that implement atomic operations with C side primitives. +/// +/// This trait is sealed, and only types that have directly mapping to the C side atomics should +/// impl this: +/// +/// - `i32` maps to `atomic_t`. 
+/// - `i64` maps to `atomic64_t`. +pub trait AtomicImpl: Sized + Send + Copy + private::Sealed { + /// The type of the delta in arithmetic or logical operations. + /// + /// For example, in `atomic_add(ptr, v)`, it's the type of `v`. Usually it's the same type of + /// [`Self`], but it may be different for the atomic pointer type. + type Delta; +} + +// `atomic_t` implements atomic operations on `i32`. +impl AtomicImpl for i32 { + type Delta = Self; +} + +// `atomic64_t` implements atomic operations on `i64`. +impl AtomicImpl for i64 { + type Delta = Self; +} + +/// Atomic representation. +#[repr(transparent)] +pub struct AtomicRepr<T: AtomicImpl>(UnsafeCell<T>); + +impl<T: AtomicImpl> AtomicRepr<T> { + /// Creates a new atomic representation `T`. + pub const fn new(v: T) -> Self { + Self(UnsafeCell::new(v)) + } + + /// Returns a pointer to the underlying `T`. + /// + /// # Guarantees + /// + /// The returned pointer is valid and properly aligned (i.e. aligned to [`align_of::<T>()`]). + pub const fn as_ptr(&self) -> *mut T { + // GUARANTEE: `self.0` is an `UnsafeCell<T>`, therefore the pointer returned by `.get()` + // must be valid and properly aligned. + self.0.get() + } +} + +// This macro generates the function signature with given argument list and return type. +macro_rules! declare_atomic_method { + ( + $(#[doc=$doc:expr])* + $func:ident($($arg:ident : $arg_type:ty),*) $(-> $ret:ty)? + ) => { + paste!( + $(#[doc = $doc])* + fn [< atomic_ $func >]($($arg: $arg_type,)*) $(-> $ret)?; + ); + }; + ( + $(#[doc=$doc:expr])* + $func:ident [$variant:ident $($rest:ident)*]($($arg_sig:tt)*) $(-> $ret:ty)? + ) => { + paste!( + declare_atomic_method!( + $(#[doc = $doc])* + [< $func _ $variant >]($($arg_sig)*) $(-> $ret)? + ); + ); + + declare_atomic_method!( + $(#[doc = $doc])* + $func [$($rest)*]($($arg_sig)*) $(-> $ret)? + ); + }; + ( + $(#[doc=$doc:expr])* + $func:ident []($($arg_sig:tt)*) $(-> $ret:ty)? + ) => { + declare_atomic_method!( + $(#[doc = $doc])* + $func($($arg_sig)*) $(-> $ret)? + ); + } +} + +// This macro generates the function implementation with given argument list and return type, and it +// will replace "call(...)" expression with "$ctype _ $func" to call the real C function. +macro_rules! impl_atomic_method { + ( + ($ctype:ident) $func:ident($($arg:ident: $arg_type:ty),*) $(-> $ret:ty)? { + $unsafe:tt { call($($c_arg:expr),*) } + } + ) => { + paste!( + #[inline(always)] + fn [< atomic_ $func >]($($arg: $arg_type,)*) $(-> $ret)? { + // TODO: Ideally we want to use the SAFETY comments written at the macro invocation + // (e.g. in `declare_and_impl_atomic_methods!()`, however, since SAFETY comments + // are just comments, and they are not passed to macros as tokens, therefore we + // cannot use them here. One potential improvement is that if we support using + // attributes as an alternative for SAFETY comments, then we can use that for macro + // generating code. + // + // SAFETY: specified on macro invocation. + $unsafe { bindings::[< $ctype _ $func >]($($c_arg,)*) } + } + ); + }; + ( + ($ctype:ident) $func:ident[$variant:ident $($rest:ident)*]($($arg_sig:tt)*) $(-> $ret:ty)? { + $unsafe:tt { call($($arg:tt)*) } + } + ) => { + paste!( + impl_atomic_method!( + ($ctype) [< $func _ $variant >]($($arg_sig)*) $( -> $ret)? { + $unsafe { call($($arg)*) } + } + ); + ); + impl_atomic_method!( + ($ctype) $func [$($rest)*]($($arg_sig)*) $( -> $ret)? { + $unsafe { call($($arg)*) } + } + ); + }; + ( + ($ctype:ident) $func:ident[]($($arg_sig:tt)*) $( -> $ret:ty)? 
{ + $unsafe:tt { call($($arg:tt)*) } + } + ) => { + impl_atomic_method!( + ($ctype) $func($($arg_sig)*) $(-> $ret)? { + $unsafe { call($($arg)*) } + } + ); + } +} + +// Delcares $ops trait with methods and implements the trait for `i32` and `i64`. +macro_rules! declare_and_impl_atomic_methods { + ($(#[$attr:meta])* $pub:vis trait $ops:ident { + $( + $(#[doc=$doc:expr])* + fn $func:ident [$($variant:ident),*]($($arg_sig:tt)*) $( -> $ret:ty)? { + $unsafe:tt { bindings::#call($($arg:tt)*) } + } + )* + }) => { + $(#[$attr])* + $pub trait $ops: AtomicImpl { + $( + declare_atomic_method!( + $(#[doc=$doc])* + $func[$($variant)*]($($arg_sig)*) $(-> $ret)? + ); + )* + } + + impl $ops for i32 { + $( + impl_atomic_method!( + (atomic) $func[$($variant)*]($($arg_sig)*) $(-> $ret)? { + $unsafe { call($($arg)*) } + } + ); + )* + } + + impl $ops for i64 { + $( + impl_atomic_method!( + (atomic64) $func[$($variant)*]($($arg_sig)*) $(-> $ret)? { + $unsafe { call($($arg)*) } + } + ); + )* + } + } +} + +declare_and_impl_atomic_methods!( + /// Basic atomic operations + pub trait AtomicBasicOps { + /// Atomic read (load). + fn read[acquire](a: &AtomicRepr<Self>) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast()) } + } + + /// Atomic set (store). + fn set[release](a: &AtomicRepr<Self>, v: Self) { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), v) } + } + } +); + +declare_and_impl_atomic_methods!( + /// Exchange and compare-and-exchange atomic operations + pub trait AtomicExchangeOps { + /// Atomic exchange. + /// + /// Atomically updates `*a` to `v` and returns the old value. + fn xchg[acquire, release, relaxed](a: &AtomicRepr<Self>, v: Self) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), v) } + } + + /// Atomic compare and exchange. + /// + /// If `*a` == `*old`, atomically updates `*a` to `new`. Otherwise, `*a` is not + /// modified, `*old` is updated to the current value of `*a`. + /// + /// Return `true` if the update of `*a` occurred, `false` otherwise. + fn try_cmpxchg[acquire, release, relaxed]( + a: &AtomicRepr<Self>, old: &mut Self, new: Self + ) -> bool { + // SAFETY: `a.as_ptr()` is valid and properly aligned. `core::ptr::from_mut(old)` + // is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), core::ptr::from_mut(old), new) } + } + } +); + +declare_and_impl_atomic_methods!( + /// Atomic arithmetic operations + pub trait AtomicArithmeticOps { + /// Atomic add (wrapping). + /// + /// Atomically updates `*a` to `(*a).wrapping_add(v)`. + fn add[](a: &AtomicRepr<Self>, v: Self::Delta) { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(v, a.as_ptr().cast()) } + } + + /// Atomic fetch and add (wrapping). + /// + /// Atomically updates `*a` to `(*a).wrapping_add(v)`, and returns the value of `*a` + /// before the update. + fn fetch_add[acquire, release, relaxed](a: &AtomicRepr<Self>, v: Self::Delta) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(v, a.as_ptr().cast()) } + } + } +); diff --git a/rust/kernel/sync/atomic/ordering.rs b/rust/kernel/sync/atomic/ordering.rs new file mode 100644 index 000000000000..3f103aa8db99 --- /dev/null +++ b/rust/kernel/sync/atomic/ordering.rs @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Memory orderings. +//! +//! 
The semantics of these orderings follow the [`LKMM`] definitions and rules.
+//!
+//! - [`Acquire`] provides ordering between the load part of the annotated operation and all the
+//!   following memory accesses, and if there is a store part, the store part has the [`Relaxed`]
+//!   ordering.
+//! - [`Release`] provides ordering between all the preceding memory accesses and the store part of
+//!   the annotated operation, and if there is a load part, the load part has the [`Relaxed`]
+//!   ordering.
+//! - [`Full`] means "fully-ordered", that is:
+//!   - It provides ordering between all the preceding memory accesses and the annotated operation.
+//!   - It provides ordering between the annotated operation and all the following memory accesses.
+//!   - It provides ordering between all the preceding memory accesses and all the following memory
+//!     accesses.
+//!   - All the orderings are the same strength as a full memory barrier (i.e. `smp_mb()`).
+//! - [`Relaxed`] provides no ordering except the dependency orderings. Dependency orderings are
+//!   described in "DEPENDENCY RELATIONS" in [`LKMM`]'s [`explanation`].
+//!
+//! [`LKMM`]: srctree/tools/memory-model/
+//! [`explanation`]: srctree/tools/memory-model/Documentation/explanation.txt
+
+/// The annotation type for relaxed memory ordering. For a description of relaxed memory
+/// ordering, see the [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Relaxed;
+
+/// The annotation type for acquire memory ordering. For a description of acquire memory
+/// ordering, see the [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Acquire;
+
+/// The annotation type for release memory ordering. For a description of release memory
+/// ordering, see the [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Release;
+
+/// The annotation type for fully-ordered memory ordering. For a description of fully-ordered
+/// memory ordering, see the [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Full;
+
+/// Describes the exact memory ordering.
+#[doc(hidden)]
+pub enum OrderingType {
+    /// Relaxed ordering.
+    Relaxed,
+    /// Acquire ordering.
+    Acquire,
+    /// Release ordering.
+    Release,
+    /// Fully-ordered.
+    Full,
+}
+
+mod internal {
+    /// Sealed trait, can only be implemented inside the atomic module.
+    pub trait Sealed {}
+
+    impl Sealed for super::Relaxed {}
+    impl Sealed for super::Acquire {}
+    impl Sealed for super::Release {}
+    impl Sealed for super::Full {}
+}
+
+/// The trait bound for annotating operations that support any ordering.
+pub trait Ordering: internal::Sealed {
+    /// Describes the exact memory ordering.
+    const TYPE: OrderingType;
+}
+
+impl Ordering for Relaxed {
+    const TYPE: OrderingType = OrderingType::Relaxed;
+}
+
+impl Ordering for Acquire {
+    const TYPE: OrderingType = OrderingType::Acquire;
+}
+
+impl Ordering for Release {
+    const TYPE: OrderingType = OrderingType::Release;
+}
+
+impl Ordering for Full {
+    const TYPE: OrderingType = OrderingType::Full;
+}
+
+/// The trait bound for operations that only support acquire or relaxed ordering.
+pub trait AcquireOrRelaxed: Ordering {}
+
+impl AcquireOrRelaxed for Acquire {}
+impl AcquireOrRelaxed for Relaxed {}
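As an editor's illustration (not part of the patch) of how these marker types are used at a call site, the sketch below exercises the `Atomic<T>` API introduced earlier in this series; the exact re-export paths and the `AcquireOrRelaxed` bound on `load` are assumptions taken from the surrounding code:

    use kernel::sync::atomic::{Acquire, Atomic, Full, Relaxed};

    fn ordering_example() {
        let counter = Atomic::new(0i32);

        // A statistics counter needs no ordering guarantees at all.
        counter.fetch_add(1, Relaxed);

        // An acquire load pairs with a release store/RMW elsewhere; passing
        // `Release` here would be rejected at compile time because `load` is
        // bounded by `AcquireOrRelaxed`.
        let seen = counter.load(Acquire);

        // A fully-ordered RMW is as strong as an `smp_mb()` on either side.
        let old = counter.xchg(seen + 1, Full);
        let _ = old;
    }
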
+
+/// The trait bound for operations that only support release or relaxed ordering.
+pub trait ReleaseOrRelaxed: Ordering {}
+
+impl ReleaseOrRelaxed for Release {}
+impl ReleaseOrRelaxed for Relaxed {}
diff --git a/rust/kernel/sync/atomic/predefine.rs b/rust/kernel/sync/atomic/predefine.rs
new file mode 100644
index 000000000000..45a17985cda4
--- /dev/null
+++ b/rust/kernel/sync/atomic/predefine.rs
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Pre-defined atomic types
+
+use crate::static_assert;
+use core::mem::{align_of, size_of};
+
+// SAFETY: `i32` has the same size and alignment as itself, and is round-trip transmutable to
+// itself.
+unsafe impl super::AtomicType for i32 {
+    type Repr = i32;
+}
+
+// SAFETY: The wrapping add result of two `i32`s is a valid `i32`.
+unsafe impl super::AtomicAdd<i32> for i32 {
+    fn rhs_into_delta(rhs: i32) -> i32 {
+        rhs
+    }
+}
+
+// SAFETY: `i64` has the same size and alignment as itself, and is round-trip transmutable to
+// itself.
+unsafe impl super::AtomicType for i64 {
+    type Repr = i64;
+}
+
+// SAFETY: The wrapping add result of two `i64`s is a valid `i64`.
+unsafe impl super::AtomicAdd<i64> for i64 {
+    fn rhs_into_delta(rhs: i64) -> i64 {
+        rhs
+    }
+}
+
+// Defines an internal type that always maps to the integer type which has the same size and
+// alignment as `isize` and `usize`. `isize` and `usize` are always bi-directionally transmutable
+// to `isize_atomic_repr`, which also always implements `AtomicImpl`.
+#[allow(non_camel_case_types)]
+#[cfg(not(CONFIG_64BIT))]
+type isize_atomic_repr = i32;
+#[allow(non_camel_case_types)]
+#[cfg(CONFIG_64BIT)]
+type isize_atomic_repr = i64;
+
+// Ensure size and alignment requirements are checked.
+static_assert!(size_of::<isize>() == size_of::<isize_atomic_repr>());
+static_assert!(align_of::<isize>() == align_of::<isize_atomic_repr>());
+static_assert!(size_of::<usize>() == size_of::<isize_atomic_repr>());
+static_assert!(align_of::<usize>() == align_of::<isize_atomic_repr>());
+
+// SAFETY: `isize` has the same size and alignment as `isize_atomic_repr`, and is round-trip
+// transmutable to `isize_atomic_repr`.
+unsafe impl super::AtomicType for isize {
+    type Repr = isize_atomic_repr;
+}
+
+// SAFETY: The wrapping add result of two `isize_atomic_repr`s is a valid `isize`.
+unsafe impl super::AtomicAdd<isize> for isize {
+    fn rhs_into_delta(rhs: isize) -> isize_atomic_repr {
+        rhs as isize_atomic_repr
+    }
+}
+
+// SAFETY: `u32` and `i32` have the same size and alignment, and `u32` is round-trip transmutable
+// to `i32`.
+unsafe impl super::AtomicType for u32 {
+    type Repr = i32;
+}
+
+// SAFETY: The wrapping add result of two `i32`s is a valid `u32`.
+unsafe impl super::AtomicAdd<u32> for u32 {
+    fn rhs_into_delta(rhs: u32) -> i32 {
+        rhs as i32
+    }
+}
+
+// SAFETY: `u64` and `i64` have the same size and alignment, and `u64` is round-trip transmutable
+// to `i64`.
+unsafe impl super::AtomicType for u64 {
+    type Repr = i64;
+}
+
+// SAFETY: The wrapping add result of two `i64`s is a valid `u64`.
+unsafe impl super::AtomicAdd<u64> for u64 {
+    fn rhs_into_delta(rhs: u64) -> i64 {
+        rhs as i64
+    }
+}
+
+// SAFETY: `usize` has the same size and alignment as `isize_atomic_repr`, and is round-trip
+// transmutable to `isize_atomic_repr`.
+unsafe impl super::AtomicType for usize {
+    type Repr = isize_atomic_repr;
+}
+
+// SAFETY: The wrapping add result of two `isize_atomic_repr`s is a valid `usize`.
+unsafe impl super::AtomicAdd<usize> for usize { + fn rhs_into_delta(rhs: usize) -> isize_atomic_repr { + rhs as isize_atomic_repr + } +} + +use crate::macros::kunit_tests; + +#[kunit_tests(rust_atomics)] +mod tests { + use super::super::*; + + // Call $fn($val) with each $type of $val. + macro_rules! for_each_type { + ($val:literal in [$($type:ty),*] $fn:expr) => { + $({ + let v: $type = $val; + + $fn(v); + })* + } + } + + #[test] + fn atomic_basic_tests() { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { + let x = Atomic::new(v); + + assert_eq!(v, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_xchg_tests() { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { + let x = Atomic::new(v); + + let old = v; + let new = v + 1; + + assert_eq!(old, x.xchg(new, Full)); + assert_eq!(new, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_cmpxchg_tests() { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { + let x = Atomic::new(v); + + let old = v; + let new = v + 1; + + assert_eq!(Err(old), x.cmpxchg(new, new, Full)); + assert_eq!(old, x.load(Relaxed)); + assert_eq!(Ok(old), x.cmpxchg(old, new, Relaxed)); + assert_eq!(new, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_arithmetic_tests() { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { + let x = Atomic::new(v); + + assert_eq!(v, x.fetch_add(12, Full)); + assert_eq!(v + 12, x.load(Relaxed)); + + x.add(13, Relaxed); + + assert_eq!(v + 25, x.load(Relaxed)); + }); + } +} diff --git a/rust/kernel/sync/barrier.rs b/rust/kernel/sync/barrier.rs new file mode 100644 index 000000000000..8f2d435fcd94 --- /dev/null +++ b/rust/kernel/sync/barrier.rs @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Memory barriers. +//! +//! These primitives have the same semantics as their C counterparts: and the precise definitions +//! of semantics can be found at [`LKMM`]. +//! +//! [`LKMM`]: srctree/tools/memory-model/ + +/// A compiler barrier. +/// +/// A barrier that prevents compiler from reordering memory accesses across the barrier. +#[inline(always)] +pub(crate) fn barrier() { + // By default, Rust inline asms are treated as being able to access any memory or flags, hence + // it suffices as a compiler barrier. + // + // SAFETY: An empty asm block. + unsafe { core::arch::asm!("") }; +} + +/// A full memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory accesses across the barrier. +#[inline(always)] +pub fn smp_mb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_mb()` is safe to call. + unsafe { bindings::smp_mb() }; + } else { + barrier(); + } +} + +/// A write-write memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory write accesses across the +/// barrier. +#[inline(always)] +pub fn smp_wmb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_wmb()` is safe to call. + unsafe { bindings::smp_wmb() }; + } else { + barrier(); + } +} + +/// A read-read memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory read accesses across the +/// barrier. +#[inline(always)] +pub fn smp_rmb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_rmb()` is safe to call. + unsafe { bindings::smp_rmb() }; + } else { + barrier(); + } +} diff --git a/rust/kernel/sync/refcount.rs b/rust/kernel/sync/refcount.rs new file mode 100644 index 000000000000..19236a5bccde --- /dev/null +++ b/rust/kernel/sync/refcount.rs @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Atomic reference counting. +//! +//! 
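As an editor's aside (not part of the patch): the `smp_wmb()`/`smp_rmb()` pair added in barrier.rs above is typically used for the classic publish/consume pattern sketched below. The statics, the import paths, and the `Atomic<i32>` `store`/`load` calls are assumptions based on the atomic API added by this series:

    use kernel::sync::atomic::{Atomic, Relaxed};
    use kernel::sync::barrier::{smp_rmb, smp_wmb};

    static DATA: Atomic<i32> = Atomic::new(0);
    static READY: Atomic<i32> = Atomic::new(0);

    fn publish() {
        DATA.store(42, Relaxed);
        // Order the data store before the flag store.
        smp_wmb();
        READY.store(1, Relaxed);
    }

    fn consume() -> Option<i32> {
        if READY.load(Relaxed) == 1 {
            // Order the flag load before the data load, pairing with `smp_wmb()`.
            smp_rmb();
            Some(DATA.load(Relaxed))
        } else {
            None
        }
    }
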
C header: [`include/linux/refcount.h`](srctree/include/linux/refcount.h) + +use crate::build_assert; +use crate::sync::atomic::Atomic; +use crate::types::Opaque; + +/// Atomic reference counter. +/// +/// This type is conceptually an atomic integer, but provides saturation semantics compared to +/// normal atomic integers. Values in the negative range when viewed as a signed integer are +/// saturation (bad) values. For details about the saturation semantics, please refer to top of +/// [`include/linux/refcount.h`](srctree/include/linux/refcount.h). +/// +/// Wraps the kernel's C `refcount_t`. +#[repr(transparent)] +pub struct Refcount(Opaque<bindings::refcount_t>); + +impl Refcount { + /// Construct a new [`Refcount`] from an initial value. + /// + /// The initial value should be non-saturated. + #[inline] + pub fn new(value: i32) -> Self { + build_assert!(value >= 0, "initial value saturated"); + // SAFETY: There are no safety requirements for this FFI call. + Self(Opaque::new(unsafe { bindings::REFCOUNT_INIT(value) })) + } + + #[inline] + fn as_ptr(&self) -> *mut bindings::refcount_t { + self.0.get() + } + + /// Get the underlying atomic counter that backs the refcount. + /// + /// NOTE: Usage of this function is discouraged as it can circumvent the protections offered by + /// `refcount.h`. If there is no way to achieve the result using APIs in `refcount.h`, then + /// this function can be used. Otherwise consider adding a binding for the required API. + #[inline] + pub fn as_atomic(&self) -> &Atomic<i32> { + let ptr = self.0.get().cast(); + // SAFETY: `refcount_t` is a transparent wrapper of `atomic_t`, which is an atomic 32-bit + // integer that is layout-wise compatible with `Atomic<i32>`. All values are valid for + // `refcount_t`, despite some of the values being considered saturated and "bad". + unsafe { &*ptr } + } + + /// Set a refcount's value. + #[inline] + pub fn set(&self, value: i32) { + // SAFETY: `self.as_ptr()` is valid. + unsafe { bindings::refcount_set(self.as_ptr(), value) } + } + + /// Increment a refcount. + /// + /// It will saturate if overflows and `WARN`. It will also `WARN` if the refcount is 0, as this + /// represents a possible use-after-free condition. + /// + /// Provides no memory ordering, it is assumed that caller already has a reference on the + /// object. + #[inline] + pub fn inc(&self) { + // SAFETY: self is valid. + unsafe { bindings::refcount_inc(self.as_ptr()) } + } + + /// Decrement a refcount. + /// + /// It will `WARN` on underflow and fail to decrement when saturated. + /// + /// Provides release memory ordering, such that prior loads and stores are done + /// before. + #[inline] + pub fn dec(&self) { + // SAFETY: `self.as_ptr()` is valid. + unsafe { bindings::refcount_dec(self.as_ptr()) } + } + + /// Decrement a refcount and test if it is 0. + /// + /// It will `WARN` on underflow and fail to decrement when saturated. + /// + /// Provides release memory ordering, such that prior loads and stores are done + /// before, and provides an acquire ordering on success such that memory deallocation + /// must come after. + /// + /// Returns true if the resulting refcount is 0, false otherwise. + /// + /// # Notes + /// + /// A common pattern of using `Refcount` is to free memory when the reference count reaches + /// zero. This means that the reference to `Refcount` could become invalid after calling this + /// function. This is fine as long as the reference to `Refcount` is no longer used when this + /// function returns `false`. 
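A minimal sketch of the free-on-zero pattern this note describes (editor's illustration, not part of the patch; `Inner`, `get` and `put` are hypothetical, the import path is assumed, and only `Refcount`'s methods come from this file):

    use kernel::sync::refcount::Refcount;

    struct Inner {
        refcount: Refcount,
        data: u64,
    }

    fn get(inner: &Inner) {
        // The caller already holds a reference, so no ordering is required.
        inner.refcount.inc();
    }

    fn put(inner: &Inner) {
        if inner.refcount.dec_and_test() {
            // Last reference dropped: the owner of the allocation may free it
            // now, and `inner` must not be used again after this point.
        }
    }
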
It is not necessary to use raw pointers in this scenario, see + /// <https://github.com/rust-lang/rust/issues/55005>. + #[inline] + #[must_use = "use `dec` instead if you do not need to test if it is 0"] + pub fn dec_and_test(&self) -> bool { + // SAFETY: `self.as_ptr()` is valid. + unsafe { bindings::refcount_dec_and_test(self.as_ptr()) } + } +} + +// SAFETY: `refcount_t` is thread-safe. +unsafe impl Send for Refcount {} + +// SAFETY: `refcount_t` is thread-safe. +unsafe impl Sync for Refcount {} diff --git a/scripts/atomic/gen-atomics.sh b/scripts/atomic/gen-atomics.sh index 5b98a8307693..02508d0d6fe4 100755 --- a/scripts/atomic/gen-atomics.sh +++ b/scripts/atomic/gen-atomics.sh @@ -11,6 +11,7 @@ cat <<EOF | gen-atomic-instrumented.sh linux/atomic/atomic-instrumented.h gen-atomic-long.sh linux/atomic/atomic-long.h gen-atomic-fallback.sh linux/atomic/atomic-arch-fallback.h +gen-rust-atomic-helpers.sh ../rust/helpers/atomic.c EOF while read script header args; do /bin/sh ${ATOMICDIR}/${script} ${ATOMICTBL} ${args} > ${LINUXDIR}/include/${header} diff --git a/scripts/atomic/gen-rust-atomic-helpers.sh b/scripts/atomic/gen-rust-atomic-helpers.sh new file mode 100755 index 000000000000..45b1e100ed7c --- /dev/null +++ b/scripts/atomic/gen-rust-atomic-helpers.sh @@ -0,0 +1,67 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +ATOMICDIR=$(dirname $0) + +. ${ATOMICDIR}/atomic-tbl.sh + +#gen_proto_order_variant(meta, pfx, name, sfx, order, atomic, int, arg...) +gen_proto_order_variant() +{ + local meta="$1"; shift + local pfx="$1"; shift + local name="$1"; shift + local sfx="$1"; shift + local order="$1"; shift + local atomic="$1"; shift + local int="$1"; shift + + local atomicname="${atomic}_${pfx}${name}${sfx}${order}" + + local ret="$(gen_ret_type "${meta}" "${int}")" + local params="$(gen_params "${int}" "${atomic}" "$@")" + local args="$(gen_args "$@")" + local retstmt="$(gen_ret_stmt "${meta}")" + +cat <<EOF +__rust_helper ${ret} +rust_helper_${atomicname}(${params}) +{ + ${retstmt}${atomicname}(${args}); +} + +EOF +} + +cat << EOF +// SPDX-License-Identifier: GPL-2.0 + +// Generated by $0 +// DO NOT MODIFY THIS FILE DIRECTLY + +/* + * This file provides helpers for the various atomic functions for Rust. + */ +#ifndef _RUST_ATOMIC_API_H +#define _RUST_ATOMIC_API_H + +#include <linux/atomic.h> + +// TODO: Remove this after INLINE_HELPERS support is added. 
+#ifndef __rust_helper +#define __rust_helper +#endif + +EOF + +grep '^[a-z]' "$1" | while read name meta args; do + gen_proto "${meta}" "${name}" "atomic" "int" ${args} +done + +grep '^[a-z]' "$1" | while read name meta args; do + gen_proto "${meta}" "${name}" "atomic64" "s64" ${args} +done + +cat <<EOF +#endif /* _RUST_ATOMIC_API_H */ +EOF diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py index 98445671fe83..ccc24d30de80 100644 --- a/scripts/gdb/linux/timerlist.py +++ b/scripts/gdb/linux/timerlist.py @@ -56,8 +56,6 @@ def print_base(base): text += " .index: {}\n".format(base['index']) text += " .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution) - - text += " .get_time: {}\n".format(base['get_time']) if constants.LX_CONFIG_HIGH_RES_TIMERS: text += " .offset: {} nsecs\n".format(base['offset']) text += "active timers:\n" diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c index c364bd126ac8..2d5f4d47071f 100644 --- a/sound/core/hrtimer.c +++ b/sound/core/hrtimer.c @@ -44,7 +44,7 @@ static enum hrtimer_restart snd_hrtimer_callback(struct hrtimer *hrt) } /* calculate the drift */ - delta = ktime_sub(hrt->base->get_time(), hrtimer_get_expires(hrt)); + delta = ktime_sub(hrtimer_cb_get_time(hrt), hrtimer_get_expires(hrt)); if (delta > 0) ticks += ktime_divns(delta, ticks * resolution); diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index 183aa662b165..099e926595bd 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 /* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr & INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 0e5abd896ad4..c683d609934b 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field 
xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if (!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..f627196eb796 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK 
(PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) diff --git a/tools/arch/x86/lib/inat.c b/tools/arch/x86/lib/inat.c index dfbcc6405941..ffcb0e27453b 100644 --- a/tools/arch/x86/lib/inat.c +++ b/tools/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index bce69c6bfa69..1d1c57c74d1f 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ found: } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is @@ -226,13 +229,13 @@ found: if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb95..2a4e69ecc2de 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. 
+# (xop): this opcode accepts XOP prefix. +# +# XOP Superscripts +# (W=0): this opcode requires XOP.W == 0 +# (W=1): this opcode requires XOP.W == 1 # # Last Prefix Superscripts # - (66): the last prefix is 0x66 @@ -194,7 +199,7 @@ AVXcode: 8c: MOV Ev,Sw 8d: LEA Gv,M 8e: MOV Sw,Ew -8f: Grp1A (1A) | POP Ev (d64) +8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix) # 0x90 - 0x9f 90: NOP | PAUSE (F3) | XCHG r8,rAX 91: XCHG rCX/r9,rAX @@ -1106,6 +1111,84 @@ AVXcode: 7 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) EndTable +# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5 +Table: XOP map 8h +Referrer: +XOPcode: 0 +85: VPMACSSWW Vo,Ho,Wo,Lo +86: VPMACSSWD Vo,Ho,Wo,Lo +87: VPMACSSDQL Vo,Ho,Wo,Lo +8e: VPMACSSDD Vo,Ho,Wo,Lo +8f: VPMACSSDQH Vo,Ho,Wo,Lo +95: VPMACSWW Vo,Ho,Wo,Lo +96: VPMACSWD Vo,Ho,Wo,Lo +97: VPMACSDQL Vo,Ho,Wo,Lo +9e: VPMACSDD Vo,Ho,Wo,Lo +9f: VPMACSDQH Vo,Ho,Wo,Lo +a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1) +a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1) +a6: VPMADCSSWD Vo,Ho,Wo,Lo +b6: VPMADCSWD Vo,Ho,Wo,Lo +c0: VPROTB Vo,Wo,Ib +c1: VPROTW Vo,Wo,Ib +c2: VPROTD Vo,Wo,Ib +c3: VPROTQ Vo,Wo,Ib +cc: VPCOMccB Vo,Ho,Wo,Ib +cd: VPCOMccW Vo,Ho,Wo,Ib +ce: VPCOMccD Vo,Ho,Wo,Ib +cf: VPCOMccQ Vo,Ho,Wo,Ib +ec: VPCOMccUB Vo,Ho,Wo,Ib +ed: VPCOMccUW Vo,Ho,Wo,Ib +ee: VPCOMccUD Vo,Ho,Wo,Ib +ef: VPCOMccUQ Vo,Ho,Wo,Ib +EndTable + +Table: XOP map 9h +Referrer: +XOPcode: 1 +01: GrpXOP1 +02: GrpXOP2 +12: GrpXOP3 +80: VFRCZPS Vx,Wx +81: VFRCZPD Vx,Wx +82: VFRCZSS Vq,Wss +83: VFRCZSD Vq,Wsd +90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +91: VPROTW Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +92: VPROTD Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +93: VPROTQ Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1) +95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1) +96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1) +97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1) +98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1) +99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1) +9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1) +9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1) +c1: VPHADDBW Vo,Wo +c2: VPHADDBD Vo,Wo +c3: VPHADDBQ Vo,Wo +c6: VPHADDWD Vo,Wo +c7: VPHADDWQ Vo,Wo +cb: VPHADDDQ Vo,Wo +d1: VPHADDUBWD Vo,Wo +d2: VPHADDUBD Vo,Wo +d3: VPHADDUBQ Vo,Wo +d6: VPHADDUWD Vo,Wo +d7: VPHADDUWQ Vo,Wo +db: VPHADDUDQ Vo,Wo +e1: VPHSUBBW Vo,Wo +e2: VPHSUBWD Vo,Wo +e3: VPHSUBDQ Vo,Wo +EndTable + +Table: XOP map Ah +Referrer: +XOPcode: 2 +10: BEXTR Gy,Ey,Id +12: GrpXOP4 +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1320,3 +1403,29 @@ GrpTable: GrpRNG 4: xcrypt-cfb 5: xcrypt-ofb EndTable + +# GrpXOP1-4 is shown in AMD APM Vol.3 Appendix A as XOP group #1-4 +GrpTable: GrpXOP1 +1: BLCFILL By,Ey (xop) +2: BLSFILL By,Ey (xop) +3: BLCS By,Ey (xop) +4: TZMSK By,Ey (xop) +5: BLCIC By,Ey (xop) +6: BLSIC By,Ey (xop) +7: T1MSKC By,Ey (xop) +EndTable + +GrpTable: GrpXOP2 +1: BLCMSK By,Ey (xop) +6: BLCI By,Ey (xop) +EndTable + +GrpTable: GrpXOP3 +0: LLWPCB Ry (xop) +1: SLWPCB Ry (xop) +EndTable + +GrpTable: GrpXOP4 +0: LWPINS By,Ed,Id (xop) +1: LWPVAL By,Ed,Id (xop) +EndTable diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index 2c19d7fc8a85..7ea1b75e59b7 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -21,6 +21,7 @@ function clear_vars() { eid = -1 # escape id gid = -1 # group id aid = -1 # AVX id + xopid = -1 # XOP id tname = "" } @@ -39,9 +40,11 @@ 
BEGIN { ggid = 1 geid = 1 gaid = 0 + gxopid = 0 delete etable delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 98c4713c1b09..0ad5cc70ecbe 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -880,3 +880,15 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_X86_64_32: + case 
R_X86_64_32S: + case R_X86_64_64: + return true; + default: + return false; + } +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 80239843e9f0..0f6b197cfcb0 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -87,6 +87,7 @@ static const struct option check_options[] = { OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"), OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), @@ -162,6 +163,7 @@ static bool opts_valid(void) opts.hack_noinstr || opts.ibt || opts.mcount || + opts.noabs || opts.noinstr || opts.orc || opts.retpoline || diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d14f20ef1db1..093fcd01dd6e 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3564,7 +3564,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6)) + !strncmp(func->name, "__pfx_", 6) || + !strncmp(func->name, "__pi___cfi_", 11) || + !strncmp(func->name, "__pi___pfx_", 11)) return 0; if (file->ignore_unreachables) @@ -4644,6 +4646,47 @@ static void disas_warned_funcs(struct objtool_file *file) disas_funcs(funcs); } +__weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + unsigned int type = reloc_type(reloc); + size_t sz = elf_addr_size(elf); + + return (sz == 8) ? (type == R_ABS64) : (type == R_ABS32); +} + +static int check_abs_references(struct objtool_file *file) +{ + struct section *sec; + struct reloc *reloc; + int ret = 0; + + for_each_sec(file, sec) { + /* absolute references in non-loadable sections are fine */ + if (!(sec->sh.sh_flags & SHF_ALLOC)) + continue; + + /* section must have an associated .rela section */ + if (!sec->rsec) + continue; + + /* + * Special case for compiler generated metadata that is not + * consumed until after boot. 
+ */ + if (!strcmp(sec->name, "__patchable_function_entries")) + continue; + + for_each_reloc(sec->rsec, reloc) { + if (arch_absolute_reloc(file->elf, reloc)) { + WARN("section %s has absolute relocation at offset 0x%lx", + sec->name, reloc_offset(reloc)); + ret++; + } + } + } + return ret; +} + struct insn_chunk { void *addr; struct insn_chunk *next; @@ -4777,6 +4820,9 @@ int check(struct objtool_file *file) goto out; } + if (opts.noabs) + warnings += check_abs_references(file); + if (opts.orc && nr_insns) { ret = orc_create(file); if (ret) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 01ef6f415adf..be33c7b43180 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -97,6 +97,7 @@ bool arch_is_embedded_insn(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); bool arch_pc_relative_reloc(struct reloc *reloc); +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 6b08666fa69d..ab22673862e1 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -26,6 +26,7 @@ struct opts { bool uaccess; int prefix; bool cfi; + bool noabs; /* options: */ bool backtrace; diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 6a922d046b8e..802895fae3ca 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -45,7 +45,6 @@ NORETURN(rewind_stack_and_make_dead) NORETURN(rust_begin_unwind) NORETURN(rust_helper_BUG) NORETURN(sev_es_terminate) -NORETURN(snp_abort) NORETURN(start_kernel) NORETURN(stop_this_cpu) NORETURN(usercopy_abort) diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 8fabddc1c0da..72c7a4e15d61 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -32,7 +32,7 @@ static void intel_pt_insn_decoder(struct insn *insn, intel_pt_insn->rel = 0; intel_pt_insn->emulated_ptwrite = false; - if (insn_is_avx(insn)) { + if (insn_is_avx_or_xop(insn)) { intel_pt_insn->op = INTEL_PT_OP_OTHER; intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH; intel_pt_insn->length = insn->length; diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 72a280e7a9d5..47eb2d4d13a5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1195,7 +1195,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_EMERALDRAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_D, &spr_features }, - { INTEL_PANTHERCOVE_X, &dmr_features }, + { INTEL_DIAMONDRAPIDS_X, &dmr_features }, { INTEL_LAKEFIELD, &cnl_features }, { INTEL_ALDERLAKE, &adl_features }, { INTEL_ALDERLAKE_L, &adl_features }, diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index b17dc39a23db..6d75ede16e7c 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -8,22 +8,31 @@ #include <asm/ptrace.h> #include <linux/compiler.h> #include <linux/stringify.h> +#include <linux/kernel.h> #include <sys/wait.h> 
#include <sys/syscall.h> #include <sys/prctl.h> #include <asm/prctl.h> #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "bpf/libbpf_internal.h" -__naked unsigned long uretprobe_regs_trigger(void) +#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 +#include "usdt.h" + +#pragma GCC diagnostic ignored "-Wattributes" + +__attribute__((aligned(16))) +__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void) { asm volatile ( + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */ "movq $0xdeadbeef, %rax\n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -44,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) "movq $0, 120(%rdi)\n" /* orig_rax */ "movq $0, 128(%rdi)\n" /* rip */ "movq $0, 136(%rdi)\n" /* cs */ + "pushq %rax\n" "pushf\n" "pop %rax\n" "movq %rax, 144(%rdi)\n" /* eflags */ + "pop %rax\n" "movq %rsp, 152(%rdi)\n" /* rsp */ "movq $0, 160(%rdi)\n" /* ss */ /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -92,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned long offset; unsigned int i, cnt; - int err; + + offset = get_uprobe_offset(&uprobe_regs_trigger); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -119,7 +142,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. */ switch (offset) { @@ -133,7 +156,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call (with rax exception, check below). 
*/ switch (offset) { /* @@ -146,6 +169,15 @@ static void test_uretprobe_regs_equal(void) case offsetof(struct pt_regs, rsp): case offsetof(struct pt_regs, ss): break; + /* + * uprobe does not see return value in rax, it needs to see the + * original (before) rax value + */ + case offsetof(struct pt_regs, rax): + if (!retprobe) { + ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check"); + break; + } default: if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) fprintf(stdout, "failed register offset %u\n", offset); @@ -175,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? (int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -183,13 +215,16 @@ static void test_uretprobe_regs_change(void) unsigned long cnt = sizeof(before)/sizeof(*pb); unsigned int i, err, offset; - offset = get_uprobe_offset(uretprobe_regs_trigger); + offset = get_uprobe_offset(uprobe_regs_trigger); err = write_bpf_testmod_uprobe(offset); if (!ASSERT_OK(err, "register_uprobe")) return; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); if (!ASSERT_OK(err, "unregister_uprobe")) @@ -252,6 +287,7 @@ static void test_uretprobe_syscall_call(void) ); struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c = 0; + struct bpf_link *link; if (!ASSERT_OK(pipe(go), "pipe")) return; @@ -277,11 +313,14 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->bss->pid = pid; + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) goto cleanup; + skel->links.test_uretprobe_multi = link; /* kick the child */ write(go[1], &c, 1); @@ -301,6 +340,256 @@ cleanup: close(go[0]); } +#define TRAMP "[uprobes-trampoline]" + +__attribute__((aligned(16))) +__nocf_check __weak __naked void uprobe_test(void) +{ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} + +__attribute__((aligned(16))) +__nocf_check __weak void usdt_test(void) +{ + USDT(optimized_uprobe, usdt); +} + +static int find_uprobes_trampoline(void *tramp_addr) +{ + void *start, *end; + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. 
*/ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; +} + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +typedef void (__attribute__((nocf_check)) *trigger_t)(void); + +static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, + void *addr, int executed) +{ + struct __arch_relative_insn { + __u8 op; + __s32 raddr; + } __packed *call; + void *tramp = NULL; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, executed, "executed"); + + /* .. and check the trampoline is as expected. */ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + + return tramp; +} + +static void check_detach(void *addr, void *tramp) +{ + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + trigger_t trigger, void *addr, int executed) +{ + void *tramp; + + tramp = check_attach(skel, trigger, addr, executed); + bpf_link__destroy(link); + check_detach(addr, tramp); +} + +static void test_uprobe_legacy(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + /* uprobe.multi */ + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + 
+ /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_session(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .session = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 4); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + /* * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c. * @@ -343,43 +632,172 @@ static void test_uretprobe_shadow_stack(void) return; } - /* Run all of the uretprobe tests. */ - test_uretprobe_regs_equal(); - test_uretprobe_regs_change(); + /* Run all the tests with shadow stack in place. 
*/ + + test_uprobe_regs_equal(false); + test_uprobe_regs_equal(true); test_uretprobe_syscall_call(); + test_uprobe_legacy(); + test_uprobe_multi(); + test_uprobe_session(); + test_uprobe_usdt(); + + test_regs_change(); + ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) + +static volatile bool race_stop; + +static USDT_DEFINE_SEMA(race); + +static void *worker_trigger(void *arg) { - test__skip(); + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; } -static void test_uretprobe_regs_change(void) +static void *worker_attach(void *arg) { - test__skip(); + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + const char *sema[2] = { + __stringify(USDT_SEMA(race)), + NULL, + }; + unsigned long *ref; + int err; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return NULL; + + err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + return NULL; + + opts.ref_ctr_offset = *ref; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + skel->bss->pid = getpid(); + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + free(ref); + return NULL; } -static void test_uretprobe_syscall_call(void) +static useconds_t race_msec(void) { - test__skip(); + char *env; + + env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC"); + if (env) + return atoi(env); + + /* default duration is 500ms */ + return 500; } -static void test_uretprobe_shadow_stack(void) +static void test_uprobe_race(void) { - test__skip(); + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + nr_threads = max(2, nr_threads); + + threads = alloca(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? 
worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + usleep(race_msec() * 1000); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); } + +#ifndef __NR_uprobe +#define __NR_uprobe 336 #endif -void test_uprobe_syscall(void) +static void test_uprobe_error(void) +{ + long err = syscall(__NR_uprobe); + + ASSERT_EQ(err, -1, "error"); + ASSERT_EQ(errno, ENXIO, "errno"); +} + +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) - test_uretprobe_regs_equal(); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); + test_uprobe_regs_equal(true); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_session")) + test_uprobe_session(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); + if (test__start_subtest("uprobe_error")) + test_uprobe_error(); + if (test__start_subtest("uprobe_regs_equal")) + test_uprobe_regs_equal(false); + if (test__start_subtest("regs_change")) + test_regs_change(); +} +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); } diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 9057e983cc54..833eb87483a1 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) { } } -static void subtest_basic_usdt(void) +static void subtest_basic_usdt(bool optimized) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; - int err, i; + int err, i, called; + +#define TRIGGER(x) ({ \ + trigger_func(x); \ + if (optimized) \ + trigger_func(x); \ + optimized ? 2 : 1; \ + }) skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -66,11 +73,11 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; - trigger_func(1); + called = TRIGGER(1); - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); @@ -119,11 +126,11 @@ static void subtest_basic_usdt(void) * bpf_program__attach_usdt() handles this properly and attaches to * all possible places of USDT invocation. 
*/ - trigger_func(2); + called += TRIGGER(2); - ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); /* only check values that depend on trigger_func()'s input value */ ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); @@ -142,9 +149,9 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) goto cleanup; - trigger_func(3); + called += TRIGGER(3); - ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); /* this time usdt3 has custom cookie */ ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); @@ -158,6 +165,7 @@ static void subtest_basic_usdt(void) cleanup: test_usdt__destroy(skel); +#undef TRIGGER } unsigned short test_usdt_100_semaphore SEC(".probes"); @@ -425,7 +433,11 @@ cleanup: void test_usdt(void) { if (test__start_subtest("basic")) - subtest_basic_usdt(); + subtest_basic_usdt(false); +#ifdef __x86_64__ + if (test__start_subtest("basic_optimized")) + subtest_basic_usdt(true); +#endif if (test__start_subtest("multispec")) subtest_multispec_usdt(); if (test__start_subtest("urand_auto_attach")) diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c index 8a4fa6c7ef59..e08c31669e5a 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c @@ -7,8 +7,8 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; -SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") -int uretprobe(struct pt_regs *ctx) +SEC("uprobe") +int probe(struct pt_regs *ctx) { __builtin_memcpy(®s, ctx, sizeof(regs)); return 0; diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2..915d38591bf6 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/usdt.bpf.h> #include <string.h> struct pt_regs regs; @@ -8,10 +10,64 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; int executed = 0; +int pid; + +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.session") +int test_uprobe_session(struct pt_regs *ctx) { - executed = 1; + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; return 0; } diff 
--git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e9e918cdf31f..511911053bdc 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -501,14 +501,20 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { #ifdef __x86_64__ static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + +static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -520,6 +526,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h new file mode 100644 index 000000000000..549d1f774810 --- /dev/null +++ b/tools/testing/selftests/bpf/usdt.h @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * This single-header library defines a collection of variadic macros for + * defining and triggering USDTs (User Statically-Defined Tracepoints): + * + * - For USDTs without associated semaphore: + * USDT(group, name, args...) + * + * - For USDTs with implicit (transparent to the user) semaphore: + * USDT_WITH_SEMA(group, name, args...) + * USDT_IS_ACTIVE(group, name) + * + * - For USDTs with explicit (user-defined and provided) semaphore: + * USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...) + * USDT_SEMA_IS_ACTIVE(sema) + * + * all of which emit a NOP instruction into the instruction stream, and so + * have *zero* overhead for the surrounding code. USDTs are identified by + * a combination of `group` and `name` identifiers, which is used by external + * tracing tooling (tracers) for identifying exact USDTs of interest. + * + * USDTs can have an associated (2-byte) activity counter (USDT semaphore), + * automatically maintained by Linux kernel whenever any correctly written + * BPF-based tracer is attached to the USDT. This USDT semaphore can be used + * to check whether there is a need to do any extra data collection and + * processing for a given USDT (if necessary), and otherwise avoid extra work + * for a common case of USDT not being traced ("active"). + * + * See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or + * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on + * working with USDTs with implicitly or explicitly associated + * USDT semaphores, respectively. + * + * There is also some additional data recorded into an auxiliary note + * section. The data in the note section describes the operands, in terms of + * size and location, used by tracing tooling to know where to find USDT + * arguments. Each location is encoded as an assembler operand string. + * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert + * breakpoints on top of the nop, and decode the location operand-strings, + * like an assembler, to find the values being passed. + * + * The operand strings are selected by the compiler for each operand. 
+ * They are constrained by inline-assembler codes.The default is: + * + * #define USDT_ARG_CONSTRAINT nor + * + * This is a good default if the operands tend to be integral and + * moderate in number (smaller than number of registers). In other + * cases, the compiler may report "'asm' requires impossible reload" or + * similar. In this case, consider simplifying the macro call (fewer + * and simpler operands), reduce optimization, or override the default + * constraints string via: + * + * #define USDT_ARG_CONSTRAINT g + * #include <usdt.h> + * + * For some historical description of USDT v3 format (the one used by this + * library and generally recognized and assumed by BPF-based tracing tools) + * see [0]. The more formal specification can be found at [1]. Additional + * argument constraints information can be found at [2]. + * + * Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for + * this USDT library implementation. Current implementation differs *a lot* in + * terms of exposed user API and general usability, which was the main goal + * and focus of the reimplementation work. Nevertheless, underlying recorded + * USDT definitions are fully binary compatible and any USDT-based tooling + * should work equally well with USDTs defined by either SystemTap's or this + * library's USDT implementation. + * + * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html + * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html + * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h + */ +#ifndef __USDT_H +#define __USDT_H + +/* + * Changelog: + * + * 0.1.0 + * ----- + * - Initial release + */ +#define USDT_MAJOR_VERSION 0 +#define USDT_MINOR_VERSION 1 +#define USDT_PATCH_VERSION 0 + +/* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L) +#define __usdt_va_opt 1 +#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__ +#else +#define __usdt_va_args(...) , ##__VA_ARGS__ +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. No USDT semaphore is + * associated with this USDT. + * + * Such "semaphoreless" USDTs are commonly used when there is no extra data + * collection or processing needed to collect and prepare USDT arguments and + * they are just available in the surrounding code. USDT() macro will just + * record their locations in CPU registers or in memory for tracing tooling to + * be able to access them, if necessary. + */ +#ifdef __usdt_va_opt +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__) +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. USDT also get an + * implicitly-defined associated USDT semaphore, which will be "activated" by + * tracing tooling and can be used to check whether USDT is being actively + * observed. + * + * USDTs with semaphore are commonly used when there is a need to perform + * additional data collection and processing to prepare USDT arguments, which + * otherwise might not be necessary for the rest of application logic. 
In such + * case, USDT semaphore can be used to avoid unnecessary extra work. If USDT + * is not traced (which is presumed to be a common situation), the associated + * USDT semaphore is "inactive", and so there is no need to waste resources to + * prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to check whether + * USDT is "active". + * + * N.B. There is an inherent (albeit short) gap between checking whether USDT + * is active and triggering corresponding USDT, in which external tracer can + * be attached to an USDT and activate USDT semaphore after the activity check. + * If such a race occurs, tracers might miss one USDT execution. Tracers are + * expected to accommodate such possibility and this is expected to not be + * a problem for applications and tracers. + * + * N.B. Implicit USDT semaphore defined by USDT_WITH_SEMA() is contained + * within a single executable or shared library and is not shared outside + * them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name + * identifier across executable and shared library, it will work and won't + * conflict, per se, but will define independent USDT semaphores, one for each + * shared library/executable in which USDT_WITH_SEMA(group, name) is used. + * That is, if you attach to this USDT in one shared library (or executable), + * then only USDT semaphore within that shared library (or executable) will be + * updated by the kernel, while other libraries (or executable) will not see + * activated USDT semaphore. In short, it's best to use unique USDT group:name + * identifiers across different shared libraries (and, equivalently, between + * executable and shared library). This is advanced consideration and is + * rarely (if ever) seen in practice, but just to avoid surprises this is + * called out here. (Static libraries become a part of final executable, once + * linked by linker, so the above considerations don't apply to them.) + */ +#ifdef __usdt_va_opt +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name) \ + __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name), \ + ##__VA_ARGS__) +#endif + +struct usdt_sema { volatile unsigned short active; }; + +/* + * Check if USDT with `group`:`name` identifier is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). + * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into the given USDT. It is assumed that USDT is triggered with + * USDT_WITH_SEMA() macro which will implicitly define associated USDT + * semaphore. (If one needs more control over USDT semaphore, see + * USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.) + * + * N.B. Such checks are necessarily racy and speculative. Between checking + * whether USDT is active and triggering the USDT itself, tracer can be + * detached with no notification. This race should be extremely rare and worst + * case should result in one-time wasted extra data collection and processing. 
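 *
 * A minimal usage sketch (the "mylib" group, "request" name, and the helper
 * function below are illustrative placeholders, not part of this header):
 *
 *	if (USDT_IS_ACTIVE(mylib, request)) {
 *		char *summary = build_expensive_summary();
 *
 *		USDT_WITH_SEMA(mylib, request, summary);
 *		free(summary);
 *	}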
+ */ +#define USDT_IS_ACTIVE(group, name) ({ \ + extern struct usdt_sema __usdt_sema_name(group, name) \ + __usdt_asm_name(__usdt_sema_name(group, name)); \ + __usdt_sema_implicit(__usdt_sema_name(group, name)); \ + __usdt_sema_name(group, name).active > 0; \ +}) + +/* + * APIs for working with user-defined explicit USDT semaphores. + * + * This is a less commonly used advanced API for use cases in which user needs + * an explicit control over (potentially shared across multiple USDTs) USDT + * semaphore instance. This can be used when there is a group of logically + * related USDTs that all need extra data collection and processing whenever + * any of a family of related USDTs are "activated" (i.e., traced). In such + * a case, all such related USDTs will be associated with the same shared USDT + * semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be + * triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra + * USDT semaphore identifier as an extra parameter. + */ + +/** + * Underlying C global variable name for user-defined USDT semaphore with + * `sema` identifier. Could be useful for debugging, but normally shouldn't be + * used explicitly. + */ +#define USDT_SEMA(sema) __usdt_sema_##sema + +/* + * Define storage for user-defined USDT semaphore `sema`. + * + * Should be used only once in non-header source file to let compiler allocate + * space for the semaphore variable. Just like with any other global variable. + * + * This macro can be used anywhere where global variable declaration is + * allowed. Just like with global variable definitions, there should be only + * one definition of user-defined USDT semaphore with given `sema` identifier, + * otherwise compiler or linker will complain about duplicate variable + * definition. + * + * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace + * and inside namespaces (including nested namespaces). Just make sure that + * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is + * referenced, or any of its parent namespaces, so the C++ language-level + * identifier is visible to the code that needs to reference the semaphore. + * At the lowest layer, USDT semaphores have global naming and visibility + * (they have a corresponding `__usdt_sema_<name>` symbol, which can be linked + * against from C or C++ code, if necessary). To keep it simple, putting + * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest + * no-brainer solution. All these aspects are irrelevant for plain C, because + * C doesn't have namespaces and everything is always in the global namespace. + * + * N.B. Due to USDT metadata being recorded in non-allocatable ELF note + * section, it has limitations when it comes to relocations, which, in + * practice, means that it's not possible to correctly share USDT semaphores + * between main executable and shared libraries, or even between multiple + * shared libraries. USDT semaphore has to be contained to individual shared + * library or executable to avoid unpleasant surprises with half-working USDT + * semaphores. We enforce this by marking semaphore ELF symbols as having + * a hidden visibility. This is quite an advanced use case and consideration + * and for most users this should have no consequences whatsoever. 
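 *
 * For illustration, a minimal sketch of the explicit-semaphore flow (the
 * "mylib" group, the "evt_*" names, and the helper function below are
 * placeholders):
 *
 *	USDT_DEFINE_SEMA(mylib_sema);	(in exactly one .c file)
 *
 *	if (USDT_SEMA_IS_ACTIVE(mylib_sema)) {
 *		long cost = compute_extra_cost();
 *
 *		USDT_WITH_EXPLICIT_SEMA(mylib_sema, mylib, evt_enter, cost);
 *		USDT_WITH_EXPLICIT_SEMA(mylib_sema, mylib, evt_exit, cost);
 *	}
 *
 * Other compilation units triggering these USDTs would reference the same
 * semaphore via USDT_DECLARE_SEMA(mylib_sema).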
+ */ +#define USDT_DEFINE_SEMA(sema) \ + struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \ + __usdt_asm_name(USDT_SEMA(sema)) \ + __attribute__((visibility("hidden"))) = { 0 } + +/* + * Declare extern reference to user-defined USDT semaphore `sema`. + * + * Refers to a variable defined in another compilation unit by + * USDT_DEFINE_SEMA() and allows to use the same USDT semaphore across + * multiple compilation units (i.e., .c and .cpp files). + * + * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities. + */ +#define USDT_DECLARE_SEMA(sema) \ + extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema)) + +/* + * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). + * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into USDT(s) associated with USDT semaphore `sema`. + * + * N.B. Such checks are necessarily racy. Between checking the state of USDT + * semaphore and triggering associated USDT(s), the active tracer might attach + * or detach. This race should be extremely rare and worst case should result + * in one-time missed USDT event or wasted extra data collection and + * processing. USDT-using tracers should be written with this in mind and is + * not a concern of the application defining USDTs with associated semaphore. + */ +#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0) + +/* + * Invoke USDT specified by `group` and `name` identifiers and associate + * explicitly user-defined semaphore `sema` with it. Pass through `args` as + * USDT arguments. `args` are optional and zero arguments are acceptable. + * + * Semaphore is defined with the help of USDT_DEFINE_SEMA() macro and can be + * checked whether active with USDT_SEMA_IS_ACTIVE(). + */ +#ifdef __usdt_va_opt +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__) +#else +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__) +#endif + +/* + * Adjustable implementation aspects + */ +#ifndef USDT_ARG_CONSTRAINT +#if defined __powerpc__ +#define USDT_ARG_CONSTRAINT nZr +#elif defined __arm__ +#define USDT_ARG_CONSTRAINT g +#elif defined __loongarch__ +#define USDT_ARG_CONSTRAINT nmr +#else +#define USDT_ARG_CONSTRAINT nor +#endif +#endif /* USDT_ARG_CONSTRAINT */ + +#ifndef USDT_NOP +#if defined(__ia64__) || defined(__s390__) || defined(__s390x__) +#define USDT_NOP nop 0 +#else +#define USDT_NOP nop +#endif +#endif /* USDT_NOP */ + +/* + * Implementation details + */ +/* USDT name for implicitly-defined USDT semaphore, derived from group:name */ +#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name +/* ELF section into which USDT semaphores are put */ +#define __usdt_sema_sec __attribute__((section(".probes"))) + +#define __usdt_concat(a, b) a ## b +#define __usdt_apply(fn, n) __usdt_concat(fn, n) + +#ifndef __usdt_nth +#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N +#endif + +#ifndef __usdt_narg +#ifdef __usdt_va_opt +#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#else +#define __usdt_narg(...) 
__usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif +#endif /* __usdt_narg */ + +#define __usdt_hash # +#define __usdt_str_(x) #x +#define __usdt_str(x) __usdt_str_(x) + +#ifndef __usdt_asm_name +#define __usdt_asm_name(name) __asm__(__usdt_str(name)) +#endif + +#define __usdt_asm0() "\n" +#define __usdt_asm1(x) __usdt_str(x) "\n" +#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__) +#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__) +#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__) +#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__) +#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__) +#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__) +#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__) +#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__) +#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__) +#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__) +#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__) +#define __usdt_asm(...) __usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#ifdef __LP64__ +#define __usdt_asm_addr .8byte +#else +#define __usdt_asm_addr .4byte +#endif + +#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x) +#define __usdt_asm_strz(x) __usdt_asm_strz_(x) +#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x) +#define __usdt_asm_str(x) __usdt_asm_str_(x) + +/* "semaphoreless" USDT case */ +#ifndef __usdt_sema_none +#define __usdt_sema_none(sema) +#endif + +/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */ +#ifndef __usdt_sema_implicit +#define __usdt_sema_implicit(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm1(.ifndef sema) \ + __usdt_asm3( .pushsection .probes, "aw", "progbits") \ + __usdt_asm1( .weak sema) \ + __usdt_asm1( .hidden sema) \ + __usdt_asm1( .align 2) \ + __usdt_asm1(sema:) \ + __usdt_asm1( .zero 2) \ + __usdt_asm2( .type sema, @object) \ + __usdt_asm2( .size sema, 2) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + ); +#endif + +/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */ +#ifndef __usdt_sema_explicit +#define __usdt_sema_explicit(sema) \ + __asm__ __volatile__ ("" :: "m" (sema)); +#endif + +/* main USDT definition (nop and .note.stapsdt metadata) */ +#define __usdt_probe(group, name, sema_def, sema, ...) 
do { \ + sema_def(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm( 990: USDT_NOP) \ + __usdt_asm3( .pushsection .note.stapsdt, "", "note") \ + __usdt_asm1( .balign 4) \ + __usdt_asm3( .4byte 992f-991f,994f-993f,3) \ + __usdt_asm1(991: .asciz "stapsdt") \ + __usdt_asm1(992: .balign 4) \ + __usdt_asm1(993: __usdt_asm_addr 990b) \ + __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \ + __usdt_asm1( __usdt_asm_addr sema) \ + __usdt_asm_strz(group) \ + __usdt_asm_strz(name) \ + __usdt_asm_args(__VA_ARGS__) \ + __usdt_asm1( .ascii "\0") \ + __usdt_asm1(994: .balign 4) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.ifndef _.stapsdt.base) \ + __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\ + __usdt_asm1( .weak _.stapsdt.base) \ + __usdt_asm1( .hidden _.stapsdt.base) \ + __usdt_asm1(_.stapsdt.base:) \ + __usdt_asm1( .space 1) \ + __usdt_asm2( .size _.stapsdt.base, 1) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + :: __usdt_asm_ops(__VA_ARGS__) \ + ); \ +} while (0) + +/* + * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + * operand note format. + * + * The named register may be a longer or shorter (!) alias for the + * storage where the value in question is found. For example, on + * i386, 64-bit value may be put in register pairs, and a register + * name stored would identify just one of them. Previously, gcc was + * asked to emit the %w[id] (16-bit alias of some registers holding + * operands), even when a wider 32-bit value was used. + * + * Bottom line: the byte-width given before the @ sign governs. If + * there is a mismatch between that width and that of the named + * register, then a sys/sdt.h note consumer may need to employ + * architecture-specific heuristics to figure out where the compiler + * has actually put the complete value. + */ +#if defined(__powerpc__) || defined(__powerpc64__) +#define __usdt_argref(id) %I[id]%[id] +#elif defined(__i386__) +#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +#define __usdt_argref(id) %[id] +#endif + +#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \ + __usdt_asm1(.ascii "@") \ + __usdt_asm_str(__usdt_argref(__usdt_aval##n)) + +#define __usdt_asm_args0 /* no arguments */ +#define __usdt_asm_args1 __usdt_asm_arg(1) +#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2) +#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3) +#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4) +#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5) +#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6) +#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7) +#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8) +#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9) +#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10) +#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11) +#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12) +#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__)) + +#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5) +#define __usdt_arg_size(x) (__usdt_is_arr(x) ? 
sizeof(void *) : sizeof(x)) + +/* + * We can't use __builtin_choose_expr() in C++, so fall back to table-based + * signedness determination for known types, utilizing templates magic. + */ +#ifdef __cplusplus + +#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed) + +#include <cstddef> + +template<typename T> struct __usdt_t { static const bool is_signed = false; }; +template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {}; +template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {}; + +#define __usdt_def_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; } +#define __usdt_maybe_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; } + +__usdt_def_signed(signed char); +__usdt_def_signed(short); +__usdt_def_signed(int); +__usdt_def_signed(long); +__usdt_def_signed(long long); +__usdt_maybe_signed(char); +__usdt_maybe_signed(wchar_t); + +#else /* !__cplusplus */ + +#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4) +#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U)) +#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1) + +#endif /* __cplusplus */ + +#define __usdt_asm_op(n, x) \ + [__usdt_asz##n] "n" ((__usdt_is_signed(x) ? (int)-1 : 1) * (int)__usdt_arg_size(x)), \ + [__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x) + +#define __usdt_asm_ops0() [__usdt_dummy] "g" (0) +#define __usdt_asm_ops1(x) __usdt_asm_op(1, x) +#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x) +#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x) +#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x) +#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x) +#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x) +#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x) +#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x) +#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x) +#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x) +#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x) +#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x) +#define __usdt_asm_ops(...) 
__usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#endif /* __USDT_H */ diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 0e89fcff4d05..44c52f620fda 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -522,6 +522,18 @@ int proc_mount_contains(const char *option) return strstr(buf, option) != NULL; } +int cgroup_feature(const char *feature) +{ + char buf[PAGE_SIZE]; + ssize_t read; + + read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf)); + if (read < 0) + return read; + + return strstr(buf, feature) != NULL; +} + ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) { char path[PATH_MAX]; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index c69cab66254b..9dc90a1b386d 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -60,6 +60,7 @@ extern int cg_run_nowait(const char *cgroup, extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); int proc_mount_contains(const char *option); +int cgroup_feature(const char *feature); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); extern pid_t clone_into_cgroup(int cgroup_fd); diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 8730645d363a..dfb763819581 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -804,6 +804,662 @@ cleanup: return ret; } +/* + * Get the current frozen_usec for the cgroup. + */ +static long cg_check_freezetime(const char *cgroup) +{ + return cg_read_key_long(cgroup, "cgroup.stat.local", + "frozen_usec "); +} + +/* + * Test that the freeze time will behave as expected for an empty cgroup. + */ +static int test_cgfreezer_time_empty(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_empty"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create an empty cgroup and check that its freeze time + * is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 2) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 3) Unfreeze the cgroup. Check that the freeze time is + * larger than at 2). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Check the freeze time again to ensure that it has not + * changed. 
+ */ + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * A simple test for cgroup freezer time accounting. This test follows + * the same flow as test_cgfreezer_time_empty, but with a single process + * in the cgroup. + */ +static int test_cgfreezer_time_simple(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_simple"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create a cgroup and check that its freeze time is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Populate the cgroup with one child and check that the + * freeze time is still 0. + */ + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr > prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 3) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 4) Unfreeze the cgroup. Check that the freeze time is + * larger than at 3). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Sleep for 1000 us. Check that the freeze time is the + * same as at 4). + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that freezer time accounting works as expected, even while we're + * populating a cgroup with processes. + */ +static int test_cgfreezer_time_populate(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + int i; + + cgroup = cg_name(root, "cg_time_test_populate"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 1) Populate the cgroup with 100 processes. Check that + * the freeze time is 0. + */ + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Wait for the group to become fully populated. Check + * that the freeze time is 0. + */ + if (cg_wait_for_proc_count(cgroup, 100)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 3) Freeze the cgroup and then populate it with 100 more + * processes. 
Check that the freeze time continues to grow. + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Wait for the group to become fully populated. Check + * that the freeze time is larger than at 3). + */ + if (cg_wait_for_proc_count(cgroup, 200)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Unfreeze the cgroup. Check that the freeze time is + * larger than at 4). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 6) Kill the processes. Check that the freeze time is the + * same as it was at 5). + */ + if (cg_killall(cgroup)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 7) Freeze and unfreeze the cgroup. Check that the freeze + * time is larger than it was at 6). + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that frozen time for a cgroup continues to work as expected, + * even as processes are migrated. Frozen cgroup A's freeze time should + * continue to increase and running cgroup B's should stay 0. + */ +static int test_cgfreezer_time_migrate(const char *root) +{ + long prev_A, curr_A, curr_B; + char *cgroup[2] = {0}; + int ret = KSFT_FAIL; + int pid; + + cgroup[0] = cg_name(root, "cg_time_test_migrate_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(root, "cg_time_test_migrate_B"); + if (!cgroup[1]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + pid = cg_run_nowait(cgroup[0], child_fn, NULL); + if (pid < 0) + goto cleanup; + + if (cg_wait_for_proc_count(cgroup[0], 1)) + goto cleanup; + + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A) { + debug("Expect time (%ld) to be 0\n", curr_A); + goto cleanup; + } + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + /* + * Freeze cgroup A. + */ + if (cg_freeze_wait(cgroup[0], true)) + goto cleanup; + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be > 0\n", curr_A); + goto cleanup; + } + + /* + * Migrate from A (frozen) to B (running). 
+ */ + if (cg_enter(cgroup[1], pid)) + goto cleanup; + + usleep(1000); + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr_A, prev_A); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup[0]) + cg_destroy(cgroup[0]); + free(cgroup[0]); + if (cgroup[1]) + cg_destroy(cgroup[1]); + free(cgroup[1]); + return ret; +} + +/* + * The test creates a cgroup and freezes it. Then it creates a child cgroup. + * After that it checks that the child cgroup has a non-zero freeze time + * that is less than the parent's. Next, it freezes the child, unfreezes + * the parent, and sleeps. Finally, it checks that the child's freeze + * time has grown larger than the parent's. + */ +static int test_cgfreezer_time_parent(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_parent_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_parent_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_freeze_wait(parent, true)) + goto cleanup; + + usleep(1000); + if (cg_create(child)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + /* + * Since the parent was frozen the entire time the child cgroup + * was being created, we expect the parent's freeze time to be + * larger than the child's. + * + * Ideally, we would be able to check both times simultaneously, + * but here we get the child's after we get the parent's. + */ + ptime = cg_check_freezetime(parent); + ctime = cg_check_freezetime(child); + if (ptime <= ctime) { + debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime); + goto cleanup; + } + + if (cg_freeze_nowait(child, true)) + goto cleanup; + + if (cg_freeze_wait(parent, false)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + usleep(100000); + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + + if (ctime <= ptime) { + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates a parent cgroup and a child cgroup. Then, it freezes + * the child and checks that the child's freeze time is greater than the + * parent's, which should be zero. 
+ */ +static int test_cgfreezer_time_child(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_child_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_child_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(child)) + goto cleanup; + + if (cg_freeze_wait(child, true)) + goto cleanup; + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + if (ptime != 0) { + debug("Expect ptime (%ld) to be 0\n", ptime); + goto cleanup; + } + + if (ctime <= ptime) { + debug("Expect ctime (%ld) <= ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates the following hierarchy: + * A + * | + * B + * | + * C + * + * Then it freezes the cgroups in the order C, B, A. + * Then it unfreezes the cgroups in the order A, B, C. + * Then it checks that C's freeze time is larger than B's and + * that B's is larger than A's. + */ +static int test_cgfreezer_time_nested(const char *root) +{ + char *cgroup[3] = {0}; + int ret = KSFT_FAIL; + long time[3] = {0}; + int i; + + cgroup[0] = cg_name(root, "cg_test_time_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(cgroup[0], "B"); + if (!cgroup[1]) + goto cleanup; + + cgroup[2] = cg_name(cgroup[1], "C"); + if (!cgroup[2]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + if (cg_create(cgroup[2])) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[0], true)) + goto cleanup; + + usleep(1000); + + if (cg_freeze_nowait(cgroup[0], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], false)) + goto cleanup; + + time[2] = cg_check_freezetime(cgroup[2]); + time[1] = cg_check_freezetime(cgroup[1]); + time[0] = cg_check_freezetime(cgroup[0]); + + if (time[2] <= time[1]) { + debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]); + goto cleanup; + } + + if (time[1] <= time[0]) { + debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + for (i = 2; i >= 0 && cgroup[i]; i--) { + cg_destroy(cgroup[i]); + free(cgroup[i]); + } + + return ret; +} + #define T(x) { x, #x } struct cgfreezer_test { int (*fn)(const char *root); @@ -819,6 +1475,13 @@ struct cgfreezer_test { T(test_cgfreezer_stopped), T(test_cgfreezer_ptraced), T(test_cgfreezer_vfork), + T(test_cgfreezer_time_empty), + T(test_cgfreezer_time_simple), + T(test_cgfreezer_time_populate), + T(test_cgfreezer_time_migrate), + T(test_cgfreezer_time_parent), + T(test_cgfreezer_time_child), + T(test_cgfreezer_time_nested), }; #undef T diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c index 9ecb83c6cc5c..d8a1d1cd5007 100644 --- a/tools/testing/selftests/cgroup/test_pids.c +++ b/tools/testing/selftests/cgroup/test_pids.c @@ -77,6 +77,9 @@ static int test_pids_events(const char *root) char *cg_parent = NULL, *cg_child = NULL; int pid; + if 
(cgroup_feature("pids_localevents") <= 0) + return KSFT_SKIP; + cg_parent = cg_name(root, "pids_parent"); cg_child = cg_name(cg_parent, "pids_child"); if (!cg_parent || !cg_child) diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 8cfb87f7f7c5..490ace1f017e 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,12 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 +PKG_CONFIG ?= pkg-config +LIBNUMA_TEST = $(shell sh -c "$(PKG_CONFIG) numa --atleast-version 2.0.16 > /dev/null 2>&1 && echo SUFFICIENT || echo NO") + INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) -CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES) +CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 $(INCLUDES) $(KHDR_INCLUDES) -DLIBNUMA_VER_$(LIBNUMA_TEST)=1 LDLIBS := -lpthread -lrt -lnuma LOCAL_HDRS := \ ../include/futextest.h \ - ../include/atomic.h \ - ../include/logging.h + ../include/atomic.h TEST_GEN_PROGS := \ futex_wait_timeout \ futex_wait_wouldblock \ diff --git a/tools/testing/selftests/futex/functional/futex_numa.c b/tools/testing/selftests/futex/functional/futex_numa.c index f29e4d627e79..e0a33510ccb6 100644 --- a/tools/testing/selftests/futex/functional/futex_numa.c +++ b/tools/testing/selftests/futex/functional/futex_numa.c @@ -5,9 +5,10 @@ #include <sys/mman.h> #include <fcntl.h> #include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> #include <time.h> #include <assert.h> -#include "logging.h" #include "futextest.h" #include "futex2test.h" diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index a9ecfb2d3932..d037a3f10ee8 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -16,9 +16,9 @@ #include <linux/futex.h> #include <sys/mman.h> -#include "logging.h" #include "futextest.h" #include "futex2test.h" +#include "../../kselftest_harness.h" #define MAX_THREADS 64 @@ -77,7 +77,7 @@ static void join_max_threads(void) } } -static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flags) +static void __test_futex(void *futex_ptr, int err_value, unsigned int futex_flags) { int to_wake, ret, i, need_exit = 0; @@ -88,11 +88,17 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag do { ret = futex2_wake(futex_ptr, to_wake, futex_flags); - if (must_fail) { - if (ret < 0) - break; - ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", - to_wake, futex_flags); + + if (err_value) { + if (ret >= 0) + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", + to_wake, futex_flags); + + if (errno != err_value) + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) expected error was %d, but returned %d (%s)\n", + to_wake, futex_flags, err_value, errno, strerror(errno)); + + break; } if (ret < 0) { ksft_exit_fail_msg("Failed futex2_wake(%d, 0x%x): %m\n", @@ -106,12 +112,12 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag join_max_threads(); for (i = 0; i < MAX_THREADS; i++) { - if (must_fail && thread_args[i].result != -1) { + if (err_value && thread_args[i].result != -1) { ksft_print_msg("Thread %d should fail but succeeded (%d)\n", i, thread_args[i].result); need_exit = 1; } - if (!must_fail && thread_args[i].result != 0) { + if (!err_value && thread_args[i].result != 0) 
{ ksft_print_msg("Thread %d failed (%d)\n", i, thread_args[i].result); need_exit = 1; } @@ -120,58 +126,30 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag ksft_exit_fail_msg("Aborting due to earlier errors.\n"); } -static void test_futex(void *futex_ptr, int must_fail) +static void test_futex(void *futex_ptr, int err_value) { - __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); } -static void test_futex_mpol(void *futex_ptr, int must_fail) +static void test_futex_mpol(void *futex_ptr, int err_value) { - __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); } -static void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - -int main(int argc, char *argv[]) +TEST(futex_numa_mpol) { struct futex32_numa *futex_numa; - int mem_size, i; void *futex_ptr; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); + int mem_size; mem_size = sysconf(_SC_PAGE_SIZE); - futex_ptr = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + futex_ptr = mmap(NULL, mem_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (futex_ptr == MAP_FAILED) ksft_exit_fail_msg("mmap() for %d bytes failed\n", mem_size); + /* Create an invalid memory region for the "Memory out of range" test */ + mprotect(futex_ptr + mem_size, mem_size, PROT_NONE); + futex_numa = futex_ptr; ksft_print_msg("Regular test\n"); @@ -182,27 +160,31 @@ int main(int argc, char *argv[]) if (futex_numa->numa == FUTEX_NO_NODE) ksft_exit_fail_msg("NUMA node is left uninitialized\n"); - ksft_print_msg("Memory too small\n"); - test_futex(futex_ptr + mem_size - 4, 1); + /* FUTEX2_NUMA futex must be 8-byte aligned */ + ksft_print_msg("Mis-aligned futex\n"); + test_futex(futex_ptr + mem_size - 4, EINVAL); ksft_print_msg("Memory out of range\n"); - test_futex(futex_ptr + mem_size, 1); + test_futex(futex_ptr + mem_size, EFAULT); futex_numa->numa = FUTEX_NO_NODE; mprotect(futex_ptr, mem_size, PROT_READ); ksft_print_msg("Memory, RO\n"); - test_futex(futex_ptr, 1); + test_futex(futex_ptr, EFAULT); mprotect(futex_ptr, mem_size, PROT_NONE); ksft_print_msg("Memory, no access\n"); - test_futex(futex_ptr, 1); + test_futex(futex_ptr, EFAULT); mprotect(futex_ptr, mem_size, PROT_READ | PROT_WRITE); ksft_print_msg("Memory back to RW\n"); test_futex(futex_ptr, 0); + ksft_test_result_pass("futex2 memory boundary tests passed\n"); + /* MPOL test. 
Does not work as expected */ - for (i = 0; i < 4; i++) { +#ifdef LIBNUMA_VER_SUFFICIENT + for (int i = 0; i < 4; i++) { unsigned long nodemask; int ret; @@ -221,15 +203,17 @@ int main(int argc, char *argv[]) ret = futex2_wake(futex_ptr, 0, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); if (ret < 0) ksft_test_result_fail("Failed to wake 0 with MPOL: %m\n"); - if (0) - test_futex_mpol(futex_numa, 0); if (futex_numa->numa != i) { ksft_exit_fail_msg("Returned NUMA node is %d expected %d\n", futex_numa->numa, i); } } } - ksft_test_result_pass("NUMA MPOL tests passed\n"); - ksft_finished(); - return 0; + ksft_test_result_pass("futex2 MPOL hints test passed\n"); +#else + ksft_test_result_skip("futex2 MPOL hints test requires libnuma 2.0.16+\n"); +#endif + munmap(futex_ptr, mem_size * 2); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index aea001ac4946..3b7b5851f290 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -14,7 +14,7 @@ #include <linux/prctl.h> #include <sys/prctl.h> -#include "logging.h" +#include "../../kselftest_harness.h" #define MAX_THREADS 64 @@ -128,46 +128,14 @@ static void futex_dummy_op(void) ksft_exit_fail_msg("pthread_mutex_timedlock() did not timeout: %d.\n", ret); } -static void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -g Test global hash instead intead local immutable \n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - static const char *test_msg_auto_create = "Automatic hash bucket init on thread creation.\n"; static const char *test_msg_auto_inc = "Automatic increase with more than 16 CPUs\n"; -int main(int argc, char *argv[]) +TEST(priv_hash) { int futex_slots1, futex_slotsn, online_cpus; pthread_mutexattr_t mutex_attr_pi; int ret, retry = 20; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(21); ret = pthread_mutexattr_init(&mutex_attr_pi); ret |= pthread_mutexattr_setprotocol(&mutex_attr_pi, PTHREAD_PRIO_INHERIT); @@ -189,14 +157,14 @@ int main(int argc, char *argv[]) if (ret != 0) ksft_exit_fail_msg("pthread_join() failed: %d, %m\n", ret); - /* First thread, has to initialiaze private hash */ + /* First thread, has to initialize private hash */ futex_slots1 = futex_hash_slots_get(); if (futex_slots1 <= 0) { ksft_print_msg("Current hash buckets: %d\n", futex_slots1); - ksft_exit_fail_msg(test_msg_auto_create); + ksft_exit_fail_msg("%s", test_msg_auto_create); } - ksft_test_result_pass(test_msg_auto_create); + ksft_test_result_pass("%s", test_msg_auto_create); online_cpus = sysconf(_SC_NPROCESSORS_ONLN); ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS + 1); @@ -237,11 +205,11 @@ retry_getslots: } ksft_print_msg("Expected increase of hash buckets but got: %d -> %d\n", futex_slots1, futex_slotsn); - ksft_exit_fail_msg(test_msg_auto_inc); + ksft_exit_fail_msg("%s", test_msg_auto_inc); } - ksft_test_result_pass(test_msg_auto_inc); + ksft_test_result_pass("%s", test_msg_auto_inc); } else { - 
ksft_test_result_skip(test_msg_auto_inc); + ksft_test_result_skip("%s", test_msg_auto_inc); } ret = pthread_mutex_unlock(&global_lock); @@ -257,17 +225,17 @@ retry_getslots: futex_hash_slots_set_verify(2); join_max_threads(); - ksft_test_result(counter == MAX_THREADS, "Created of waited for %d of %d threads\n", + ksft_test_result(counter == MAX_THREADS, "Created and waited for %d of %d threads\n", counter, MAX_THREADS); counter = 0; - /* Once the user set something, auto reisze must be disabled */ + /* Once the user set something, auto resize must be disabled */ ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); create_max_threads(thread_lock_fn); join_max_threads(); ret = futex_hash_slots_get(); - ksft_test_result(ret == 2, "No more auto-resize after manaul setting, got %d\n", + ksft_test_result(ret == 2, "No more auto-resize after manual setting, got %d\n", ret); futex_hash_slots_set_must_fail(1 << 29); @@ -280,7 +248,7 @@ retry_getslots: ret = futex_hash_slots_set(0); ksft_test_result(ret == 0, "Global hash request\n"); if (ret != 0) - goto out; + return; futex_hash_slots_set_must_fail(4); futex_hash_slots_set_must_fail(8); @@ -289,17 +257,14 @@ retry_getslots: futex_hash_slots_set_must_fail(6); ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); - if (ret != 0) { + if (ret != 0) ksft_exit_fail_msg("pthread_barrier_init failed: %m\n"); - return 1; - } + create_max_threads(thread_lock_fn); join_max_threads(); ret = futex_hash_slots_get(); ksft_test_result(ret == 0, "Continue to use global hash\n"); - -out: - ksft_finished(); - return 0; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue.c b/tools/testing/selftests/futex/functional/futex_requeue.c index 51485be6eb2f..69e2555b6039 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue.c +++ b/tools/testing/selftests/futex/functional/futex_requeue.c @@ -7,24 +7,15 @@ #include <pthread.h> #include <limits.h> -#include "logging.h" + #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 volatile futex_t *f1; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *waiterfn(void *arg) { struct timespec to; @@ -38,67 +29,49 @@ void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(requeue_single) { - pthread_t waiter[10]; - int res, ret = RET_PASS; - int c, i; volatile futex_t _f1 = 0; volatile futex_t f2 = 0; + pthread_t waiter[10]; + int res; f1 = &_f1; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); - ksft_print_msg("%s: Test futex_requeue\n", - basename(argv[0])); - /* * Requeue a waiter from f1 to f2, and wake f2. 
*/ if (pthread_create(&waiter[0], NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Requeuing 1 futex from f1 to f2\n"); + ksft_print_dbg_msg("Requeuing 1 futex from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 0, 1, 0); - if (res != 1) { + if (res != 1) ksft_test_result_fail("futex_requeue simple returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; - } - - info("Waking 1 futex at f2\n"); + ksft_print_dbg_msg("Waking 1 futex at f2\n"); res = futex_wake(&f2, 1, 0); if (res != 1) { ksft_test_result_fail("futex_requeue simple returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_requeue simple succeeds\n"); } +} + +TEST(requeue_multiple) +{ + volatile futex_t _f1 = 0; + volatile futex_t f2 = 0; + pthread_t waiter[10]; + int res, i; + f1 = &_f1; /* * Create 10 waiters at f1. At futex_requeue, wake 3 and requeue 7. @@ -106,31 +79,28 @@ int main(int argc, char *argv[]) */ for (i = 0; i < 10; i++) { if (pthread_create(&waiter[i], NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); } usleep(WAKE_WAIT_US); - info("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); + ksft_print_dbg_msg("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 3, 7, 0); if (res != 10) { ksft_test_result_fail("futex_requeue many returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } - info("Waking INT_MAX futexes at f2\n"); + ksft_print_dbg_msg("Waking INT_MAX futexes at f2\n"); res = futex_wake(&f2, INT_MAX, 0); if (res != 7) { ksft_test_result_fail("futex_requeue many returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_requeue many succeeds\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c index 215c6cb539b4..f299d75848cd 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c @@ -26,11 +26,11 @@ #include <stdlib.h> #include <signal.h> #include <string.h> + #include "atomic.h" #include "futextest.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue-pi" #define MAX_WAKE_ITERS 1000 #define THREAD_MAX 10 #define SIGNAL_PERIOD_US 100 @@ -42,12 +42,6 @@ futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; futex_t wake_complete = FUTEX_INITIALIZER; -/* Test option defaults */ -static long timeout_ns; -static int broadcast; -static int owner; -static int locked; - struct thread_arg { long id; struct timespec *timeout; @@ -56,18 +50,73 @@ struct thread_arg { }; #define THREAD_ARG_INITIALIZER { 0, NULL, 0, 0 } -void usage(char *prog) +FIXTURE(args) { - printf("Usage: %s\n", prog); - printf(" -b Broadcast wakeup (all waiters)\n"); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -l Lock the pi futex across requeue\n"); - printf(" -o Use a third party pi futex owner during requeue (cancels -l)\n"); - printf(" -t N Timeout in nanoseconds (default: 0)\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} +}; + +FIXTURE_SETUP(args) +{ +}; + +FIXTURE_TEARDOWN(args) +{ +}; + +FIXTURE_VARIANT(args) +{ + long timeout_ns; + bool broadcast; + bool owner; + bool locked; +}; + +/* + * For a given timeout value, this macro creates a test input with all the + * possible combinations of valid arguments + */ +#define FIXTURE_VARIANT_ADD_TIMEOUT(timeout) \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout) \ +{ \ + .timeout_ns = timeout, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast_locked) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ + .locked = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast_owner) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ + .owner = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_locked) \ +{ \ + .timeout_ns = timeout, \ + .locked = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_owner) \ +{ \ + .timeout_ns = timeout, \ + .owner = true, \ +}; \ + +FIXTURE_VARIANT_ADD_TIMEOUT(0); +FIXTURE_VARIANT_ADD_TIMEOUT(5000); +FIXTURE_VARIANT_ADD_TIMEOUT(500000); +FIXTURE_VARIANT_ADD_TIMEOUT(2000000000); int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, int policy, int prio) @@ -81,26 +130,26 @@ int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); if (ret) { - error("pthread_attr_setinheritsched\n", ret); + ksft_exit_fail_msg("pthread_attr_setinheritsched\n"); return -1; } ret = pthread_attr_setschedpolicy(&attr, policy); if (ret) { - error("pthread_attr_setschedpolicy\n", ret); + ksft_exit_fail_msg("pthread_attr_setschedpolicy\n"); return -1; } schedp.sched_priority = prio; ret = pthread_attr_setschedparam(&attr, &schedp); if (ret) { - error("pthread_attr_setschedparam\n", ret); 
+ ksft_exit_fail_msg("pthread_attr_setschedparam\n"); return -1; } ret = pthread_create(pth, &attr, func, arg); if (ret) { - error("pthread_create\n", ret); + ksft_exit_fail_msg("pthread_create\n"); return -1; } return 0; @@ -112,7 +161,7 @@ void *waiterfn(void *arg) struct thread_arg *args = (struct thread_arg *)arg; futex_t old_val; - info("Waiter %ld: running\n", args->id); + ksft_print_dbg_msg("Waiter %ld: running\n", args->id); /* Each thread sleeps for a different amount of time * This is to avoid races, because we don't lock the * external mutex here */ @@ -120,26 +169,25 @@ void *waiterfn(void *arg) old_val = f1; atomic_inc(&waiters_blocked); - info("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", + ksft_print_dbg_msg("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", &f1, f1, &f2); args->ret = futex_wait_requeue_pi(&f1, old_val, &f2, args->timeout, FUTEX_PRIVATE_FLAG); - info("waiter %ld woke with %d %s\n", args->id, args->ret, + ksft_print_dbg_msg("waiter %ld woke with %d %s\n", args->id, args->ret, args->ret < 0 ? strerror(errno) : ""); atomic_inc(&waiters_woken); if (args->ret < 0) { if (args->timeout && errno == ETIMEDOUT) args->ret = 0; else { - args->ret = RET_ERROR; - error("futex_wait_requeue_pi\n", errno); + ksft_exit_fail_msg("futex_wait_requeue_pi\n"); } futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - info("Waiter %ld: exiting with %d\n", args->id, args->ret); + ksft_print_dbg_msg("Waiter %ld: exiting with %d\n", args->id, args->ret); pthread_exit((void *)&args->ret); } @@ -152,14 +200,14 @@ void *broadcast_wakerfn(void *arg) int nr_wake = 1; int i = 0; - info("Waker: waiting for waiters to block\n"); + ksft_print_dbg_msg("Waker: waiting for waiters to block\n"); while (waiters_blocked.val < THREAD_MAX) usleep(1000); usleep(1000); - info("Waker: Calling broadcast\n"); + ksft_print_dbg_msg("Waker: Calling broadcast\n"); if (args->lock) { - info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } continue_requeue: @@ -167,16 +215,14 @@ void *broadcast_wakerfn(void *arg) args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, nr_wake, nr_requeue, FUTEX_PRIVATE_FLAG); if (args->ret < 0) { - args->ret = RET_ERROR; - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); } else if (++i < MAX_WAKE_ITERS) { task_count += args->ret; if (task_count < THREAD_MAX - waiters_woken.val) goto continue_requeue; } else { - error("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", - 0, MAX_WAKE_ITERS, task_count, THREAD_MAX); - args->ret = RET_ERROR; + ksft_exit_fail_msg("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", + MAX_WAKE_ITERS, task_count, THREAD_MAX); } futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG); @@ -187,7 +233,7 @@ void *broadcast_wakerfn(void *arg) if (args->ret > 0) args->ret = task_count; - info("Waker: exiting with %d\n", args->ret); + ksft_print_dbg_msg("Waker: exiting with %d\n", args->ret); pthread_exit((void *)&args->ret); } @@ -200,20 +246,20 @@ void *signal_wakerfn(void *arg) int nr_wake = 1; int i = 0; - info("Waker: waiting for waiters to block\n"); + ksft_print_dbg_msg("Waker: waiting for waiters to block\n"); while (waiters_blocked.val < THREAD_MAX) usleep(1000); usleep(1000); while (task_count < THREAD_MAX && waiters_woken.val < THREAD_MAX) { - info("task_count: %d, waiters_woken: %d\n", 
+ ksft_print_dbg_msg("task_count: %d, waiters_woken: %d\n", task_count, waiters_woken.val); if (args->lock) { - info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", - f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", + f2, &f2); futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } - info("Waker: Calling signal\n"); + ksft_print_dbg_msg("Waker: Calling signal\n"); /* cond_signal */ old_val = f1; args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, @@ -221,28 +267,23 @@ void *signal_wakerfn(void *arg) FUTEX_PRIVATE_FLAG); if (args->ret < 0) args->ret = -errno; - info("futex: %x\n", f2); + ksft_print_dbg_msg("futex: %x\n", f2); if (args->lock) { - info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", - f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", + f2, &f2); futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); } - info("futex: %x\n", f2); - if (args->ret < 0) { - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); - args->ret = RET_ERROR; - break; - } + ksft_print_dbg_msg("futex: %x\n", f2); + if (args->ret < 0) + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); task_count += args->ret; usleep(SIGNAL_PERIOD_US); i++; /* we have to loop at least THREAD_MAX times */ if (i > MAX_WAKE_ITERS + THREAD_MAX) { - error("max signaling iterations (%d) reached, giving up on pending waiters.\n", - 0, MAX_WAKE_ITERS + THREAD_MAX); - args->ret = RET_ERROR; - break; + ksft_exit_fail_msg("max signaling iterations (%d) reached, giving up on pending waiters.\n", + MAX_WAKE_ITERS + THREAD_MAX); } } @@ -251,8 +292,8 @@ void *signal_wakerfn(void *arg) if (args->ret >= 0) args->ret = task_count; - info("Waker: exiting with %d\n", args->ret); - info("Waker: waiters_woken: %d\n", waiters_woken.val); + ksft_print_dbg_msg("Waker: exiting with %d\n", args->ret); + ksft_print_dbg_msg("Waker: waiters_woken: %d\n", waiters_woken.val); pthread_exit((void *)&args->ret); } @@ -269,35 +310,40 @@ void *third_party_blocker(void *arg) ret2 = futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); out: - if (args->ret || ret2) { - error("third_party_blocker() futex error", 0); - args->ret = RET_ERROR; - } + if (args->ret || ret2) + ksft_exit_fail_msg("third_party_blocker() futex error"); pthread_exit((void *)&args->ret); } -int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) +TEST_F(args, futex_requeue_pi) { - void *(*wakerfn)(void *) = signal_wakerfn; struct thread_arg blocker_arg = THREAD_ARG_INITIALIZER; struct thread_arg waker_arg = THREAD_ARG_INITIALIZER; pthread_t waiter[THREAD_MAX], waker, blocker; - struct timespec ts, *tsp = NULL; + void *(*wakerfn)(void *) = signal_wakerfn; + bool third_party_owner = variant->owner; + long timeout_ns = variant->timeout_ns; + bool broadcast = variant->broadcast; struct thread_arg args[THREAD_MAX]; - int *waiter_ret; - int i, ret = RET_PASS; + struct timespec ts, *tsp = NULL; + bool lock = variant->locked; + int *waiter_ret, i, ret = 0; + + ksft_print_msg( + "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", + broadcast, lock, third_party_owner, timeout_ns); if (timeout_ns) { time_t secs; - info("timeout_ns = %ld\n", timeout_ns); + ksft_print_dbg_msg("timeout_ns = %ld\n", timeout_ns); ret = clock_gettime(CLOCK_MONOTONIC, &ts); secs = (ts.tv_nsec + timeout_ns) / 1000000000; ts.tv_nsec = ((int64_t)ts.tv_nsec + timeout_ns) % 1000000000; ts.tv_sec += secs; - info("ts.tv_sec = %ld\n", ts.tv_sec); - info("ts.tv_nsec = %ld\n", ts.tv_nsec); + ksft_print_dbg_msg("ts.tv_sec = %ld\n", ts.tv_sec); + ksft_print_dbg_msg("ts.tv_nsec = %ld\n", 
ts.tv_nsec); tsp = &ts; } @@ -307,10 +353,7 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) if (third_party_owner) { if (create_rt_thread(&blocker, third_party_blocker, (void *)&blocker_arg, SCHED_FIFO, 1)) { - error("Creating third party blocker thread failed\n", - errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating third party blocker thread failed\n"); } } @@ -318,20 +361,16 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) for (i = 0; i < THREAD_MAX; i++) { args[i].id = i; args[i].timeout = tsp; - info("Starting thread %d\n", i); + ksft_print_dbg_msg("Starting thread %d\n", i); if (create_rt_thread(&waiter[i], waiterfn, (void *)&args[i], SCHED_FIFO, 1)) { - error("Creating waiting thread failed\n", errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating waiting thread failed\n"); } } waker_arg.lock = lock; if (create_rt_thread(&waker, wakerfn, (void *)&waker_arg, SCHED_FIFO, 1)) { - error("Creating waker thread failed\n", errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating waker thread failed\n"); } /* Wait for threads to finish */ @@ -345,7 +384,6 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) pthread_join(blocker, NULL); pthread_join(waker, NULL); -out: if (!ret) { if (*waiter_ret) ret = *waiter_ret; @@ -355,66 +393,8 @@ out: ret = blocker_arg.ret; } - return ret; + if (ret) + ksft_test_result_fail("fail"); } -int main(int argc, char *argv[]) -{ - char *test_name; - int c, ret; - - while ((c = getopt(argc, argv, "bchlot:v:")) != -1) { - switch (c) { - case 'b': - broadcast = 1; - break; - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'l': - locked = 1; - break; - case 'o': - owner = 1; - locked = 0; - break; - case 't': - timeout_ns = atoi(optarg); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test requeue functionality\n", basename(argv[0])); - ksft_print_msg( - "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", - broadcast, locked, owner, timeout_ns); - - ret = asprintf(&test_name, - "%s broadcast=%d locked=%d owner=%d timeout=%ldns", - TEST_NAME, broadcast, locked, owner, timeout_ns); - if (ret < 0) { - ksft_print_msg("Failed to generate test name\n"); - test_name = TEST_NAME; - } - - /* - * FIXME: unit_test is obsolete now that we parse options and the - * various style of runs are done by run.sh - simplify the code and move - * unit_test into main() - */ - ret = unit_test(broadcast, locked, owner, timeout_ns); - - print_result(test_name, ret); - return ret; -} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c index d0a4d332ea44..77135a22a583 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c @@ -23,67 +23,32 @@ #include <stdlib.h> #include <string.h> #include <time.h> -#include "futextest.h" -#include "logging.h" -#define TEST_NAME "futex-requeue-pi-mismatched-ops" +#include "futextest.h" +#include "../../kselftest_harness.h" futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; int child_ret = 0; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use 
color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *blocking_child(void *arg) { child_ret = futex_wait(&f1, f1, NULL, FUTEX_PRIVATE_FLAG); if (child_ret < 0) { child_ret = -errno; - error("futex_wait\n", errno); + ksft_exit_fail_msg("futex_wait\n"); } return (void *)&child_ret; } -int main(int argc, char *argv[]) +TEST(requeue_pi_mismatched_ops) { - int ret = RET_PASS; pthread_t child; - int c; + int ret; - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Detect mismatched requeue_pi operations\n", - basename(argv[0])); + if (pthread_create(&child, NULL, blocking_child, NULL)) + ksft_exit_fail_msg("pthread_create\n"); - if (pthread_create(&child, NULL, blocking_child, NULL)) { - error("pthread_create\n", errno); - ret = RET_ERROR; - goto out; - } /* Allow the child to block in the kernel. */ sleep(1); @@ -102,34 +67,27 @@ int main(int argc, char *argv[]) * FUTEX_WAKE. */ ret = futex_wake(&f1, 1, FUTEX_PRIVATE_FLAG); - if (ret == 1) { - ret = RET_PASS; - } else if (ret < 0) { - error("futex_wake\n", errno); - ret = RET_ERROR; - } else { - error("futex_wake did not wake the child\n", 0); - ret = RET_ERROR; - } + if (ret == 1) + ret = 0; + else if (ret < 0) + ksft_exit_fail_msg("futex_wake\n"); + else + ksft_exit_fail_msg("futex_wake did not wake the child\n"); } else { - error("futex_cmp_requeue_pi\n", errno); - ret = RET_ERROR; + ksft_exit_fail_msg("futex_cmp_requeue_pi\n"); } } else if (ret > 0) { - fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); - ret = RET_FAIL; + ksft_test_result_fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); } else { - error("futex_cmp_requeue_pi found no waiters\n", 0); - ret = RET_ERROR; + ksft_exit_fail_msg("futex_cmp_requeue_pi found no waiters\n"); } pthread_join(child, NULL); - if (!ret) - ret = child_ret; - - out: - /* If the kernel crashes, we shouldn't return at all. 
*/ - print_result(TEST_NAME, ret); - return ret; + if (!ret && !child_ret) + ksft_test_result_pass("futex_requeue_pi_mismatched_ops passed\n"); + else + ksft_test_result_fail("futex_requeue_pi_mismatched_ops failed\n"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index c6b8f32990c8..e34ee0f9ebcc 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -24,11 +24,11 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> + #include "atomic.h" #include "futextest.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue-pi-signal-restart" #define DELAY_US 100 futex_t f1 = FUTEX_INITIALIZER; @@ -37,15 +37,6 @@ atomic_t requeued = ATOMIC_INITIALIZER; int waiter_ret = 0; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, int policy, int prio) { @@ -57,35 +48,28 @@ int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, memset(&schedp, 0, sizeof(schedp)); ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); - if (ret) { - error("pthread_attr_setinheritsched\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_attr_setinheritsched\n"); ret = pthread_attr_setschedpolicy(&attr, policy); - if (ret) { - error("pthread_attr_setschedpolicy\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_attr_setschedpolicy\n"); schedp.sched_priority = prio; ret = pthread_attr_setschedparam(&attr, &schedp); - if (ret) { - error("pthread_attr_setschedparam\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_attr_setschedparam\n"); ret = pthread_create(pth, &attr, func, arg); - if (ret) { - error("pthread_create\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_create\n"); + return 0; } void handle_signal(int signo) { - info("signal received %s requeue\n", + ksft_print_dbg_msg("signal received %s requeue\n", requeued.val ? 
"after" : "prior to"); } @@ -94,78 +78,46 @@ void *waiterfn(void *arg) unsigned int old_val; int res; - waiter_ret = RET_PASS; - - info("Waiter running\n"); - info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Waiter running\n"); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); old_val = f1; res = futex_wait_requeue_pi(&f1, old_val, &(f2), NULL, FUTEX_PRIVATE_FLAG); if (!requeued.val || errno != EWOULDBLOCK) { - fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", + ksft_test_result_fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", res, strerror(errno)); - info("w2:futex: %x\n", f2); + ksft_print_dbg_msg("w2:futex: %x\n", f2); if (!res) futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - waiter_ret = RET_FAIL; } - info("Waiter exiting with %d\n", waiter_ret); pthread_exit(NULL); } -int main(int argc, char *argv[]) +TEST(futex_requeue_pi_signal_restart) { unsigned int old_val; struct sigaction sa; pthread_t waiter; - int c, res, ret = RET_PASS; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test signal handling during requeue_pi\n", - basename(argv[0])); - ksft_print_msg("\tArguments: <none>\n"); + int res; sa.sa_handler = handle_signal; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; - if (sigaction(SIGUSR1, &sa, NULL)) { - error("sigaction\n", errno); - exit(1); - } + if (sigaction(SIGUSR1, &sa, NULL)) + ksft_exit_fail_msg("sigaction\n"); - info("m1:f2: %x\n", f2); - info("Creating waiter\n"); + ksft_print_dbg_msg("m1:f2: %x\n", f2); + ksft_print_dbg_msg("Creating waiter\n"); res = create_rt_thread(&waiter, waiterfn, NULL, SCHED_FIFO, 1); - if (res) { - error("Creating waiting thread failed", res); - ret = RET_ERROR; - goto out; - } + if (res) + ksft_exit_fail_msg("Creating waiting thread failed"); - info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); - info("m2:f2: %x\n", f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("m2:f2: %x\n", f2); futex_lock_pi(&f2, 0, 0, FUTEX_PRIVATE_FLAG); - info("m3:f2: %x\n", f2); + ksft_print_dbg_msg("m3:f2: %x\n", f2); while (1) { /* @@ -173,11 +125,11 @@ int main(int argc, char *argv[]) * restart futex_wait_requeue_pi() in the kernel. Wait for the * waiter to block on f1 again. */ - info("Issuing SIGUSR1 to waiter\n"); + ksft_print_dbg_msg("Issuing SIGUSR1 to waiter\n"); pthread_kill(waiter, SIGUSR1); usleep(DELAY_US); - info("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); + ksft_print_dbg_msg("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); old_val = f1; res = futex_cmp_requeue_pi(&f1, old_val, &(f2), 1, 0, FUTEX_PRIVATE_FLAG); @@ -191,12 +143,10 @@ int main(int argc, char *argv[]) atomic_set(&requeued, 1); break; } else if (res < 0) { - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); - ret = RET_ERROR; - break; + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); } } - info("m4:f2: %x\n", f2); + ksft_print_dbg_msg("m4:f2: %x\n", f2); /* * Signal the waiter after requeue, waiter should return from @@ -204,19 +154,14 @@ int main(int argc, char *argv[]) * futex_unlock_pi() can't happen before the signal wakeup is detected * in the kernel. 
*/ - info("Issuing SIGUSR1 to waiter\n"); + ksft_print_dbg_msg("Issuing SIGUSR1 to waiter\n"); pthread_kill(waiter, SIGUSR1); - info("Waiting for waiter to return\n"); + ksft_print_dbg_msg("Waiting for waiter to return\n"); pthread_join(waiter, NULL); - info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - info("m5:f2: %x\n", f2); - - out: - if (ret == RET_PASS && waiter_ret) - ret = waiter_ret; - - print_result(TEST_NAME, ret); - return ret; + ksft_print_dbg_msg("m5:f2: %x\n", f2); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait.c b/tools/testing/selftests/futex/functional/futex_wait.c index 685140d9b93d..152ca4612886 100644 --- a/tools/testing/selftests/futex/functional/futex_wait.c +++ b/tools/testing/selftests/futex/functional/futex_wait.c @@ -9,25 +9,16 @@ #include <sys/shm.h> #include <sys/mman.h> #include <fcntl.h> -#include "logging.h" + #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 #define SHM_PATH "futex_shm_file" void *futex; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - static void *waiterfn(void *arg) { struct timespec to; @@ -45,53 +36,37 @@ static void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(private_futex) { - int res, ret = RET_PASS, fd, c, shm_id; - u_int32_t f_private = 0, *shared_data; unsigned int flags = FUTEX_PRIVATE_FLAG; + u_int32_t f_private = 0; pthread_t waiter; - void *shm; + int res; futex = &f_private; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(3); - ksft_print_msg("%s: Test futex_wait\n", basename(argv[0])); - /* Testing a private futex */ - info("Calling private futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling private futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, (void *) &flags)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling private futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling private futex_wake on futex: %p\n", futex); res = futex_wake(futex, 1, FUTEX_PRIVATE_FLAG); if (res != 1) { ksft_test_result_fail("futex_wake private returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake private succeeds\n"); } +} + +TEST(anon_page) +{ + u_int32_t *shared_data; + pthread_t waiter; + int res, shm_id; /* Testing an anon page shared memory */ shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); @@ -105,67 +80,65 @@ int main(int argc, char *argv[]) *shared_data = 0; futex = shared_data; - info("Calling shared (page anon) futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (page anon) futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling shared 
(page anon) futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (page anon) futex_wake on futex: %p\n", futex); res = futex_wake(futex, 1, 0); if (res != 1) { ksft_test_result_fail("futex_wake shared (page anon) returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake shared (page anon) succeeds\n"); } + shmdt(shared_data); +} + +TEST(file_backed) +{ + u_int32_t f_private = 0; + pthread_t waiter; + int res, fd; + void *shm; /* Testing a file backed shared memory */ fd = open(SHM_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); - if (fd < 0) { - perror("open"); - exit(1); - } + if (fd < 0) + ksft_exit_fail_msg("open"); - if (ftruncate(fd, sizeof(f_private))) { - perror("ftruncate"); - exit(1); - } + if (ftruncate(fd, sizeof(f_private))) + ksft_exit_fail_msg("ftruncate"); shm = mmap(NULL, sizeof(f_private), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (shm == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (shm == MAP_FAILED) + ksft_exit_fail_msg("mmap"); memcpy(shm, &f_private, sizeof(f_private)); futex = shm; - info("Calling shared (file backed) futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (file backed) futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling shared (file backed) futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (file backed) futex_wake on futex: %p\n", futex); res = futex_wake(shm, 1, 0); if (res != 1) { ksft_test_result_fail("futex_wake shared (file backed) returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake shared (file backed) succeeds\n"); } - /* Freeing resources */ - shmdt(shared_data); munmap(shm, sizeof(f_private)); remove(SHM_PATH); close(fd); - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c index fb4148f23fa3..8952ebda14ab 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c +++ b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c @@ -27,10 +27,9 @@ #include <libgen.h> #include <signal.h> -#include "logging.h" #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-private-mapped-file" #define PAGE_SZ 4096 char pad[PAGE_SZ] = {1}; @@ -40,86 +39,44 @@ char pad2[PAGE_SZ] = {1}; #define WAKE_WAIT_US 3000000 struct timespec wait_timeout = { .tv_sec = 5, .tv_nsec = 0}; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *thr_futex_wait(void *arg) { int ret; - info("futex wait\n"); + ksft_print_dbg_msg("futex wait\n"); ret = futex_wait(&val, 1, &wait_timeout, 0); - if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) { - error("futex error.\n", errno); - print_result(TEST_NAME, RET_ERROR); - exit(RET_ERROR); - } + if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) + ksft_exit_fail_msg("futex error.\n"); if (ret && errno == ETIMEDOUT) - fail("waiter timedout\n"); + ksft_exit_fail_msg("waiter timedout\n"); - info("futex_wait: ret = %d, errno = %d\n", ret, errno); + 
ksft_print_dbg_msg("futex_wait: ret = %d, errno = %d\n", ret, errno); return NULL; } -int main(int argc, char **argv) +TEST(wait_private_mapped_file) { pthread_t thr; - int ret = RET_PASS; int res; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg( - "%s: Test the futex value of private file mappings in FUTEX_WAIT\n", - basename(argv[0])); - - ret = pthread_create(&thr, NULL, thr_futex_wait, NULL); - if (ret < 0) { - fprintf(stderr, "pthread_create error\n"); - ret = RET_ERROR; - goto out; - } - - info("wait a while\n"); + + res = pthread_create(&thr, NULL, thr_futex_wait, NULL); + if (res < 0) + ksft_exit_fail_msg("pthread_create error\n"); + + ksft_print_dbg_msg("wait a while\n"); usleep(WAKE_WAIT_US); val = 2; res = futex_wake(&val, 1, 0); - info("futex_wake %d\n", res); - if (res != 1) { - fail("FUTEX_WAKE didn't find the waiting thread.\n"); - ret = RET_FAIL; - } + ksft_print_dbg_msg("futex_wake %d\n", res); + if (res != 1) + ksft_exit_fail_msg("FUTEX_WAKE didn't find the waiting thread.\n"); - info("join\n"); + ksft_print_dbg_msg("join\n"); pthread_join(thr, NULL); - out: - print_result(TEST_NAME, ret); - return ret; + ksft_test_result_pass("wait_private_mapped_file"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c index d183f878360b..0c8766aced2e 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -16,26 +16,15 @@ *****************************************************************************/ #include <pthread.h> + #include "futextest.h" #include "futex2test.h" -#include "logging.h" - -#define TEST_NAME "futex-wait-timeout" +#include "../../kselftest_harness.h" static long timeout_ns = 100000; /* 100us default timeout */ static futex_t futex_pi; static pthread_barrier_t barrier; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -t N Timeout in nanoseconds (default: 100,000)\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - /* * Get a PI lock and hold it forever, so the main thread lock_pi will block * and we can test the timeout @@ -47,13 +36,13 @@ void *get_pi_lock(void *arg) ret = futex_lock_pi(&futex_pi, NULL, 0, 0); if (ret != 0) - error("futex_lock_pi failed\n", ret); + ksft_exit_fail_msg("futex_lock_pi failed\n"); pthread_barrier_wait(&barrier); /* Blocks forever */ ret = futex_wait(&lock, 0, NULL, 0); - error("futex_wait failed\n", ret); + ksft_exit_fail_msg("futex_wait failed\n"); return NULL; } @@ -61,12 +50,11 @@ void *get_pi_lock(void *arg) /* * Check if the function returned the expected error */ -static void test_timeout(int res, int *ret, char *test_name, int err) +static void test_timeout(int res, char *test_name, int err) { if (!res || errno != err) { ksft_test_result_fail("%s returned %d\n", test_name, res < 0 ? 
errno : res); - *ret = RET_FAIL; } else { ksft_test_result_pass("%s succeeds\n", test_name); } @@ -78,10 +66,8 @@ static void test_timeout(int res, int *ret, char *test_name, int err) static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, long timeout_ns) { - if (clock_gettime(clockid, to)) { - error("clock_gettime failed\n", errno); - return errno; - } + if (clock_gettime(clockid, to)) + ksft_exit_fail_msg("clock_gettime failed\n"); to->tv_nsec += timeout_ns; @@ -93,83 +79,66 @@ static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, return 0; } -int main(int argc, char *argv[]) +TEST(wait_bitset) { futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; struct timespec to; - pthread_t thread; - int c; - struct futex_waitv waitv = { - .uaddr = (uintptr_t)&f1, - .val = f1, - .flags = FUTEX_32, - .__reserved = 0 - }; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 't': - timeout_ns = atoi(optarg); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(9); - ksft_print_msg("%s: Block on a futex and wait for timeout\n", - basename(argv[0])); - ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); - - pthread_barrier_init(&barrier, NULL, 2); - pthread_create(&thread, NULL, get_pi_lock, NULL); + int res; /* initialize relative timeout */ to.tv_sec = 0; to.tv_nsec = timeout_ns; res = futex_wait(&f1, f1, &to, 0); - test_timeout(res, &ret, "futex_wait relative", ETIMEDOUT); + test_timeout(res, "futex_wait relative", ETIMEDOUT); /* FUTEX_WAIT_BITSET with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_bitset(&f1, f1, &to, 1, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_wait_bitset realtime", ETIMEDOUT); + test_timeout(res, "futex_wait_bitset realtime", ETIMEDOUT); /* FUTEX_WAIT_BITSET with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_bitset(&f1, f1, &to, 1, 0); - test_timeout(res, &ret, "futex_wait_bitset monotonic", ETIMEDOUT); + test_timeout(res, "futex_wait_bitset monotonic", ETIMEDOUT); +} + +TEST(requeue_pi) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct timespec to; + int res; /* FUTEX_WAIT_REQUEUE_PI with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_wait_requeue_pi realtime", ETIMEDOUT); + test_timeout(res, "futex_wait_requeue_pi realtime", ETIMEDOUT); /* FUTEX_WAIT_REQUEUE_PI with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, 0); - test_timeout(res, &ret, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + test_timeout(res, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + +} + +TEST(lock_pi) +{ + struct timespec to; + pthread_t thread; + int res; + + /* Create a thread that will lock forever so any waiter will timeout */ + pthread_barrier_init(&barrier, NULL, 2); + pthread_create(&thread, NULL, get_pi_lock, NULL); /* Wait until the other thread calls 
futex_lock_pi() */ pthread_barrier_wait(&barrier); pthread_barrier_destroy(&barrier); + /* * FUTEX_LOCK_PI with CLOCK_REALTIME * Due to historical reasons, FUTEX_LOCK_PI supports only realtime @@ -181,26 +150,38 @@ int main(int argc, char *argv[]) * smaller than realtime and the syscall will timeout immediately. */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_lock_pi(&futex_pi, &to, 0, 0); - test_timeout(res, &ret, "futex_lock_pi realtime", ETIMEDOUT); + test_timeout(res, "futex_lock_pi realtime", ETIMEDOUT); /* Test operations that don't support FUTEX_CLOCK_REALTIME */ res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); + test_timeout(res, "futex_lock_pi invalid timeout flag", ENOSYS); +} + +TEST(waitv) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1, + .flags = FUTEX_32, + .__reserved = 0, + }; + struct timespec to; + int res; /* futex_waitv with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); - test_timeout(res, &ret, "futex_waitv monotonic", ETIMEDOUT); + test_timeout(res, "futex_waitv monotonic", ETIMEDOUT); /* futex_waitv with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME); - test_timeout(res, &ret, "futex_waitv realtime", ETIMEDOUT); - - ksft_print_cnts(); - return ret; + test_timeout(res, "futex_waitv realtime", ETIMEDOUT); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c index ed9cd07e31c1..ce2301500d83 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c +++ b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c @@ -29,95 +29,55 @@ #include <linux/futex.h> #include <libgen.h> -#include "logging.h" #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-uninitialized-heap" #define WAIT_US 5000000 static int child_blocked = 1; -static int child_ret; +static bool child_ret; void *buf; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *wait_thread(void *arg) { int res; - child_ret = RET_PASS; + child_ret = true; res = futex_wait(buf, 1, NULL, 0); child_blocked = 0; if (res != 0 && errno != EWOULDBLOCK) { - error("futex failure\n", errno); - child_ret = RET_ERROR; + ksft_exit_fail_msg("futex failure\n"); + child_ret = false; } pthread_exit(NULL); } -int main(int argc, char **argv) +TEST(futex_wait_uninitialized_heap) { - int c, ret = RET_PASS; long page_size; pthread_t thr; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } + int ret; page_size = sysconf(_SC_PAGESIZE); buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); 
- if (buf == (void *)-1) { - error("mmap\n", errno); - exit(1); - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test the uninitialized futex value in FUTEX_WAIT\n", - basename(argv[0])); - + if (buf == (void *)-1) + ksft_exit_fail_msg("mmap\n"); ret = pthread_create(&thr, NULL, wait_thread, NULL); - if (ret) { - error("pthread_create\n", errno); - ret = RET_ERROR; - goto out; - } + if (ret) + ksft_exit_fail_msg("pthread_create\n"); - info("waiting %dus for child to return\n", WAIT_US); + ksft_print_dbg_msg("waiting %dus for child to return\n", WAIT_US); usleep(WAIT_US); - ret = child_ret; - if (child_blocked) { - fail("child blocked in kernel\n"); - ret = RET_FAIL; - } + if (child_blocked) + ksft_test_result_fail("child blocked in kernel\n"); - out: - print_result(TEST_NAME, ret); - return ret; + if (!child_ret) + ksft_test_result_fail("child error\n"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 2d8230da9064..36b7a54a4085 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -21,72 +21,44 @@ #include <stdlib.h> #include <string.h> #include <time.h> + #include "futextest.h" #include "futex2test.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-wouldblock" #define timeout_ns 100000 -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - -int main(int argc, char *argv[]) +TEST(futex_wait_wouldblock) { struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; - int c; - struct futex_waitv waitv = { - .uaddr = (uintptr_t)&f1, - .val = f1+1, - .flags = FUTEX_32, - .__reserved = 0 - }; + int res; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); - ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", - basename(argv[0])); - - info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + ksft_print_dbg_msg("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); if (!res || errno != EWOULDBLOCK) { ksft_test_result_fail("futex_wait returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wait\n"); } +} - if (clock_gettime(CLOCK_MONOTONIC, &to)) { - error("clock_gettime failed\n", errno); - return errno; - } +TEST(futex_waitv_wouldblock) +{ + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + futex_t f1 = FUTEX_INITIALIZER; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1 + 1, + .flags = FUTEX_32, + .__reserved = 0, + }; + int res; + + if (clock_gettime(CLOCK_MONOTONIC, &to)) + ksft_exit_fail_msg("clock_gettime failed %d\n", errno); to.tv_nsec += timeout_ns; @@ -95,17 +67,15 @@ int main(int argc, char *argv[]) to.tv_nsec -= 1000000000; } - info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + ksft_print_dbg_msg("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); if (!res || errno != EWOULDBLOCK) { ksft_test_result_fail("futex_waitv returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c index a94337f677e1..c684b10eb76e 100644 --- a/tools/testing/selftests/futex/functional/futex_waitv.c +++ b/tools/testing/selftests/futex/functional/futex_waitv.c @@ -15,25 +15,16 @@ #include <pthread.h> #include <stdint.h> #include <sys/shm.h> + #include "futextest.h" #include "futex2test.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait" #define WAKE_WAIT_US 10000 #define NR_FUTEXES 30 static struct futex_waitv waitv[NR_FUTEXES]; u_int32_t futexes[NR_FUTEXES] = {0}; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *waiterfn(void *arg) { struct timespec to; @@ -41,7 +32,7 @@ void *waiterfn(void *arg) /* setting absolute timeout for futex2 */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -57,34 +48,10 @@ void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(private_waitv) { pthread_t waiter; - int res, ret = RET_PASS; - struct timespec to; - int c, i; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(7); - ksft_print_msg("%s: Test FUTEX_WAITV\n", - basename(argv[0])); + int res, i; for (i = 0; i < NR_FUTEXES; i++) { waitv[i].uaddr = (uintptr_t)&futexes[i]; @@ -95,7 +62,7 @@ int main(int argc, char *argv[]) /* Private waitv */ if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); @@ -104,10 +71,15 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake private returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv private\n"); } +} + +TEST(shared_waitv) +{ + pthread_t waiter; + int res, i; /* Shared waitv */ for (i = 0; i < NR_FUTEXES; i++) { @@ -128,7 +100,7 @@ int main(int argc, char *argv[]) } if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); @@ -137,19 +109,24 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake shared returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv shared\n"); } for (i = 0; i < NR_FUTEXES; i++) shmdt(u64_to_ptr(waitv[i].uaddr)); +} + +TEST(invalid_flag) +{ + struct timespec to; + int res; /* Testing a waiter without FUTEX_32 flag */ waitv[0].flags = FUTEX_PRIVATE_FLAG; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -158,17 +135,22 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv without FUTEX_32\n"); } +} + +TEST(unaligned_address) +{ + struct timespec to; + int res; /* Testing a waiter with an unaligned address */ waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32; waitv[0].uaddr = 1; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -177,16 +159,21 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv with an unaligned address\n"); } +} + +TEST(null_address) +{ + struct timespec to; + int res; /* Testing a NULL address for waiters.uaddr */ waitv[0].uaddr = 0x00000000; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -195,14 +182,13 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv NULL address in waitv.uaddr\n"); } /* Testing a NULL address for *waiters */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -211,14 +197,19 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv NULL address in *waiters\n"); } +} + +TEST(invalid_clockid) +{ + struct timespec to; + int res; /* Testing an invalid clockid */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -227,11 +218,9 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv invalid clockid\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 81739849f299..e88545c06d57 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -18,74 +18,36 @@ # ############################################################################### -# Test for a color capable console -if [ -z "$USE_COLOR" ]; then - tput setf 7 || tput setaf 7 - if [ $? -eq 0 ]; then - USE_COLOR=1 - tput sgr0 - fi -fi -if [ "$USE_COLOR" -eq 1 ]; then - COLOR="-c" -fi - - echo -# requeue pi testing -# without timeouts -./futex_requeue_pi $COLOR -./futex_requeue_pi $COLOR -b -./futex_requeue_pi $COLOR -b -l -./futex_requeue_pi $COLOR -b -o -./futex_requeue_pi $COLOR -l -./futex_requeue_pi $COLOR -o -# with timeouts -./futex_requeue_pi $COLOR -b -l -t 5000 -./futex_requeue_pi $COLOR -l -t 5000 -./futex_requeue_pi $COLOR -b -l -t 500000 -./futex_requeue_pi $COLOR -l -t 500000 -./futex_requeue_pi $COLOR -b -t 5000 -./futex_requeue_pi $COLOR -t 5000 -./futex_requeue_pi $COLOR -b -t 500000 -./futex_requeue_pi $COLOR -t 500000 -./futex_requeue_pi $COLOR -b -o -t 5000 -./futex_requeue_pi $COLOR -l -t 5000 -./futex_requeue_pi $COLOR -b -o -t 500000 -./futex_requeue_pi $COLOR -l -t 500000 -# with long timeout -./futex_requeue_pi $COLOR -b -l -t 2000000000 -./futex_requeue_pi $COLOR -l -t 2000000000 - +./futex_requeue_pi echo -./futex_requeue_pi_mismatched_ops $COLOR +./futex_requeue_pi_mismatched_ops echo -./futex_requeue_pi_signal_restart $COLOR +./futex_requeue_pi_signal_restart echo -./futex_wait_timeout $COLOR +./futex_wait_timeout echo -./futex_wait_wouldblock $COLOR +./futex_wait_wouldblock echo -./futex_wait_uninitialized_heap $COLOR -./futex_wait_private_mapped_file $COLOR +./futex_wait_uninitialized_heap +./futex_wait_private_mapped_file echo -./futex_wait $COLOR +./futex_wait echo -./futex_requeue $COLOR +./futex_requeue echo -./futex_waitv $COLOR +./futex_waitv echo -./futex_priv_hash $COLOR -./futex_priv_hash -g $COLOR +./futex_priv_hash echo -./futex_numa_mpol $COLOR +./futex_numa_mpol diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h index 7a5fd1d5355e..3d48e9789d9f 100644 --- a/tools/testing/selftests/futex/include/futextest.h +++ b/tools/testing/selftests/futex/include/futextest.h @@ -58,6 +58,17 @@ typedef volatile u_int32_t futex_t; #define SYS_futex SYS_futex_time64 #endif +/* + * On 32bit systems if we use "-D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64" or if + * we are using a newer compiler then the size of the timestamps will be 64bit, + * however, the SYS_futex will still point to the 32bit futex system call. 
+ */ +#if __SIZEOF_POINTER__ == 4 && defined(SYS_futex_time64) && \ + defined(_TIME_BITS) && _TIME_BITS == 64 +# undef SYS_futex +# define SYS_futex SYS_futex_time64 +#endif + /** * futex() - SYS_futex syscall wrapper * @uaddr: address of first futex diff --git a/tools/testing/selftests/futex/include/logging.h b/tools/testing/selftests/futex/include/logging.h deleted file mode 100644 index 874c69ce5cce..000000000000 --- a/tools/testing/selftests/futex/include/logging.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/****************************************************************************** - * - * Copyright © International Business Machines Corp., 2009 - * - * DESCRIPTION - * Glibc independent futex library for testing kernel functionality. - * - * AUTHOR - * Darren Hart <dvhart@linux.intel.com> - * - * HISTORY - * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com> - * - *****************************************************************************/ - -#ifndef _LOGGING_H -#define _LOGGING_H - -#include <stdio.h> -#include <string.h> -#include <unistd.h> -#include <linux/futex.h> -#include "kselftest.h" - -/* - * Define PASS, ERROR, and FAIL strings with and without color escape - * sequences, default to no color. - */ -#define ESC 0x1B, '[' -#define BRIGHT '1' -#define GREEN '3', '2' -#define YELLOW '3', '3' -#define RED '3', '1' -#define ESCEND 'm' -#define BRIGHT_GREEN ESC, BRIGHT, ';', GREEN, ESCEND -#define BRIGHT_YELLOW ESC, BRIGHT, ';', YELLOW, ESCEND -#define BRIGHT_RED ESC, BRIGHT, ';', RED, ESCEND -#define RESET_COLOR ESC, '0', 'm' -static const char PASS_COLOR[] = {BRIGHT_GREEN, ' ', 'P', 'A', 'S', 'S', - RESET_COLOR, 0}; -static const char ERROR_COLOR[] = {BRIGHT_YELLOW, 'E', 'R', 'R', 'O', 'R', - RESET_COLOR, 0}; -static const char FAIL_COLOR[] = {BRIGHT_RED, ' ', 'F', 'A', 'I', 'L', - RESET_COLOR, 0}; -static const char INFO_NORMAL[] = " INFO"; -static const char PASS_NORMAL[] = " PASS"; -static const char ERROR_NORMAL[] = "ERROR"; -static const char FAIL_NORMAL[] = " FAIL"; -const char *INFO = INFO_NORMAL; -const char *PASS = PASS_NORMAL; -const char *ERROR = ERROR_NORMAL; -const char *FAIL = FAIL_NORMAL; - -/* Verbosity setting for INFO messages */ -#define VQUIET 0 -#define VCRITICAL 1 -#define VINFO 2 -#define VMAX VINFO -int _verbose = VCRITICAL; - -/* Functional test return codes */ -#define RET_PASS 0 -#define RET_ERROR -1 -#define RET_FAIL -2 - -/** - * log_color() - Use colored output for PASS, ERROR, and FAIL strings - * @use_color: use color (1) or not (0) - */ -void log_color(int use_color) -{ - if (use_color) { - PASS = PASS_COLOR; - ERROR = ERROR_COLOR; - FAIL = FAIL_COLOR; - } else { - PASS = PASS_NORMAL; - ERROR = ERROR_NORMAL; - FAIL = FAIL_NORMAL; - } -} - -/** - * log_verbosity() - Set verbosity of test output - * @verbose: Enable (1) verbose output or not (0) - * - * Currently setting verbose=1 will enable INFO messages and 0 will disable - * them. FAIL and ERROR messages are always displayed. - */ -void log_verbosity(int level) -{ - if (level > VMAX) - level = VMAX; - else if (level < 0) - level = 0; - _verbose = level; -} - -/** - * print_result() - Print standard PASS | ERROR | FAIL results - * @ret: the return value to be considered: 0 | RET_ERROR | RET_FAIL - * - * print_result() is primarily intended for functional tests. 
- */ -void print_result(const char *test_name, int ret) -{ - switch (ret) { - case RET_PASS: - ksft_test_result_pass("%s\n", test_name); - ksft_print_cnts(); - return; - case RET_ERROR: - ksft_test_result_error("%s\n", test_name); - ksft_print_cnts(); - return; - case RET_FAIL: - ksft_test_result_fail("%s\n", test_name); - ksft_print_cnts(); - return; - } -} - -/* log level macros */ -#define info(message, vargs...) \ -do { \ - if (_verbose >= VINFO) \ - fprintf(stderr, "\t%s: "message, INFO, ##vargs); \ -} while (0) - -#define error(message, err, args...) \ -do { \ - if (_verbose >= VCRITICAL) {\ - if (err) \ - fprintf(stderr, "\t%s: %s: "message, \ - ERROR, strerror(err), ##args); \ - else \ - fprintf(stderr, "\t%s: "message, ERROR, ##args); \ - } \ -} while (0) - -#define fail(message, args...) \ -do { \ - if (_verbose >= VCRITICAL) \ - fprintf(stderr, "\t%s: "message, FAIL, ##args); \ -} while (0) - -#endif diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index c3b6d2604b1e..8deeb4b72e73 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -54,6 +54,7 @@ #include <stdlib.h> #include <unistd.h> #include <stdarg.h> +#include <stdbool.h> #include <string.h> #include <stdio.h> #include <sys/utsname.h> @@ -104,6 +105,7 @@ struct ksft_count { static struct ksft_count ksft_cnt; static unsigned int ksft_plan; +static bool ksft_debug_enabled; static inline unsigned int ksft_test_num(void) { @@ -175,6 +177,18 @@ static inline __printf(1, 2) void ksft_print_msg(const char *msg, ...) va_end(args); } +static inline void ksft_print_dbg_msg(const char *msg, ...) +{ + va_list args; + + if (!ksft_debug_enabled) + return; + + va_start(args, msg); + ksft_print_msg(msg, args); + va_end(args); +} + static inline void ksft_perror(const char *msg) { ksft_print_msg("%s: %s (%d)\n", msg, strerror(errno), errno); diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 8516e8434bc4..3f66e862e83e 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -1091,7 +1091,7 @@ static int test_harness_argv_check(int argc, char **argv) { int opt; - while ((opt = getopt(argc, argv, "hlF:f:V:v:t:T:r:")) != -1) { + while ((opt = getopt(argc, argv, "dhlF:f:V:v:t:T:r:")) != -1) { switch (opt) { case 'f': case 'F': @@ -1104,12 +1104,16 @@ static int test_harness_argv_check(int argc, char **argv) case 'l': test_harness_list_tests(); return KSFT_SKIP; + case 'd': + ksft_debug_enabled = true; + break; case 'h': default: fprintf(stderr, - "Usage: %s [-h|-l] [-t|-T|-v|-V|-f|-F|-r name]\n" + "Usage: %s [-h|-l|-d] [-t|-T|-v|-V|-f|-F|-r name]\n" "\t-h print help\n" "\t-l list all tests\n" + "\t-d enable debug prints\n" "\n" "\t-t name include test\n" "\t-T name exclude test\n" @@ -1142,8 +1146,9 @@ static bool test_enabled(int argc, char **argv, int opt; optind = 1; - while ((opt = getopt(argc, argv, "F:f:V:v:t:T:r:")) != -1) { - has_positive |= islower(opt); + while ((opt = getopt(argc, argv, "dF:f:V:v:t:T:r:")) != -1) { + if (opt != 'd') + has_positive |= islower(opt); switch (tolower(opt)) { case 't': diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config index 6133524710f7..cf7cc0ce0248 100644 --- a/tools/testing/selftests/pidfd/config +++ b/tools/testing/selftests/pidfd/config @@ -4,6 +4,5 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_TIME_NS=y -CONFIG_GENERIC_VDSO_TIME_NS=y CONFIG_CGROUPS=y 
CONFIG_CHECKPOINT_RESTORE=y diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 663a9cef1952..dcac5cbe7933 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -40,9 +40,9 @@ * Define weak versions to play nice with binaries that are statically linked * against a libc that doesn't support registering its own rseq. */ -__weak ptrdiff_t __rseq_offset; -__weak unsigned int __rseq_size; -__weak unsigned int __rseq_flags; +extern __weak ptrdiff_t __rseq_offset; +extern __weak unsigned int __rseq_size; +extern __weak unsigned int __rseq_flags; static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset; static const unsigned int *libc_rseq_size_p = &__rseq_size; @@ -209,7 +209,7 @@ void rseq_init(void) * libc not having registered a restartable sequence. Try to find the * symbols if that's the case. */ - if (!*libc_rseq_size_p) { + if (!libc_rseq_size_p || !*libc_rseq_size_p) { libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index fc4910d35342..874f17763536 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -74,6 +74,14 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + +#ifndef __naked +#define __naked __attribute__((__naked__)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -5027,7 +5035,36 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ + +/* + * We need naked probed_uprobe function. Using __nocf_check + * check to skip possible endbr64 instruction and ignoring + * -Wattributes, otherwise the compilation might fail. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +__naked __nocf_check noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of nop5 instruction. + */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#pragma GCC diagnostic pop + +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -5080,35 +5117,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? (uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. 
+ */ + bool uretprobe; }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, +}; + +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe or __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -5118,12 +5166,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5134,7 +5187,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5148,11 +5201,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * Uprobe is optimized after first hit, so let's hit twice.
+ */ + probed_uprobe(); + probed_uprobe(); + + probed_uretprobe(); return 0; } -TEST_F(URETPROBE, uretprobe_default_allow) +TEST_F(UPROBE, uprobe_default_allow) { struct sock_filter filter[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), @@ -5165,7 +5224,7 @@ TEST_F(URETPROBE, uretprobe_default_allow) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block) +TEST_F(UPROBE, uprobe_default_block) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -5182,11 +5241,14 @@ TEST_F(URETPROBE, uretprobe_default_block) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +TEST_F(UPROBE, uprobe_block_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), #endif @@ -5201,11 +5263,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +TEST_F(UPROBE, uprobe_default_block_with_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), #endif diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 30d5c8f0e5c7..ba322a353aff 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only vdso_test vdso_test_abi -vdso_test_clock_getres vdso_test_correctness vdso_test_gettimeofday vdso_test_getcpu diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 918a2caa070e..e361aca22a74 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -4,7 +4,6 @@ include ../../../scripts/Makefile.arch TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu TEST_GEN_PROGS += vdso_test_abi -TEST_GEN_PROGS += vdso_test_clock_getres ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) TEST_GEN_PROGS += vdso_standalone_test_x86 endif @@ -29,7 +28,6 @@ CFLAGS_NOLIBC := -nostdlib -nostdinc -ffreestanding -fno-asynchronous-unwind-tab $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c -$(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c | headers $(OUTPUT)/vdso_standalone_test_x86: CFLAGS:=$(CFLAGS_NOLIBC) $(CFLAGS) diff --git a/tools/testing/selftests/vDSO/vdso_call.h b/tools/testing/selftests/vDSO/vdso_call.h index bb237d771051..e7205584cbdc 100644 --- a/tools/testing/selftests/vDSO/vdso_call.h +++ b/tools/testing/selftests/vDSO/vdso_call.h @@ -44,7 +44,6 @@ register long _r6 asm ("r6"); \ register long _r7 asm ("r7"); \ register long _r8 asm ("r8"); \ - register long _rval asm ("r3"); \ \ LOADARGS_##nr(fn, args); \ \ @@ -54,13 +53,13 @@ " bns+ 1f\n" \ " neg 3, 3\n" \ "1:" \ - : "+r" (_r0), "=r" (_r3), "+r" (_r4), "+r" (_r5), \ + : "+r" (_r0), "+r" (_r3), "+r" (_r4), "+r" (_r5), \ "+r" (_r6), "+r" (_r7), "+r" (_r8) \ - : "r" (_rval) \ + : \ : "r9", "r10", "r11", 
"r12", "cr0", "cr1", "cr5", \ "cr6", "cr7", "xer", "lr", "ctr", "memory" \ ); \ - _rval; \ + _r3; \ }) #else diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index a54424e2336f..238d609a457a 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -26,24 +26,31 @@ static const char *version; static const char **name; +/* The same as struct __kernel_timespec */ +struct vdso_timespec64 { + uint64_t tv_sec; + uint64_t tv_nsec; +}; + typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz); typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); +typedef long (*vdso_clock_gettime64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); typedef time_t (*vdso_time_t)(time_t *t); -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", +static const char * const vdso_clock_name[] = { + [CLOCK_REALTIME] = "CLOCK_REALTIME", + [CLOCK_MONOTONIC] = "CLOCK_MONOTONIC", + [CLOCK_PROCESS_CPUTIME_ID] = "CLOCK_PROCESS_CPUTIME_ID", + [CLOCK_THREAD_CPUTIME_ID] = "CLOCK_THREAD_CPUTIME_ID", + [CLOCK_MONOTONIC_RAW] = "CLOCK_MONOTONIC_RAW", + [CLOCK_REALTIME_COARSE] = "CLOCK_REALTIME_COARSE", + [CLOCK_MONOTONIC_COARSE] = "CLOCK_MONOTONIC_COARSE", + [CLOCK_BOOTTIME] = "CLOCK_BOOTTIME", + [CLOCK_REALTIME_ALARM] = "CLOCK_REALTIME_ALARM", + [CLOCK_BOOTTIME_ALARM] = "CLOCK_BOOTTIME_ALARM", + [10 /* CLOCK_SGI_CYCLE */] = "CLOCK_SGI_CYCLE", + [CLOCK_TAI] = "CLOCK_TAI", }; static void vdso_test_gettimeofday(void) @@ -70,6 +77,33 @@ static void vdso_test_gettimeofday(void) } } +static void vdso_test_clock_gettime64(clockid_t clk_id) +{ + /* Find clock_gettime64. */ + vdso_clock_gettime64_t vdso_clock_gettime64 = + (vdso_clock_gettime64_t)vdso_sym(version, name[5]); + + if (!vdso_clock_gettime64) { + ksft_print_msg("Couldn't find %s\n", name[5]); + ksft_test_result_skip("%s %s\n", name[5], + vdso_clock_name[clk_id]); + return; + } + + struct vdso_timespec64 ts; + long ret = VDSO_CALL(vdso_clock_gettime64, 2, clk_id, &ts); + + if (ret == 0) { + ksft_print_msg("The time is %lld.%06lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_test_result_pass("%s %s\n", name[5], + vdso_clock_name[clk_id]); + } else { + ksft_test_result_fail("%s %s\n", name[5], + vdso_clock_name[clk_id]); + } +} + static void vdso_test_clock_gettime(clockid_t clk_id) { /* Find clock_gettime. 
*/ @@ -171,23 +205,23 @@ static inline void vdso_test_clock(clockid_t clock_id) ksft_print_msg("clock_id: %s\n", vdso_clock_name[clock_id]); vdso_test_clock_gettime(clock_id); + vdso_test_clock_gettime64(clock_id); vdso_test_clock_getres(clock_id); } -#define VDSO_TEST_PLAN 16 +#define VDSO_TEST_PLAN 29 int main(int argc, char **argv) { unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); ksft_print_header(); - ksft_set_plan(VDSO_TEST_PLAN); - if (!sysinfo_ehdr) { - ksft_print_msg("AT_SYSINFO_EHDR is not present!\n"); - return KSFT_SKIP; - } + if (!sysinfo_ehdr) + ksft_exit_skip("AT_SYSINFO_EHDR is not present!\n"); + + ksft_set_plan(VDSO_TEST_PLAN); version = versions[VDSO_VERSION]; name = (const char **)&names[VDSO_NAMES]; @@ -198,40 +232,17 @@ int main(int argc, char **argv) vdso_test_gettimeofday(); -#if _POSIX_TIMERS > 0 - -#ifdef CLOCK_REALTIME vdso_test_clock(CLOCK_REALTIME); -#endif - -#ifdef CLOCK_BOOTTIME vdso_test_clock(CLOCK_BOOTTIME); -#endif - -#ifdef CLOCK_TAI vdso_test_clock(CLOCK_TAI); -#endif - -#ifdef CLOCK_REALTIME_COARSE vdso_test_clock(CLOCK_REALTIME_COARSE); -#endif - -#ifdef CLOCK_MONOTONIC vdso_test_clock(CLOCK_MONOTONIC); -#endif - -#ifdef CLOCK_MONOTONIC_RAW vdso_test_clock(CLOCK_MONOTONIC_RAW); -#endif - -#ifdef CLOCK_MONOTONIC_COARSE vdso_test_clock(CLOCK_MONOTONIC_COARSE); -#endif - -#endif + vdso_test_clock(CLOCK_PROCESS_CPUTIME_ID); + vdso_test_clock(CLOCK_THREAD_CPUTIME_ID); vdso_test_time(); - ksft_print_cnts(); - return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; + ksft_finished(); } diff --git a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c b/tools/testing/selftests/vDSO/vdso_test_clock_getres.c deleted file mode 100644 index b5d5f59f725a..000000000000 --- a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note -/* - * vdso_clock_getres.c: Sample code to test clock_getres. - * Copyright (c) 2019 Arm Ltd. - * - * Compile with: - * gcc -std=gnu99 vdso_clock_getres.c - * - * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit), - * Power (32-bit and 64-bit), S390x (32-bit and 64-bit). - * Might work on other architectures. - */ - -#define _GNU_SOURCE -#include <elf.h> -#include <fcntl.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <time.h> -#include <sys/auxv.h> -#include <sys/mman.h> -#include <sys/time.h> -#include <unistd.h> -#include <sys/syscall.h> - -#include "../kselftest.h" - -static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts) -{ - long ret; - - ret = syscall(SYS_clock_getres, _clkid, _ts); - - return ret; -} - -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", -}; - -/* - * This function calls clock_getres in vdso and by system call - * with different values for clock_id. 
- * - * Example of output: - * - * clock_id: CLOCK_REALTIME [PASS] - * clock_id: CLOCK_BOOTTIME [PASS] - * clock_id: CLOCK_TAI [PASS] - * clock_id: CLOCK_REALTIME_COARSE [PASS] - * clock_id: CLOCK_MONOTONIC [PASS] - * clock_id: CLOCK_MONOTONIC_RAW [PASS] - * clock_id: CLOCK_MONOTONIC_COARSE [PASS] - */ -static inline int vdso_test_clock(unsigned int clock_id) -{ - struct timespec x, y; - - printf("clock_id: %s", vdso_clock_name[clock_id]); - clock_getres(clock_id, &x); - syscall_clock_getres(clock_id, &y); - - if ((x.tv_sec != y.tv_sec) || (x.tv_nsec != y.tv_nsec)) { - printf(" [FAIL]\n"); - return KSFT_FAIL; - } - - printf(" [PASS]\n"); - return KSFT_PASS; -} - -int main(int argc, char **argv) -{ - int ret = 0; - -#if _POSIX_TIMERS > 0 - -#ifdef CLOCK_REALTIME - ret += vdso_test_clock(CLOCK_REALTIME); -#endif - -#ifdef CLOCK_BOOTTIME - ret += vdso_test_clock(CLOCK_BOOTTIME); -#endif - -#ifdef CLOCK_TAI - ret += vdso_test_clock(CLOCK_TAI); -#endif - -#ifdef CLOCK_REALTIME_COARSE - ret += vdso_test_clock(CLOCK_REALTIME_COARSE); -#endif - -#ifdef CLOCK_MONOTONIC - ret += vdso_test_clock(CLOCK_MONOTONIC); -#endif - -#ifdef CLOCK_MONOTONIC_RAW - ret += vdso_test_clock(CLOCK_MONOTONIC_RAW); -#endif - -#ifdef CLOCK_MONOTONIC_COARSE - ret += vdso_test_clock(CLOCK_MONOTONIC_COARSE); -#endif - -#endif - if (ret > 0) - return KSFT_FAIL; - - return KSFT_PASS; -} |
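The selftest changes above convert the futex tests to the kselftest harness (TEST_HARNESS_MAIN) and add a "-d" option that routes through the new ksft_print_dbg_msg() helper in kselftest.h. As a minimal sketch of how a new harness-based test could use that debug path (the test name, message, and binary name below are hypothetical illustrations, not part of this diff, and it assumes the in-tree kselftest_harness.h with the "-d" handling shown above):

/* Hypothetical example, not taken from the diff above: a tiny harness test
 * whose debug message is printed only when the binary is run with -d. */
#include "kselftest_harness.h"	/* also pulls in kselftest.h / ksft_print_dbg_msg() */

TEST(debug_message_demo)
{
	/* Silent by default; emitted when the test binary is invoked with -d. */
	ksft_print_dbg_msg("entering debug_message_demo\n");

	/* Trivial check so the harness reports one passing test. */
	ASSERT_EQ(2 + 2, 4);
}

TEST_HARNESS_MAIN

Built within the selftests tree, running the resulting binary plain prints the usual TAP output, while passing "-d" additionally emits the debug line; "-h" now lists "-d" in the usage text per the kselftest_harness.h hunk above.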