From c279e83953d937470f8a6e69b69f62608714f13f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:19 -0800 Subject: iommu: Introduce pci_dev_reset_iommu_prepare/done() PCIe permits a device to ignore ATS invalidation TLPs while processing a reset. This creates a problem visible to the OS where an ATS invalidation command will time out. E.g. an SVA domain will have no coordination with a reset event and can racily issue ATS invalidations to a resetting device. The OS should do something to mitigate this as we do not want production systems to be reporting critical ATS failures, especially in a hypervisor environment. Broadly, OS could arrange to ignore the timeouts, block page table mutations to prevent invalidations, or disable and block ATS. The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends SW to disable and block ATS before initiating a Function Level Reset. It also mentions that other reset methods could have the same vulnerability as well. Provide a callback from the PCI subsystem that will enclose the reset and have the iommu core temporarily change all the attached RID/PASID domains group->blocking_domain so that the IOMMU hardware would fence any incoming ATS queries. And IOMMU drivers should also synchronously stop issuing new ATS invalidations and wait for all ATS invalidations to complete. This can avoid any ATS invaliation timeouts. However, if there is a domain attachment/replacement happening during an ongoing reset, ATS routines may be re-activated between the two function calls. So, introduce a new resetting_domain in the iommu_group structure to reject any concurrent attach_dev/set_dev_pasid call during a reset for a concern of compatibility failure. Since this changes the behavior of an attach operation, update the uAPI accordingly. Note that there are two corner cases: 1. Devices in the same iommu_group Since an attachment is always per iommu_group, this means that any sibling devices in the iommu_group cannot change domain, to prevent race conditions. 2. An SR-IOV PF that is being reset while its VF is not In such case, the VF itself is already broken. So, there is no point in preventing PF from going through the iommu reset. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- include/uapi/linux/vfio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ac2329f24141..bb7b89330d35 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -964,6 +964,10 @@ struct vfio_device_bind_iommufd { * hwpt corresponding to the given pt_id. * * Return: 0 on success, -errno on failure. + * + * When a device is resetting, -EBUSY will be returned to reject any concurrent + * attachment to the resetting device itself or any sibling device in the IOMMU + * group having the resetting device. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; -- cgit v1.2.3 From 7d8b06ecc45bd679dec58d2cc2bd86223d4e076d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:02 +0000 Subject: iommu/amd: Add support for hw_info for iommu capability query AMD IOMMU Extended Feature (EFR) and Extended Feature 2 (EFR2) registers specify features supported by each IOMMU hardware instance. The IOMMU driver checks each feature-specific bits before enabling each feature at run time. For IOMMUFD, the hypervisor passes the raw value of amd_iommu_efr and amd_iommu_efr2 to VMM via iommufd IOMMU_DEVICE_GET_HW_INFO ioctl. Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/Kconfig | 10 ++++++++++ drivers/iommu/amd/Makefile | 1 + drivers/iommu/amd/iommu.c | 2 ++ drivers/iommu/amd/iommufd.c | 31 +++++++++++++++++++++++++++++++ drivers/iommu/amd/iommufd.h | 15 +++++++++++++++ include/uapi/linux/iommufd.h | 28 ++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+) create mode 100644 drivers/iommu/amd/iommufd.c create mode 100644 drivers/iommu/amd/iommufd.h (limited to 'include/uapi') diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index f2acf471cb5d..588355ff7eb7 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -30,6 +30,16 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. +config AMD_IOMMU_IOMMUFD + bool "Enable IOMMUFD features for AMD IOMMU (EXPERIMENTAL)" + depends on IOMMUFD + depends on AMD_IOMMU + help + Support for IOMMUFD features intended to support virtual machines + with accelerated virtual IOMMUs. + + Say Y here if you are doing development and testing on this feature. + config AMD_IOMMU_DEBUGFS bool "Enable AMD IOMMU internals in DebugFS" depends on AMD_IOMMU && IOMMU_DEBUGFS diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 5412a563c697..41f053b49dce 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += iommu.o init.o quirks.o ppr.o pasid.o +obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d7f457338de7..d550a7e431ac 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -43,6 +43,7 @@ #include #include "amd_iommu.h" +#include "iommufd.h" #include "../irq_remapping.h" #include "../iommu-pages.h" @@ -3083,6 +3084,7 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, + .hw_info = amd_iommufd_hw_info, .blocked_domain = &blocked_domain, .release_domain = &blocked_domain, .identity_domain = &identity_domain.domain, diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c new file mode 100644 index 000000000000..72eaaa923d04 --- /dev/null +++ b/drivers/iommu/amd/iommufd.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include + +#include "iommufd.h" +#include "amd_iommu.h" +#include "amd_iommu_types.h" + +void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct iommu_hw_info_amd *hwinfo; + + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_AMD) + return ERR_PTR(-EOPNOTSUPP); + + hwinfo = kzalloc(sizeof(*hwinfo), GFP_KERNEL); + if (!hwinfo) + return ERR_PTR(-ENOMEM); + + *length = sizeof(*hwinfo); + *type = IOMMU_HW_INFO_TYPE_AMD; + + hwinfo->efr = amd_iommu_efr; + hwinfo->efr2 = amd_iommu_efr2; + + return hwinfo; +} diff --git a/drivers/iommu/amd/iommufd.h b/drivers/iommu/amd/iommufd.h new file mode 100644 index 000000000000..f880be80a30d --- /dev/null +++ b/drivers/iommu/amd/iommufd.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#ifndef AMD_IOMMUFD_H +#define AMD_IOMMUFD_H + +#if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD) +void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type); +#else +#define amd_iommufd_hw_info NULL +#endif /* CONFIG_AMD_IOMMU_IOMMUFD */ + +#endif /* AMD_IOMMUFD_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 2c41920b641d..3db37f6042a0 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -623,6 +623,32 @@ struct iommu_hw_info_tegra241_cmdqv { __u8 __reserved; }; +/** + * struct iommu_hw_info_amd - AMD IOMMU device info + * + * @efr : Value of AMD IOMMU Extended Feature Register (EFR) + * @efr2: Value of AMD IOMMU Extended Feature 2 Register (EFR2) + * + * Please See description of these registers in the following sections of + * the AMD I/O Virtualization Technology (IOMMU) Specification. + * (https://docs.amd.com/v/u/en-US/48882_3.10_PUB) + * + * - MMIO Offset 0030h IOMMU Extended Feature Register + * - MMIO Offset 01A0h IOMMU Extended Feature 2 Register + * + * Note: The EFR and EFR2 are raw values reported by hardware. + * VMM is responsible to determine the appropriate flags to be exposed to + * the VM since cetertain features are not currently supported by the kernel + * for HW-vIOMMU. + * + * Current VMM-allowed list of feature flags are: + * - EFR[GTSup, GASup, GioSup, PPRSup, EPHSup, GATS, GLX, PASmax] + */ +struct iommu_hw_info_amd { + __aligned_u64 efr; + __aligned_u64 efr2; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware @@ -632,6 +658,7 @@ struct iommu_hw_info_tegra241_cmdqv { * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM * SMMUv3) info type + * @IOMMU_HW_INFO_TYPE_AMD: AMD IOMMU info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, @@ -639,6 +666,7 @@ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3, + IOMMU_HW_INFO_TYPE_AMD = 4, }; /** -- cgit v1.2.3 From e05698c10d980ac0a0b57ed81ec9353b9e9533c6 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:06 +0000 Subject: iommufd: Introduce data struct for AMD nested domain allocation Introduce IOMMU_HWPT_DATA_AMD_GUEST data type for IOMMU guest page table, which is used for stage-1 in nested translation. The data structure contains information necessary for setting up the AMD HW-vIOMMU support. Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- include/uapi/linux/iommufd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 3db37f6042a0..1dafbc552d37 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -465,16 +465,27 @@ struct iommu_hwpt_arm_smmuv3 { __aligned_le64 ste[2]; }; +/** + * struct iommu_hwpt_amd_guest - AMD IOMMU guest I/O page table data + * (IOMMU_HWPT_DATA_AMD_GUEST) + * @dte: Guest Device Table Entry (DTE) + */ +struct iommu_hwpt_amd_guest { + __aligned_u64 dte[4]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table + * @IOMMU_HWPT_DATA_AMD_GUEST: AMD IOMMU guest page table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE = 0, IOMMU_HWPT_DATA_VTD_S1 = 1, IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, + IOMMU_HWPT_DATA_AMD_GUEST = 3, }; /** -- cgit v1.2.3