From a75b2be249d60eff6015737f6c3e94935b541068 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Mon, 15 Dec 2025 13:42:18 -0800
Subject: iommu: Add iommu_driver_get_domain_for_dev() helper

There is a need to temporarily attach a resetting PCI device to the
blocked domain and then attach it back to its previously attached domain
after the reset. This can be done simply by keeping the "previously
attached domain" in the iommu_group->domain pointer while adding an
iommu_group->resetting_domain. However, this creates trouble for IOMMU
drivers that use iommu_get_domain_for_dev() to retrieve a device's
physical domain when programming the IOMMU hardware. In such
driver-internal use cases, the iommu_group->mutex must be held, which
does not suit external callers that do not hold it.

Introduce a new iommu_driver_get_domain_for_dev() helper, exclusively
for driver use cases that hold the iommu_group->mutex, to separate them
from those external use cases. Add a lockdep_assert_not_held() to the
existing iommu_get_domain_for_dev() and note this in its kdoc.

Reviewed-by: Kevin Tian
Reviewed-by: Lu Baolu
Reviewed-by: Jason Gunthorpe
Tested-by: Dheeraj Kumar Srivastava
Signed-off-by: Nicolin Chen
Signed-off-by: Joerg Roedel
---
 include/linux/iommu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 8c66284a91a8..ff097df318b9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -910,6 +910,7 @@ extern int iommu_attach_device(struct iommu_domain *domain,
 extern void iommu_detach_device(struct iommu_domain *domain,
 				struct device *dev);
 extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
+struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev);
 extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
 		     phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
--
cgit v1.2.3


From c279e83953d937470f8a6e69b69f62608714f13f Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Mon, 15 Dec 2025 13:42:19 -0800
Subject: iommu: Introduce pci_dev_reset_iommu_prepare/done()

PCIe permits a device to ignore ATS invalidation TLPs while processing a
reset. This creates a problem visible to the OS where an ATS invalidation
command will time out. E.g. an SVA domain will have no coordination with
a reset event and can racily issue ATS invalidations to a resetting
device.

The OS should do something to mitigate this, as we do not want production
systems to be reporting critical ATS failures, especially in a hypervisor
environment. Broadly, the OS could arrange to ignore the timeouts, block
page table mutations to prevent invalidations, or disable and block ATS.

The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends that software
disable and block ATS before initiating a Function Level Reset. It also
notes that other reset methods can have the same vulnerability.

Provide a callback from the PCI subsystem that will enclose the reset and
have the iommu core temporarily change all the attached RID/PASID domains
to group->blocking_domain, so that the IOMMU hardware fences any incoming
ATS queries. IOMMU drivers should also synchronously stop issuing new ATS
invalidations and wait for all outstanding invalidations to complete.
This avoids any ATS invalidation timeouts.

However, if a domain attachment/replacement happens during an ongoing
reset, ATS routines may be re-activated between the two function calls
(the sketch below shows the intended pairing).
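As a rough sketch of that pairing (the PCI-side wiring is not part of
these patches, so the caller below is hypothetical and pcie_flr() merely
stands in for whichever core-level reset method is in use; only the two
iommu helpers come from this series):

	/*
	 * Hypothetical PCI-core caller bracketing a reset with the two
	 * helpers added by this patch. Error handling is minimal.
	 */
	static int pci_reset_with_iommu_blocked(struct pci_dev *pdev)
	{
		int ret;

		/* Fence translation and ATS before the reset begins */
		ret = pci_dev_reset_iommu_prepare(pdev);
		if (ret)
			return ret;

		ret = pcie_flr(pdev);

		/* Re-attach the retained RID/PASID domains */
		pci_dev_reset_iommu_done(pdev);
		return ret;
	}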
So, introduce a new resetting_domain in the iommu_group structure to
reject any concurrent attach_dev/set_dev_pasid call during a reset, out
of concern for compatibility failures. Since this changes the behavior
of an attach operation, update the uAPI accordingly.

Note that there are two corner cases:

1. Devices in the same iommu_group
   Since an attachment is always per iommu_group, any sibling devices in
   the iommu_group cannot change domains during the reset either, to
   prevent race conditions.

2. An SR-IOV PF that is being reset while its VF is not
   In such a case, the VF itself is already broken. So, there is no
   point in preventing the PF from going through the iommu reset.

Reviewed-by: Lu Baolu
Reviewed-by: Kevin Tian
Reviewed-by: Jason Gunthorpe
Tested-by: Dheeraj Kumar Srivastava
Signed-off-by: Nicolin Chen
Signed-off-by: Joerg Roedel
---
 drivers/iommu/iommu.c     | 173 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iommu.h     |  13 ++++
 include/uapi/linux/vfio.h |   4 ++
 3 files changed, 190 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 672597100e9a..0665dedd91b2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -61,6 +61,11 @@ struct iommu_group {
 	int id;
 	struct iommu_domain *default_domain;
 	struct iommu_domain *blocking_domain;
+	/*
+	 * During a group device reset, @resetting_domain points to the physical
+	 * domain, while @domain points to the attached domain before the reset.
+	 */
+	struct iommu_domain *resetting_domain;
 	struct iommu_domain *domain;
 	struct list_head entry;
 	unsigned int owner_cnt;
@@ -2195,6 +2200,15 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
 
 	guard(mutex)(&dev->iommu_group->mutex);
+	/*
+	 * This is a concurrent attach during a device reset. Reject it until
+	 * pci_dev_reset_iommu_done() attaches the device to group->domain.
+	 *
+	 * Note that this might fail the iommu_dma_map(). But there's nothing
+	 * more we can do here.
+	 */
+	if (dev->iommu_group->resetting_domain)
+		return -EBUSY;
 	return __iommu_attach_device(domain, dev, NULL);
 }
@@ -2253,6 +2267,17 @@ struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev)
 
 	lockdep_assert_held(&group->mutex);
 
+	/*
+	 * Driver handles the low-level __iommu_attach_device(), including the
+	 * one invoked by pci_dev_reset_iommu_done() re-attaching the device to
+	 * the cached group->domain. In this case, the driver must get the old
+	 * domain from group->resetting_domain rather than group->domain. This
+	 * prevents it from re-attaching the device from group->domain (old) to
+	 * group->domain (new).
+	 */
+	if (group->resetting_domain)
+		return group->resetting_domain;
+
 	return group->domain;
 }
 EXPORT_SYMBOL_GPL(iommu_driver_get_domain_for_dev);
@@ -2409,6 +2434,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
 	if (WARN_ON(!new_domain))
 		return -EINVAL;
 
+	/*
+	 * This is a concurrent attach during a device reset. Reject it until
+	 * pci_dev_reset_iommu_done() attaches the device to group->domain.
+	 */
+	if (group->resetting_domain)
+		return -EBUSY;
+
 	/*
 	 * Changing the domain is done by calling attach_dev() on the new
 	 * domain. This switch does not have to be atomic and DMA can be
@@ -3527,6 +3559,16 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
 		return -EINVAL;
 
 	mutex_lock(&group->mutex);
+
+	/*
+	 * This is a concurrent attach during a device reset. Reject it until
+	 * pci_dev_reset_iommu_done() attaches the device to group->domain.
+	 */
+	if (group->resetting_domain) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	for_each_group_device(group, device) {
 		/*
 		 * Skip PASID validation for devices without PASID support
@@ -3610,6 +3652,16 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
 		return -EINVAL;
 
 	mutex_lock(&group->mutex);
+
+	/*
+	 * This is a concurrent attach during a device reset. Reject it until
+	 * pci_dev_reset_iommu_done() attaches the device to group->domain.
+	 */
+	if (group->resetting_domain) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	entry = iommu_make_pasid_array_entry(domain, handle);
 	curr = xa_cmpxchg(&group->pasid_array, pasid, NULL,
 			  XA_ZERO_ENTRY, GFP_KERNEL);
@@ -3867,6 +3919,127 @@ err_unlock:
 }
 EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
 
+/**
+ * pci_dev_reset_iommu_prepare() - Block IOMMU to prepare for a PCI device reset
+ * @pdev: PCI device that is going to enter a reset routine
+ *
+ * The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends disabling and
+ * blocking ATS before initiating a reset. This means that a PCIe device going
+ * through a reset routine wants any IOMMU activity blocked: translation and
+ * ATS invalidation.
+ *
+ * This function attaches the device's RID/PASID(s) to the
+ * group->blocking_domain, setting the group->resetting_domain. This allows the
+ * IOMMU driver to pause any IOMMU activity while leaving the group->domain
+ * pointer intact. Later, when the reset is finished,
+ * pci_dev_reset_iommu_done() can restore everything.
+ *
+ * Caller must pair pci_dev_reset_iommu_prepare() with
+ * pci_dev_reset_iommu_done() before/after the core-level reset routine, so
+ * that the resetting_domain gets unset again.
+ *
+ * Return: 0 on success or negative error code if the preparation failed.
+ *
+ * These two functions are designed to be used by PCI reset functions that
+ * would not invoke any racy iommu_release_device(), since the PCI sysfs node
+ * gets removed before it notifies with a BUS_NOTIFY_REMOVED_DEVICE. When using
+ * them in other cases, callers must ensure there will be no racing
+ * iommu_release_device() call, which would otherwise use-after-free the
+ * dev->iommu_group pointer.
+ */
+int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
+{
+	struct iommu_group *group = pdev->dev.iommu_group;
+	unsigned long pasid;
+	void *entry;
+	int ret;
+
+	if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
+		return 0;
+
+	guard(mutex)(&group->mutex);
+
+	/* Re-entry is not allowed */
+	if (WARN_ON(group->resetting_domain))
+		return -EBUSY;
+
+	ret = __iommu_group_alloc_blocking_domain(group);
+	if (ret)
+		return ret;
+
+	/* Stage RID domain at blocking_domain while retaining group->domain */
+	if (group->domain != group->blocking_domain) {
+		ret = __iommu_attach_device(group->blocking_domain, &pdev->dev,
+					    group->domain);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Stage PASID domains at blocking_domain while retaining pasid_array.
+	 *
+	 * The pasid_array is mostly fenced by group->mutex, except one reader
+	 * in iommu_attach_handle_get(), so it's safe to read without xa_lock.
+	 */
+	xa_for_each_start(&group->pasid_array, pasid, entry, 1)
+		iommu_remove_dev_pasid(&pdev->dev, pasid,
+				       pasid_array_entry_to_domain(entry));
+
+	group->resetting_domain = group->blocking_domain;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare);
+
+/**
+ * pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done
+ * @pdev: PCI device that has finished a reset routine
+ *
+ * After a PCIe device finishes a reset routine, it wants to restore its IOMMU
+ * activity, including new translations as well as cache invalidations, by
+ * re-attaching all RID/PASIDs of the device back to the domains retained in
+ * the core-level structure.
+ *
+ * Caller must pair it with a successful pci_dev_reset_iommu_prepare().
+ *
+ * Note that, although unlikely, there is a risk that re-attaching domains
+ * might fail due to something unexpected such as OOM.
+ */
+void pci_dev_reset_iommu_done(struct pci_dev *pdev)
+{
+	struct iommu_group *group = pdev->dev.iommu_group;
+	unsigned long pasid;
+	void *entry;
+
+	if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
+		return;
+
+	guard(mutex)(&group->mutex);
+
+	/* pci_dev_reset_iommu_prepare() was bypassed for the device */
+	if (!group->resetting_domain)
+		return;
+
+	/* pci_dev_reset_iommu_prepare() was not successfully called */
+	if (WARN_ON(!group->blocking_domain))
+		return;
+
+	/* Re-attach RID domain back to group->domain */
+	if (group->domain != group->blocking_domain) {
+		WARN_ON(__iommu_attach_device(group->domain, &pdev->dev,
+					      group->blocking_domain));
+	}
+
+	/*
+	 * Re-attach PASID domains back to the domains retained in pasid_array.
+	 *
+	 * The pasid_array is mostly fenced by group->mutex, except one reader
+	 * in iommu_attach_handle_get(), so it's safe to read without xa_lock.
+	 */
+	xa_for_each_start(&group->pasid_array, pasid, entry, 1)
+		WARN_ON(__iommu_set_group_pasid(
+			pasid_array_entry_to_domain(entry), group, pasid,
+			group->blocking_domain));
+
+	group->resetting_domain = NULL;
+}
+EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
+
 #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
 /**
  * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ff097df318b9..54b8b48c762e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1188,6 +1188,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
 			       ioasid_t pasid);
 ioasid_t iommu_alloc_global_pasid(struct device *dev);
 void iommu_free_global_pasid(ioasid_t pasid);
+
+/* PCI device reset functions */
+int pci_dev_reset_iommu_prepare(struct pci_dev *pdev);
+void pci_dev_reset_iommu_done(struct pci_dev *pdev);
 #else /* CONFIG_IOMMU_API */
 struct iommu_ops {};
@@ -1511,6 +1515,15 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev)
 }
 
 static inline void iommu_free_global_pasid(ioasid_t pasid) {}
+
+static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
+{
+	return 0;
+}
+
+static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev)
+{
+}
 #endif /* CONFIG_IOMMU_API */
 
 #ifdef CONFIG_IRQ_MSI_IOMMU
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ac2329f24141..bb7b89330d35 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -964,6 +964,10 @@ struct vfio_device_bind_iommufd {
  * hwpt corresponding to the given pt_id.
  *
  * Return: 0 on success, -errno on failure.
+ *
+ * When a device is resetting, -EBUSY will be returned to reject any concurrent
+ * attachment to the resetting device itself or to any sibling device in the
+ * IOMMU group that contains the resetting device.
 */
 struct vfio_device_attach_iommufd_pt {
 	__u32 argsz;
--
cgit v1.2.3


From 466ae6978a5b8c6022bd4537fbfd00e94bb07219 Mon Sep 17 00:00:00 2001
From: Mostafa Saleh
Date: Fri, 9 Jan 2026 17:18:02 +0000
Subject: iommu: Add page_ext for IOMMU_DEBUG_PAGEALLOC

Add a new config IOMMU_DEBUG_PAGEALLOC, which registers new data with
page_ext. This config will be used by the IOMMU API to track pages mapped
in the IOMMU, to catch drivers that free kernel memory while they still
map it in their domains, causing all kinds of memory corruption.

This behaviour is disabled by default and can be enabled with the kernel
command-line option iommu.debug_pagealloc.

Acked-by: David Hildenbrand (Red Hat)
Reviewed-by: Pranjal Shrivastava
Reviewed-by: Lu Baolu
Signed-off-by: Mostafa Saleh
Signed-off-by: Joerg Roedel
---
 Documentation/admin-guide/kernel-parameters.txt |  9 +++++++
 drivers/iommu/Kconfig                           | 19 +++++++++++++++
 drivers/iommu/Makefile                          |  1 +
 drivers/iommu/iommu-debug-pagealloc.c           | 32 +++++++++++++++++++++++++
 include/linux/iommu-debug-pagealloc.h           | 17 +++++++++++++
 mm/page_ext.c                                   |  4 ++++
 6 files changed, 82 insertions(+)
 create mode 100644 drivers/iommu/iommu-debug-pagealloc.c
 create mode 100644 include/linux/iommu-debug-pagealloc.h

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a8d0afde7f85..d484d9d8d0a4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2675,6 +2675,15 @@ Kernel parameters
 			1 - Bypass the IOMMU for DMA.
 			unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.
 
+	iommu.debug_pagealloc=
+			[KNL,EARLY] When CONFIG_IOMMU_DEBUG_PAGEALLOC is set, this
+			parameter enables the feature at boot time. By default, it
+			is disabled and the system behaves the same way as a kernel
+			built without CONFIG_IOMMU_DEBUG_PAGEALLOC.
+			Format: { "0" | "1" }
+			0 - Sanitizer disabled.
+			1 - Sanitizer enabled, expect runtime overhead.
+
 	io7=		[HW] IO7 for Marvel-based Alpha systems
 			See comment before marvel_specify_io7
 			in arch/alpha/kernel/core_marvel.c.
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 99095645134f..f86262b11416 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -384,6 +384,25 @@ config SPRD_IOMMU
 
 	  Say Y here if you want to use the multimedia devices listed above.
 
+config IOMMU_DEBUG_PAGEALLOC
+	bool "Debug IOMMU mappings against page allocations"
+	depends on DEBUG_PAGEALLOC && IOMMU_API && PAGE_EXTENSION
+	help
+	  This enables a consistency check between the kernel page allocator and
+	  the IOMMU subsystem. It verifies that pages being allocated or freed
+	  are not currently mapped in any IOMMU domain.
+
+	  This helps detect DMA use-after-free bugs where a driver frees a page
+	  but forgets to unmap it from the IOMMU, potentially allowing a device
+	  to overwrite memory that the kernel has repurposed.
+
+	  These checks are best-effort and may not detect all problems.
+
+	  Due to performance overhead, this feature is disabled by default.
+	  You must enable "iommu.debug_pagealloc" from the kernel command
+	  line to activate the runtime checks.
+
+	  If unsure, say N.
 endif # IOMMU_SUPPORT
 
 source "drivers/iommu/generic_pt/Kconfig"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 8e8843316c4b..0275821f4ef9 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
 obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
 obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
 obj-$(CONFIG_APPLE_DART) += apple-dart.o
+obj-$(CONFIG_IOMMU_DEBUG_PAGEALLOC) += iommu-debug-pagealloc.o
diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
new file mode 100644
index 000000000000..4022e9af7f27
--- /dev/null
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 - Google Inc
+ * Author: Mostafa Saleh
+ * IOMMU API debug page alloc sanitizer
+ */
+#include
+#include
+#include
+#include
+
+static bool needed;
+
+struct iommu_debug_metadata {
+	atomic_t ref;
+};
+
+static __init bool need_iommu_debug(void)
+{
+	return needed;
+}
+
+struct page_ext_operations page_iommu_debug_ops = {
+	.size = sizeof(struct iommu_debug_metadata),
+	.need = need_iommu_debug,
+};
+
+static int __init iommu_debug_pagealloc(char *str)
+{
+	return kstrtobool(str, &needed);
+}
+early_param("iommu.debug_pagealloc", iommu_debug_pagealloc);
diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h
new file mode 100644
index 000000000000..83e64d70bf6c
--- /dev/null
+++ b/include/linux/iommu-debug-pagealloc.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 - Google Inc
+ * Author: Mostafa Saleh
+ * IOMMU API debug page alloc sanitizer
+ */
+
+#ifndef __LINUX_IOMMU_DEBUG_PAGEALLOC_H
+#define __LINUX_IOMMU_DEBUG_PAGEALLOC_H
+
+#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
+
+extern struct page_ext_operations page_iommu_debug_ops;
+
+#endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */
+
+#endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d7396a8970e5..297e4cd8ce90 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 
 /*
  * struct page extension
@@ -89,6 +90,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
 #ifdef CONFIG_PAGE_TABLE_CHECK
 	&page_table_check_ops,
 #endif
+#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
+	&page_iommu_debug_ops,
+#endif
 };
 
 unsigned long page_ext_size;
--
cgit v1.2.3


From ccc21213f013834b484cdcc738e282f963fcfc97 Mon Sep 17 00:00:00 2001
From: Mostafa Saleh
Date: Fri, 9 Jan 2026 17:18:03 +0000
Subject: iommu: Add calls for IOMMU_DEBUG_PAGEALLOC

Add calls for the new iommu debug config IOMMU_DEBUG_PAGEALLOC:
- iommu_debug_init: Enable the debug mode if configured by the user.
- iommu_debug_map: Track pages mapped in the IOMMU, keyed by physical
  address.
- iommu_debug_unmap_begin: Track the start of an iommu unmap operation,
  with IOVA and size.
- iommu_debug_unmap_end: Track the end of an unmap operation, passing the
  actual unmapped size versus the size tracked at unmap_begin.

We have to split unmap into begin/end because, once pages are unmapped,
we lose the physical address information. This is racy, but the API is
racy by construction: it uses refcounts and doesn't attempt to lock or
synchronize with the IOMMU API, as that would be costly. This means the
possibility of false negatives exists.
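As an illustrative sketch of what the map-side hook is expected to do
once the counting logic lands (this patch only adds empty stubs; the
get_iommu_data() helper and the metadata struct come from the
surrounding patches, while the loop below is an assumption about the
eventual implementation, not a copy of it):

	/*
	 * Sketch only: bump the per-page refcount for every IOMMU page
	 * covered by the mapping, skipping non-RAM physical addresses.
	 */
	void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys,
			       size_t size)
	{
		size_t pgsize = 1UL << __ffs(domain->pgsize_bitmap);
		size_t off;

		for (off = 0; off < size; off += pgsize) {
			struct page *page =
				pfn_to_online_page(PHYS_PFN(phys + off));
			struct page_ext *page_ext;

			if (!page)
				continue;	/* MMIO or a hole: not tracked */

			page_ext = page_ext_get(page);
			atomic_inc(&get_iommu_data(page_ext)->ref);
			page_ext_put(page_ext);
		}
	}

The unmap_end hook would walk the same range and decrement the counts
for the bytes actually unmapped.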
Reviewed-by: Samiullah Khawaja
Reviewed-by: Lu Baolu
Reviewed-by: Pranjal Shrivastava
Signed-off-by: Mostafa Saleh
Signed-off-by: Joerg Roedel
---
 drivers/iommu/iommu-debug-pagealloc.c | 28 +++++++++++++++++
 drivers/iommu/iommu-priv.h            | 58 +++++++++++++++++++++++++++++++++++
 drivers/iommu/iommu.c                 | 11 +++++--
 include/linux/iommu-debug-pagealloc.h |  1 +
 4 files changed, 96 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
index 4022e9af7f27..1d343421da98 100644
--- a/drivers/iommu/iommu-debug-pagealloc.c
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -5,11 +5,15 @@
  * IOMMU API debug page alloc sanitizer
 */
 #include
+#include
 #include
 #include
 #include
+
+#include "iommu-priv.h"
+
 static bool needed;
+DEFINE_STATIC_KEY_FALSE(iommu_debug_initialized);
 
 struct iommu_debug_metadata {
 	atomic_t ref;
@@ -25,6 +29,30 @@ struct page_ext_operations page_iommu_debug_ops = {
 	.need = need_iommu_debug,
 };
 
+void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
+{
+}
+
+void __iommu_debug_unmap_begin(struct iommu_domain *domain,
+			       unsigned long iova, size_t size)
+{
+}
+
+void __iommu_debug_unmap_end(struct iommu_domain *domain,
+			     unsigned long iova, size_t size,
+			     size_t unmapped)
+{
+}
+
+void iommu_debug_init(void)
+{
+	if (!needed)
+		return;
+
+	pr_info("iommu: Debugging page allocations, expect overhead or disable iommu.debug_pagealloc");
+	static_branch_enable(&iommu_debug_initialized);
+}
+
 static int __init iommu_debug_pagealloc(char *str)
 {
 	return kstrtobool(str, &needed);
diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
index c95394cd03a7..aaffad5854fc 100644
--- a/drivers/iommu/iommu-priv.h
+++ b/drivers/iommu/iommu-priv.h
@@ -5,6 +5,7 @@
 #define __LINUX_IOMMU_PRIV_H
 
 #include
+#include
 #include
 
 static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
@@ -65,4 +66,61 @@ static inline int iommufd_sw_msi(struct iommu_domain *domain,
 int iommu_replace_device_pasid(struct iommu_domain *domain,
 			       struct device *dev, ioasid_t pasid,
 			       struct iommu_attach_handle *handle);
+
+#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
+
+void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys,
+		       size_t size);
+void __iommu_debug_unmap_begin(struct iommu_domain *domain,
+			       unsigned long iova, size_t size);
+void __iommu_debug_unmap_end(struct iommu_domain *domain,
+			     unsigned long iova, size_t size, size_t unmapped);
+
+static inline void iommu_debug_map(struct iommu_domain *domain,
+				   phys_addr_t phys, size_t size)
+{
+	if (static_branch_unlikely(&iommu_debug_initialized))
+		__iommu_debug_map(domain, phys, size);
+}
+
+static inline void iommu_debug_unmap_begin(struct iommu_domain *domain,
+					   unsigned long iova, size_t size)
+{
+	if (static_branch_unlikely(&iommu_debug_initialized))
+		__iommu_debug_unmap_begin(domain, iova, size);
+}
+
+static inline void iommu_debug_unmap_end(struct iommu_domain *domain,
+					 unsigned long iova, size_t size,
+					 size_t unmapped)
+{
+	if (static_branch_unlikely(&iommu_debug_initialized))
+		__iommu_debug_unmap_end(domain, iova, size, unmapped);
+}
+
+void iommu_debug_init(void);
+
+#else
+static inline void iommu_debug_map(struct iommu_domain *domain,
+				   phys_addr_t phys, size_t size)
+{
+}
+
+static inline void iommu_debug_unmap_begin(struct iommu_domain *domain,
+					   unsigned long iova, size_t size)
+{
+}
+
+static inline void iommu_debug_unmap_end(struct iommu_domain *domain,
+					 unsigned long iova, size_t size,
+					 size_t unmapped)
+{
+}
+
+static inline void iommu_debug_init(void)
+{
+}
+
+#endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */
+
 #endif /* __LINUX_IOMMU_PRIV_H */
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 0665dedd91b2..585b13bcc8cf 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -237,6 +237,8 @@ static int __init iommu_subsys_init(void)
 	if (!nb)
 		return -ENOMEM;
 
+	iommu_debug_init();
+
 	for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
 		nb[i].notifier_call = iommu_bus_notifier;
 		bus_register_notifier(iommu_buses[i], &nb[i]);
@@ -2629,10 +2631,12 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova,
 	}
 
 	/* unroll mapping in case something went wrong */
-	if (ret)
+	if (ret) {
 		iommu_unmap(domain, orig_iova, orig_size - size);
-	else
+	} else {
 		trace_map(orig_iova, orig_paddr, orig_size);
+		iommu_debug_map(domain, orig_paddr, orig_size);
+	}
 
 	return ret;
 }
@@ -2694,6 +2698,8 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 
 	pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size);
 
+	iommu_debug_unmap_begin(domain, iova, size);
+
 	/*
 	 * Keep iterating until we either unmap 'size' bytes (or more)
 	 * or we hit an area that isn't mapped.
@@ -2714,6 +2720,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 	}
 
 	trace_unmap(orig_iova, size, unmapped);
+	iommu_debug_unmap_end(domain, orig_iova, size, unmapped);
 
 	return unmapped;
 }
diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h
index 83e64d70bf6c..a439d6815ca1 100644
--- a/include/linux/iommu-debug-pagealloc.h
+++ b/include/linux/iommu-debug-pagealloc.h
@@ -9,6 +9,7 @@
 #define __LINUX_IOMMU_DEBUG_PAGEALLOC_H
 
 #ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
+DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized);
 
 extern struct page_ext_operations page_iommu_debug_ops;
--
cgit v1.2.3


From a8258ffed2efdf533bdc756141eeb7bc5301ad4f Mon Sep 17 00:00:00 2001
From: Mostafa Saleh
Date: Fri, 9 Jan 2026 17:18:05 +0000
Subject: iommu: debug-pagealloc: Check mapped/unmapped kernel memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that page_ext holds a count of IOMMU mappings, we can use it to
assert that any page being allocated or freed is indeed not mapped in
the IOMMU.

The sanitizer doesn’t protect against mapping/unmapping during this
window. However, that’s less harmful, as the page is not used by the
kernel.
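For illustration, the bug class this catches looks like the following
(a hypothetical driver snippet; "domain" stands for an attached
iommu_domain obtained elsewhere, the IOVA is arbitrary, and error
handling is elided):

	struct page *page = alloc_page(GFP_KERNEL);
	unsigned long iova = 0x100000;

	iommu_map(domain, iova, page_to_phys(page), PAGE_SIZE,
		  IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);

	/* BUG: the iommu_unmap(domain, iova, PAGE_SIZE) call is missing */
	__free_page(page);

With the sanitizer enabled, the free path reaches
debug_pagealloc_unmap_pages() and the non-zero per-page refcount
triggers the "Detected page leak!" warning together with the page owner
dump.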
Reviewed-by: Samiullah Khawaja
Reviewed-by: Lu Baolu
Signed-off-by: Mostafa Saleh
Reviewed-by: Pranjal Shrivastava
Signed-off-by: Joerg Roedel
---
 drivers/iommu/iommu-debug-pagealloc.c | 23 +++++++++++++++++++++++
 include/linux/iommu-debug-pagealloc.h | 14 ++++++++++++++
 include/linux/mm.h                    |  5 +++++
 3 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
index 9eb49e1230ee..c080a38f45a4 100644
--- a/drivers/iommu/iommu-debug-pagealloc.c
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 
 #include "iommu-priv.h"
 
@@ -73,6 +74,28 @@ static size_t iommu_debug_page_size(struct iommu_domain *domain)
 	return 1UL << __ffs(domain->pgsize_bitmap);
 }
 
+static bool iommu_debug_page_count(const struct page *page)
+{
+	unsigned int ref;
+	struct page_ext *page_ext = page_ext_get(page);
+	struct iommu_debug_metadata *d = get_iommu_data(page_ext);
+
+	ref = atomic_read(&d->ref);
+	page_ext_put(page_ext);
+	return ref != 0;
+}
+
+void __iommu_debug_check_unmapped(const struct page *page, int numpages)
+{
+	while (numpages--) {
+		if (WARN_ON(iommu_debug_page_count(page))) {
+			pr_warn("iommu: Detected page leak!\n");
+			dump_page_owner(page);
+		}
+		page++;
+	}
+}
+
 void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
 {
 	size_t off, end;
diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h
index a439d6815ca1..46c3c1f70150 100644
--- a/include/linux/iommu-debug-pagealloc.h
+++ b/include/linux/iommu-debug-pagealloc.h
@@ -13,6 +13,20 @@ DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized);
 
 extern struct page_ext_operations page_iommu_debug_ops;
 
+void __iommu_debug_check_unmapped(const struct page *page, int numpages);
+
+static inline void iommu_debug_check_unmapped(const struct page *page, int numpages)
+{
+	if (static_branch_unlikely(&iommu_debug_initialized))
+		__iommu_debug_check_unmapped(page, numpages);
+}
+
+#else
+static inline void iommu_debug_check_unmapped(const struct page *page,
+					      int numpages)
+{
+}
+
 #endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */
 
 #endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6f959d8ca4b4..32205d2a24b2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include
 
 struct mempolicy;
 struct anon_vma;
@@ -4133,12 +4134,16 @@ extern void __kernel_map_pages(struct page *page, int numpages, int enable);
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
 {
+	iommu_debug_check_unmapped(page, numpages);
+
 	if (debug_pagealloc_enabled_static())
 		__kernel_map_pages(page, numpages, 1);
 }
 
 static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
 {
+	iommu_debug_check_unmapped(page, numpages);
+
 	if (debug_pagealloc_enabled_static())
 		__kernel_map_pages(page, numpages, 0);
 }
--
cgit v1.2.3


From d414b83dc5f90a6a9a656cd6fbb9378ddc824032 Mon Sep 17 00:00:00 2001
From: Mostafa Saleh
Date: Tue, 20 Jan 2026 09:19:25 +0000
Subject: mm/page_ext: Add page_ext_from_phys()

The IOMMU code operates on physical addresses which can be outside of
system RAM. Add a new function page_ext_from_phys() to abstract the
logic of checking the address and returning the page_ext.
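A minimal usage sketch (the IOMMU-side caller is expected to look
roughly like this; phys_is_iommu_mapped() is a made-up name for
illustration, while get_iommu_data() comes from the earlier
debug-pagealloc patches):

	static bool phys_is_iommu_mapped(phys_addr_t phys)
	{
		struct page_ext *page_ext = page_ext_from_phys(phys);
		bool mapped;

		if (!page_ext)
			return false;	/* MMIO, hole or offline: not tracked */

		mapped = atomic_read(&get_iommu_data(page_ext)->ref) != 0;
		page_ext_put(page_ext);
		return mapped;
	}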
Signed-off-by: Mostafa Saleh
Acked-by: Vlastimil Babka
Signed-off-by: Joerg Roedel
---
 include/linux/page_ext.h |  6 ++++++
 mm/page_ext.c            | 23 +++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 76c817162d2f..61e876e255e8 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -93,6 +93,7 @@ static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn)
 #endif
 
 extern struct page_ext *page_ext_get(const struct page *page);
+extern struct page_ext *page_ext_from_phys(phys_addr_t phys);
 extern void page_ext_put(struct page_ext *page_ext);
 extern struct page_ext *page_ext_lookup(unsigned long pfn);
 
@@ -215,6 +216,11 @@ static inline struct page_ext *page_ext_get(const struct page *page)
 	return NULL;
 }
 
+static inline struct page_ext *page_ext_from_phys(phys_addr_t phys)
+{
+	return NULL;
+}
+
 static inline void page_ext_put(struct page_ext *page_ext)
 {
 }
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 297e4cd8ce90..e2e92bd27ebd 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -538,6 +538,29 @@ struct page_ext *page_ext_get(const struct page *page)
 	return page_ext;
 }
 
+/**
+ * page_ext_from_phys() - Get the page_ext structure for a physical address.
+ * @phys: The physical address to query.
+ *
+ * This function safely gets the struct page_ext associated with a given
+ * physical address. It performs validation to ensure the address corresponds
+ * to a valid, online struct page before attempting to access it.
+ * It returns NULL for MMIO, ZONE_DEVICE, holes and offline memory.
+ *
+ * Return: the page_ext for @phys, or NULL if no page_ext exists for this
+ * physical address.
+ * Context: Any context. Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_from_phys(phys_addr_t phys)
+{
+	struct page *page = pfn_to_online_page(__phys_to_pfn(phys));
+
+	if (!page)
+		return NULL;
+
+	return page_ext_get(page);
+}
+
 /**
  * page_ext_put() - Working with page extended information is done.
  * @page_ext: Page extended information received from page_ext_get().
--
cgit v1.2.3