Diffstat (limited to 'drivers/iommu')
-rw-r--r--  drivers/iommu/Kconfig | 15
-rw-r--r--  drivers/iommu/Makefile | 2
-rw-r--r--  drivers/iommu/amd/Kconfig | 5
-rw-r--r--  drivers/iommu/amd/Makefile | 2
-rw-r--r--  drivers/iommu/amd/amd_iommu.h | 1
-rw-r--r--  drivers/iommu/amd/amd_iommu_types.h | 114
-rw-r--r--  drivers/iommu/amd/debugfs.c | 2
-rw-r--r--  drivers/iommu/amd/init.c | 15
-rw-r--r--  drivers/iommu/amd/io_pgtable.c | 577
-rw-r--r--  drivers/iommu/amd/io_pgtable_v2.c | 370
-rw-r--r--  drivers/iommu/amd/iommu.c | 572
-rw-r--r--  drivers/iommu/apple-dart.c | 11
-rw-r--r--  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 18
-rw-r--r--  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33
-rw-r--r--  drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 28
-rw-r--r--  drivers/iommu/arm/arm-smmu/arm-smmu.c | 9
-rw-r--r--  drivers/iommu/arm/arm-smmu/qcom_iommu.c | 21
-rw-r--r--  drivers/iommu/dma-iommu.c | 9
-rw-r--r--  drivers/iommu/exynos-iommu.c | 20
-rw-r--r--  drivers/iommu/fsl_pamu_domain.c | 12
-rw-r--r--  drivers/iommu/generic_pt/.kunitconfig | 14
-rw-r--r--  drivers/iommu/generic_pt/Kconfig | 79
-rw-r--r--  drivers/iommu/generic_pt/fmt/Makefile | 28
-rw-r--r--  drivers/iommu/generic_pt/fmt/amdv1.h | 411
-rw-r--r--  drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21
-rw-r--r--  drivers/iommu/generic_pt/fmt/defs_vtdss.h | 21
-rw-r--r--  drivers/iommu/generic_pt/fmt/defs_x86_64.h | 21
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_mock.c | 10
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_template.h | 48
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_vtdss.c | 10
-rw-r--r--  drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 11
-rw-r--r--  drivers/iommu/generic_pt/fmt/vtdss.h | 285
-rw-r--r--  drivers/iommu/generic_pt/fmt/x86_64.h | 279
-rw-r--r--  drivers/iommu/generic_pt/iommu_pt.h | 1289
-rw-r--r--  drivers/iommu/generic_pt/kunit_generic_pt.h | 823
-rw-r--r--  drivers/iommu/generic_pt/kunit_iommu.h | 184
-rw-r--r--  drivers/iommu/generic_pt/kunit_iommu_pt.h | 487
-rw-r--r--  drivers/iommu/generic_pt/pt_common.h | 389
-rw-r--r--  drivers/iommu/generic_pt/pt_defs.h | 332
-rw-r--r--  drivers/iommu/generic_pt/pt_fmt_defaults.h | 295
-rw-r--r--  drivers/iommu/generic_pt/pt_iter.h | 636
-rw-r--r--  drivers/iommu/generic_pt/pt_log2.h | 122
-rw-r--r--  drivers/iommu/intel/Kconfig | 6
-rw-r--r--  drivers/iommu/intel/iommu.c | 931
-rw-r--r--  drivers/iommu/intel/iommu.h | 99
-rw-r--r--  drivers/iommu/intel/nested.c | 7
-rw-r--r--  drivers/iommu/intel/pasid.c | 44
-rw-r--r--  drivers/iommu/intel/pasid.h | 1
-rw-r--r--  drivers/iommu/intel/svm.c | 1
-rw-r--r--  drivers/iommu/io-pgtable-arm-selftests.c | 214
-rw-r--r--  drivers/iommu/io-pgtable-arm.c | 203
-rw-r--r--  drivers/iommu/io-pgtable.c | 4
-rw-r--r--  drivers/iommu/iommu-pages.c | 136
-rw-r--r--  drivers/iommu/iommu-pages.h | 51
-rw-r--r--  drivers/iommu/iommu.c | 44
-rw-r--r--  drivers/iommu/iommufd/Kconfig | 1
-rw-r--r--  drivers/iommu/iommufd/driver.c | 2
-rw-r--r--  drivers/iommu/iommufd/io_pagetable.c | 90
-rw-r--r--  drivers/iommu/iommufd/io_pagetable.h | 54
-rw-r--r--  drivers/iommu/iommufd/ioas.c | 12
-rw-r--r--  drivers/iommu/iommufd/iommufd_private.h | 18
-rw-r--r--  drivers/iommu/iommufd/iommufd_test.h | 21
-rw-r--r--  drivers/iommu/iommufd/iova_bitmap.c | 5
-rw-r--r--  drivers/iommu/iommufd/main.c | 10
-rw-r--r--  drivers/iommu/iommufd/pages.c | 414
-rw-r--r--  drivers/iommu/iommufd/selftest.c | 569
-rw-r--r--  drivers/iommu/ipmmu-vmsa.c | 12
-rw-r--r--  drivers/iommu/msm_iommu.c | 11
-rw-r--r--  drivers/iommu/mtk_iommu.c | 174
-rw-r--r--  drivers/iommu/mtk_iommu_v1.c | 35
-rw-r--r--  drivers/iommu/omap-iommu.c | 19
-rw-r--r--  drivers/iommu/omap-iommu.h | 2
-rw-r--r--  drivers/iommu/riscv/iommu.c | 9
-rw-r--r--  drivers/iommu/rockchip-iommu.c | 20
-rw-r--r--  drivers/iommu/s390-iommu.c | 13
-rw-r--r--  drivers/iommu/sprd-iommu.c | 3
-rw-r--r--  drivers/iommu/sun50i-iommu.c | 10
-rw-r--r--  drivers/iommu/tegra-smmu.c | 15
-rw-r--r--  drivers/iommu/virtio-iommu.c | 6
80 files changed, 7990 insertions, 2922 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 70d29b14d851..99095645134f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -40,12 +40,13 @@ config IOMMU_IO_PGTABLE_LPAE
sizes at both stage-1 and stage-2, as well as address spaces
up to 48-bits in size.
-config IOMMU_IO_PGTABLE_LPAE_SELFTEST
- bool "LPAE selftests"
- depends on IOMMU_IO_PGTABLE_LPAE
+config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST
+ tristate "KUnit tests for LPAE"
+ depends on IOMMU_IO_PGTABLE_LPAE && KUNIT
+ default KUNIT_ALL_TESTS
help
- Enable self-tests for LPAE page table allocator. This performs
- a series of page-table consistency checks during boot.
+ Enable kunit tests for LPAE page table allocator. This performs
+ a series of page-table consistency checks.
If unsure, say N here.
@@ -247,7 +248,7 @@ config SUN50I_IOMMU
config TEGRA_IOMMU_SMMU
bool "NVIDIA Tegra SMMU Support"
- depends on ARCH_TEGRA
+ depends on ARCH_TEGRA || COMPILE_TEST
depends on TEGRA_AHB
depends on TEGRA_MC
select IOMMU_API
@@ -384,3 +385,5 @@ config SPRD_IOMMU
Say Y here if you want to use the multimedia devices listed above.
endif # IOMMU_SUPPORT
+
+source "drivers/iommu/generic_pt/Kconfig"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 355294fa9033..8e8843316c4b 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -3,6 +3,7 @@ obj-y += arm/ iommufd/
obj-$(CONFIG_AMD_IOMMU) += amd/
obj-$(CONFIG_INTEL_IOMMU) += intel/
obj-$(CONFIG_RISCV_IOMMU) += riscv/
+obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/
obj-$(CONFIG_IOMMU_API) += iommu.o
obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o
obj-$(CONFIG_IOMMU_API) += iommu-traces.o
@@ -12,6 +13,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o
obj-$(CONFIG_IOMMU_IOVA) += iova.o
obj-$(CONFIG_OF_IOMMU) += of_iommu.o
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index ecef69c11144..f2acf471cb5d 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -11,10 +11,13 @@ config AMD_IOMMU
select MMU_NOTIFIER
select IOMMU_API
select IOMMU_IOVA
- select IOMMU_IO_PGTABLE
select IOMMU_SVA
select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_AMDV1
+ select IOMMU_PT_X86_64
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help
With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 59c04a67f398..5412a563c697 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o
+obj-y += iommu.o init.o quirks.o ppr.o pasid.o
obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 9b4b589a54b5..25044d28f28a 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag);
* the IOMMU used by this driver.
*/
void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain);
void amd_iommu_domain_flush_pages(struct protection_domain *domain,
u64 address, size_t size);
void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index a698a2e7ce2a..78b1c44bd6b5 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -18,7 +18,7 @@
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/irqreturn.h>
-#include <linux/io-pgtable.h>
+#include <linux/generic_pt/iommu.h>
/*
* Maximum number of IOMMUs supported
@@ -247,6 +247,10 @@
#define CMD_BUFFER_ENTRIES 512
#define MMIO_CMD_SIZE_SHIFT 56
#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */
+#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x))
+#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */
+#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x))
/* constants for event buffer handling */
#define EVT_BUFFER_SIZE 8192 /* 512 entries */
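The MMIO_CMD_HEAD_MASK/MMIO_CMD_TAIL_MASK additions above describe the 15-bit pointer field at bits [18:4] of the command-buffer head/tail registers. A minimal sketch of how FIELD_GET() consumes such a mask (editorial illustration only, not part of the patch; assumes <linux/bitfield.h> and the helper name is made up):

static inline unsigned long cmd_buf_head_index(u64 reg)
{
	/*
	 * GENMASK_ULL(18, 4) selects bits [18:4] and FIELD_GET() shifts the
	 * field down to bit 0: a raw register value of 0x230 (command slot
	 * 35, since each command is 16 bytes) yields 0x23 = 35.
	 */
	return FIELD_GET(MMIO_CMD_HEAD_MASK, reg);
}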
@@ -337,76 +341,7 @@
#define GUEST_PGTABLE_4_LEVEL 0x00
#define GUEST_PGTABLE_5_LEVEL 0x01
-#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
- ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
- (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
- IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k 0
#define PM_ADDR_MASK 0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
- (~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
- ((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
- (1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
- ((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address and a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize) \
- (((address) | ((pagesize) - 1)) & \
- (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
- (1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-/*
- * Takes a page-table level and returns the default page-size for this level
- */
-#define PTE_LEVEL_PAGE_SIZE(level) \
- (1ULL << (12 + (9 * (level))))
-
-/*
- * The IOPTE dirty bit
- */
-#define IOMMU_PTE_HD_BIT (6)
-
-/*
- * Bit value definition for I/O PTE fields
- */
-#define IOMMU_PTE_PR BIT_ULL(0)
-#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
-#define IOMMU_PTE_U BIT_ULL(59)
-#define IOMMU_PTE_FC BIT_ULL(60)
-#define IOMMU_PTE_IR BIT_ULL(61)
-#define IOMMU_PTE_IW BIT_ULL(62)
/*
* Bit value definition for DTE fields
@@ -436,12 +371,6 @@
/* DTE[128:179] | DTE[184:191] */
#define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52)
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
-#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
-#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
#define IOMMU_PROT_MASK 0x03
#define IOMMU_PROT_IR 0x01
#define IOMMU_PROT_IW 0x02
@@ -534,19 +463,6 @@ struct amd_irte_ops;
#define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0)
-#define io_pgtable_to_data(x) \
- container_of((x), struct amd_io_pgtable, pgtbl)
-
-#define io_pgtable_ops_to_data(x) \
- io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
-
-#define io_pgtable_ops_to_domain(x) \
- container_of(io_pgtable_ops_to_data(x), \
- struct protection_domain, iop)
-
-#define io_pgtable_cfg_to_data(x) \
- container_of((x), struct amd_io_pgtable, pgtbl.cfg)
-
struct gcr3_tbl_info {
u64 *gcr3_tbl; /* Guest CR3 table */
int glx; /* Number of levels for GCR3 table */
@@ -554,14 +470,6 @@ struct gcr3_tbl_info {
u16 domid; /* Per device domain ID */
};
-struct amd_io_pgtable {
- seqcount_t seqcount; /* Protects root/mode update */
- struct io_pgtable pgtbl;
- int mode;
- u64 *root;
- u64 *pgd; /* v2 pgtable pgd pointer */
-};
-
enum protection_domain_mode {
PD_MODE_NONE,
PD_MODE_V1,
@@ -589,10 +497,13 @@ struct pdom_iommu_info {
* independent of their use.
*/
struct protection_domain {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ struct pt_iommu_amdv1 amdv1;
+ struct pt_iommu_x86_64 amdv2;
+ };
struct list_head dev_list; /* List of all devices in this domain */
- struct iommu_domain domain; /* generic domain handle used by
- iommu core code */
- struct amd_io_pgtable iop;
spinlock_t lock; /* mostly used to lock the page table*/
u16 id; /* the domain id written to the device table */
enum protection_domain_mode pd_mode; /* Track page table type */
@@ -602,6 +513,9 @@ struct protection_domain {
struct mmu_notifier mn; /* mmu notifier for the SVA domain */
struct list_head dev_data_list; /* List of pdom_dev_data */
};
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain);
/*
* This structure contains information about one PCI segment in the system.
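The union added to struct protection_domain above overlays the generic iommu_domain handle with the pt_iommu page-table heads, and the PT_IOMMU_CHECK_DOMAIN() lines appear to check that layout at build time. A small sketch of the resulting conversion pattern (editorial; it simply mirrors the container_of() uses elsewhere in this patch and the driver's existing to_pdomain() helper):

/* Sketch only: both views resolve to the same protection_domain. */
static inline struct protection_domain *pdom_from_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct protection_domain, domain);
}

static inline struct protection_domain *pdom_from_pt(struct pt_iommu *iommupt)
{
	return container_of(iommupt, struct protection_domain, iommu);
}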
diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c
index 10fa217a7119..20b04996441d 100644
--- a/drivers/iommu/amd/debugfs.c
+++ b/drivers/iommu/amd/debugfs.c
@@ -37,7 +37,7 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf,
if (ret)
return ret;
- if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - 4) {
+ if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) {
iommu->dbg_mmio_offset = -1;
return -EINVAL;
}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index f2991c11867c..4f4d4955269e 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1710,13 +1710,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id,
list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list);
if (alloc_dev_table(pci_seg))
- return NULL;
+ goto err_free_pci_seg;
if (alloc_alias_table(pci_seg))
- return NULL;
+ goto err_free_dev_table;
if (alloc_rlookup_table(pci_seg))
- return NULL;
+ goto err_free_alias_table;
return pci_seg;
+
+err_free_alias_table:
+ free_alias_table(pci_seg);
+err_free_dev_table:
+ free_dev_table(pci_seg);
+err_free_pci_seg:
+ list_del(&pci_seg->list);
+ kfree(pci_seg);
+ return NULL;
}
static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id,
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
deleted file mode 100644
index 70c2f5b1631b..000000000000
--- a/drivers/iommu/amd/io_pgtable.c
+++ /dev/null
@@ -1,577 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table allocator.
- *
- * Copyright (C) 2020 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#define pr_fmt(fmt) "AMD-Vi: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/dma-mapping.h>
-#include <linux/seqlock.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-/*
- * Helper function to get the first pte of a large mapping
- */
-static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
- unsigned long *count)
-{
- unsigned long pte_mask, pg_size, cnt;
- u64 *fpte;
-
- pg_size = PTE_PAGE_SIZE(*pte);
- cnt = PAGE_SIZE_PTE_COUNT(pg_size);
- pte_mask = ~((cnt << 3) - 1);
- fpte = (u64 *)(((unsigned long)pte) & pte_mask);
-
- if (page_size)
- *page_size = pg_size;
-
- if (count)
- *count = cnt;
-
- return fpte;
-}
-
-static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl)
-{
- u64 *p;
- int i;
-
- for (i = 0; i < 512; ++i) {
- /* PTE present? */
- if (!IOMMU_PTE_PRESENT(pt[i]))
- continue;
-
- /* Large PTE? */
- if (PM_PTE_LEVEL(pt[i]) == 0 ||
- PM_PTE_LEVEL(pt[i]) == 7)
- continue;
-
- /*
- * Free the next level. No need to look at l1 tables here since
- * they can only contain leaf PTEs; just free them directly.
- */
- p = IOMMU_PTE_PAGE(pt[i]);
- if (lvl > 2)
- free_pt_lvl(p, freelist, lvl - 1);
- else
- iommu_pages_list_add(freelist, p);
- }
-
- iommu_pages_list_add(freelist, pt);
-}
-
-static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist)
-{
- switch (mode) {
- case PAGE_MODE_NONE:
- case PAGE_MODE_7_LEVEL:
- break;
- case PAGE_MODE_1_LEVEL:
- iommu_pages_list_add(freelist, root);
- break;
- case PAGE_MODE_2_LEVEL:
- case PAGE_MODE_3_LEVEL:
- case PAGE_MODE_4_LEVEL:
- case PAGE_MODE_5_LEVEL:
- case PAGE_MODE_6_LEVEL:
- free_pt_lvl(root, freelist, mode);
- break;
- default:
- BUG();
- }
-}
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned int page_size_level,
- gfp_t gfp)
-{
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- struct protection_domain *domain =
- container_of(pgtable, struct protection_domain, iop);
- unsigned long flags;
- bool ret = true;
- u64 *pte;
-
- pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K);
- if (!pte)
- return false;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
- pgtable->mode - 1 >= page_size_level)
- goto out;
-
- ret = false;
- if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level))
- goto out;
-
- *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));
-
- write_seqcount_begin(&pgtable->seqcount);
- pgtable->root = pte;
- pgtable->mode += 1;
- write_seqcount_end(&pgtable->seqcount);
-
- amd_iommu_update_and_flush_device_table(domain);
-
- pte = NULL;
- ret = true;
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
- iommu_free_pages(pte);
-
- return ret;
-}
-
-static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned long page_size,
- u64 **pte_page,
- gfp_t gfp,
- bool *updated)
-{
- unsigned long last_addr = address + (page_size - 1);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- unsigned int seqcount;
- int level, end_lvl;
- u64 *pte, *page;
-
- BUG_ON(!is_power_of_2(page_size));
-
- while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
- pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
- /*
- * Return an error if there is no memory to update the
- * page-table.
- */
- if (!increase_address_space(pgtable, last_addr,
- PAGE_SIZE_LEVEL(page_size), gfp))
- return NULL;
- }
-
-
- do {
- seqcount = read_seqcount_begin(&pgtable->seqcount);
-
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
- } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
-
- address = PAGE_SIZE_ALIGN(address, page_size);
- end_lvl = PAGE_SIZE_LEVEL(page_size);
-
- while (level > end_lvl) {
- u64 __pte, __npte;
- int pte_level;
-
- __pte = *pte;
- pte_level = PM_PTE_LEVEL(__pte);
-
- /*
- * If we replace a series of large PTEs, we need
- * to tear down all of them.
- */
- if (IOMMU_PTE_PRESENT(__pte) &&
- pte_level == PAGE_MODE_7_LEVEL) {
- unsigned long count, i;
- u64 *lpte;
-
- lpte = first_pte_l7(pte, NULL, &count);
-
- /*
- * Unmap the replicated PTEs that still match the
- * original large mapping
- */
- for (i = 0; i < count; ++i)
- cmpxchg64(&lpte[i], __pte, 0ULL);
-
- *updated = true;
- continue;
- }
-
- if (!IOMMU_PTE_PRESENT(__pte) ||
- pte_level == PAGE_MODE_NONE) {
- page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp,
- SZ_4K);
-
- if (!page)
- return NULL;
-
- __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
-
- /* pte could have been changed somewhere. */
- if (!try_cmpxchg64(pte, &__pte, __npte))
- iommu_free_pages(page);
- else if (IOMMU_PTE_PRESENT(__pte))
- *updated = true;
-
- continue;
- }
-
- /* No level skipping support yet */
- if (pte_level != level)
- return NULL;
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(__pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned long *page_size)
-{
- int level;
- unsigned int seqcount;
- u64 *pte;
-
- *page_size = 0;
-
- if (address > PM_LEVEL_SIZE(pgtable->mode))
- return NULL;
-
- do {
- seqcount = read_seqcount_begin(&pgtable->seqcount);
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
- } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
- *page_size = PTE_LEVEL_PAGE_SIZE(level);
-
- while (level > 0) {
-
- /* Not Present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Large PTE */
- if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
- PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
- break;
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- /* Walk to the next level */
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- *page_size = PTE_LEVEL_PAGE_SIZE(level);
- }
-
- /*
- * If we have a series of large PTEs, make
- * sure to return a pointer to the first one.
- */
- if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
- pte = first_pte_l7(pte, page_size, NULL);
-
- return pte;
-}
-
-static void free_clear_pte(u64 *pte, u64 pteval,
- struct iommu_pages_list *freelist)
-{
- u64 *pt;
- int mode;
-
- while (!try_cmpxchg64(pte, &pteval, 0))
- pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
-
- if (!IOMMU_PTE_PRESENT(pteval))
- return;
-
- pt = IOMMU_PTE_PAGE(pteval);
- mode = IOMMU_PTE_MODE(pteval);
-
- free_sub_pt(pt, mode, freelist);
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
- bool updated = false;
- u64 __pte, *pte;
- int ret, i, count;
- size_t size = pgcount << __ffs(pgsize);
- unsigned long o_iova = iova;
-
- BUG_ON(!IS_ALIGNED(iova, pgsize));
- BUG_ON(!IS_ALIGNED(paddr, pgsize));
-
- ret = -EINVAL;
- if (!(prot & IOMMU_PROT_MASK))
- goto out;
-
- while (pgcount > 0) {
- count = PAGE_SIZE_PTE_COUNT(pgsize);
- pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);
-
- ret = -ENOMEM;
- if (!pte)
- goto out;
-
- for (i = 0; i < count; ++i)
- free_clear_pte(&pte[i], pte[i], &freelist);
-
- if (!iommu_pages_list_empty(&freelist))
- updated = true;
-
- if (count > 1) {
- __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
- __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
- } else
- __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
-
- if (prot & IOMMU_PROT_IR)
- __pte |= IOMMU_PTE_IR;
- if (prot & IOMMU_PROT_IW)
- __pte |= IOMMU_PTE_IW;
-
- for (i = 0; i < count; ++i)
- pte[i] = __pte;
-
- iova += pgsize;
- paddr += pgsize;
- pgcount--;
- if (mapped)
- *mapped += pgsize;
- }
-
- ret = 0;
-
-out:
- if (updated) {
- struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- /*
- * Flush domain TLB(s) and wait for completion. Any Device-Table
- * Updates and flushing already happened in
- * increase_address_space().
- */
- amd_iommu_domain_flush_pages(dom, o_iova, size);
- spin_unlock_irqrestore(&dom->lock, flags);
- }
-
- /* Everything flushed out, free pages now */
- iommu_put_pages_list(&freelist);
-
- return ret;
-}
-
-static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long long unmapped;
- unsigned long unmap_size;
- u64 *pte;
- size_t size = pgcount << __ffs(pgsize);
-
- BUG_ON(!is_power_of_2(pgsize));
-
- unmapped = 0;
-
- while (unmapped < size) {
- pte = fetch_pte(pgtable, iova, &unmap_size);
- if (pte) {
- int i, count;
-
- count = PAGE_SIZE_PTE_COUNT(unmap_size);
- for (i = 0; i < count; i++)
- pte[i] = 0ULL;
- } else {
- return unmapped;
- }
-
- iova = (iova & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- return unmapped;
-}
-
-static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long offset_mask, pte_pgsize;
- u64 *pte, __pte;
-
- pte = fetch_pte(pgtable, iova, &pte_pgsize);
-
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- offset_mask = pte_pgsize - 1;
- __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
- return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
- unsigned long flags)
-{
- bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
- bool dirty = false;
- int i, count;
-
- /*
- * 2.2.3.2 Host Dirty Support
- * When a non-default page size is used , software must OR the
- * Dirty bits in all of the replicated host PTEs used to map
- * the page. The IOMMU does not guarantee the Dirty bits are
- * set in all of the replicated PTEs. Any portion of the page
- * may have been written even if the Dirty bit is set in only
- * one of the replicated PTEs.
- */
- count = PAGE_SIZE_PTE_COUNT(size);
- for (i = 0; i < count && test_only; i++) {
- if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
- dirty = true;
- break;
- }
- }
-
- for (i = 0; i < count && !test_only; i++) {
- if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
- (unsigned long *)&ptep[i])) {
- dirty = true;
- }
- }
-
- return dirty;
-}
-
-static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long end = iova + size - 1;
-
- do {
- unsigned long pgsize = 0;
- u64 *ptep, pte;
-
- ptep = fetch_pte(pgtable, iova, &pgsize);
- if (ptep)
- pte = READ_ONCE(*ptep);
- if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
- pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
- iova += pgsize;
- continue;
- }
-
- /*
- * Mark the whole IOVA range as dirty even if only one of
- * the replicated PTEs were marked dirty.
- */
- if (pte_test_and_clear_dirty(ptep, pgsize, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v1_free_pgtable(struct io_pgtable *iop)
-{
- struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
- struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
-
- if (pgtable->mode == PAGE_MODE_NONE)
- return;
-
- /* Page-table is not visible to IOMMU anymore, so free it */
- BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
- pgtable->mode > amd_iommu_hpt_level);
-
- free_sub_pt(pgtable->root, pgtable->mode, &freelist);
- iommu_put_pages_list(&freelist);
-}
-
-static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
-
- pgtable->root =
- iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
- if (!pgtable->root)
- return NULL;
- pgtable->mode = PAGE_MODE_3_LEVEL;
- seqcount_init(&pgtable->seqcount);
-
- cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
- cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
- cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;
-
- pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages;
- pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages;
- pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
- pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
-
- return &pgtable->pgtbl;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
- .alloc = v1_alloc_pgtable,
- .free = v1_free_pgtable,
-};
diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c
deleted file mode 100644
index b47941353ccb..000000000000
--- a/drivers/iommu/amd/io_pgtable_v2.c
+++ /dev/null
@@ -1,370 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table v2 allocator.
- *
- * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- * Author: Vasant Hegde <vasant.hegde@amd.com>
- */
-
-#define pr_fmt(fmt) "AMD-Vi: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */
-#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */
-#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */
-#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */
-#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */
-#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */
-#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */
-#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */
-#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */
-
-#define MAX_PTRS_PER_PAGE 512
-
-#define IOMMU_PAGE_SIZE_2M BIT_ULL(21)
-#define IOMMU_PAGE_SIZE_1G BIT_ULL(30)
-
-
-static inline int get_pgtable_level(void)
-{
- return amd_iommu_gpt_level;
-}
-
-static inline bool is_large_pte(u64 pte)
-{
- return (pte & IOMMU_PAGE_PSE);
-}
-
-static inline u64 set_pgtable_attr(u64 *page)
-{
- u64 prot;
-
- prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER;
- prot |= IOMMU_PAGE_ACCESS;
-
- return (iommu_virt_to_phys(page) | prot);
-}
-
-static inline void *get_pgtable_pte(u64 pte)
-{
- return iommu_phys_to_virt(pte & PM_ADDR_MASK);
-}
-
-static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot)
-{
- u64 pte;
-
- pte = __sme_set(paddr & PM_ADDR_MASK);
- pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER;
- pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
-
- if (prot & IOMMU_PROT_IW)
- pte |= IOMMU_PAGE_RW;
-
- /* Large page */
- if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M)
- pte |= IOMMU_PAGE_PSE;
-
- return pte;
-}
-
-static inline u64 get_alloc_page_size(u64 size)
-{
- if (size >= IOMMU_PAGE_SIZE_1G)
- return IOMMU_PAGE_SIZE_1G;
-
- if (size >= IOMMU_PAGE_SIZE_2M)
- return IOMMU_PAGE_SIZE_2M;
-
- return PAGE_SIZE;
-}
-
-static inline int page_size_to_level(u64 pg_size)
-{
- if (pg_size == IOMMU_PAGE_SIZE_1G)
- return PAGE_MODE_3_LEVEL;
- if (pg_size == IOMMU_PAGE_SIZE_2M)
- return PAGE_MODE_2_LEVEL;
-
- return PAGE_MODE_1_LEVEL;
-}
-
-static void free_pgtable(u64 *pt, int level)
-{
- u64 *p;
- int i;
-
- for (i = 0; i < MAX_PTRS_PER_PAGE; i++) {
- /* PTE present? */
- if (!IOMMU_PTE_PRESENT(pt[i]))
- continue;
-
- if (is_large_pte(pt[i]))
- continue;
-
- /*
- * Free the next level. No need to look at l1 tables here since
- * they can only contain leaf PTEs; just free them directly.
- */
- p = get_pgtable_pte(pt[i]);
- if (level > 2)
- free_pgtable(p, level - 1);
- else
- iommu_free_pages(p);
- }
-
- iommu_free_pages(pt);
-}
-
-/* Allocate page table */
-static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova,
- unsigned long pg_size, gfp_t gfp, bool *updated)
-{
- u64 *pte, *page;
- int level, end_level;
-
- level = get_pgtable_level() - 1;
- end_level = page_size_to_level(pg_size);
- pte = &pgd[PM_LEVEL_INDEX(level, iova)];
- iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE);
-
- while (level >= end_level) {
- u64 __pte, __npte;
-
- __pte = *pte;
-
- if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) {
- /* Unmap large pte */
- cmpxchg64(pte, *pte, 0ULL);
- *updated = true;
- continue;
- }
-
- if (!IOMMU_PTE_PRESENT(__pte)) {
- page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K);
- if (!page)
- return NULL;
-
- __npte = set_pgtable_attr(page);
- /* pte could have been changed somewhere. */
- if (!try_cmpxchg64(pte, &__pte, __npte))
- iommu_free_pages(page);
- else if (IOMMU_PTE_PRESENT(__pte))
- *updated = true;
-
- continue;
- }
-
- level -= 1;
- pte = get_pgtable_pte(__pte);
- pte = &pte[PM_LEVEL_INDEX(level, iova)];
- }
-
- /* Tear down existing pte entries */
- if (IOMMU_PTE_PRESENT(*pte)) {
- u64 *__pte;
-
- *updated = true;
- __pte = get_pgtable_pte(*pte);
- cmpxchg64(pte, *pte, 0ULL);
- if (pg_size == IOMMU_PAGE_SIZE_1G)
- free_pgtable(__pte, end_level - 1);
- else if (pg_size == IOMMU_PAGE_SIZE_2M)
- iommu_free_pages(__pte);
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address.
- * If there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
- unsigned long iova, unsigned long *page_size)
-{
- u64 *pte;
- int level;
-
- level = get_pgtable_level() - 1;
- pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)];
- /* Default page size is 4K */
- *page_size = PAGE_SIZE;
-
- while (level) {
- /* Not present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Walk to the next level */
- pte = get_pgtable_pte(*pte);
- pte = &pte[PM_LEVEL_INDEX(level - 1, iova)];
-
- /* Large page */
- if (is_large_pte(*pte)) {
- if (level == PAGE_MODE_3_LEVEL)
- *page_size = IOMMU_PAGE_SIZE_1G;
- else if (level == PAGE_MODE_2_LEVEL)
- *page_size = IOMMU_PAGE_SIZE_2M;
- else
- return NULL; /* Wrongly set PSE bit in PTE */
-
- break;
- }
-
- level -= 1;
- }
-
- return pte;
-}
-
-static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- u64 *pte;
- unsigned long map_size;
- unsigned long mapped_size = 0;
- unsigned long o_iova = iova;
- size_t size = pgcount << __ffs(pgsize);
- int ret = 0;
- bool updated = false;
-
- if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount)
- return -EINVAL;
-
- if (!(prot & IOMMU_PROT_MASK))
- return -EINVAL;
-
- while (mapped_size < size) {
- map_size = get_alloc_page_size(pgsize);
- pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd,
- iova, map_size, gfp, &updated);
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
-
- *pte = set_pte_attr(paddr, map_size, prot);
-
- iova += map_size;
- paddr += map_size;
- mapped_size += map_size;
- }
-
-out:
- if (updated) {
- struct protection_domain *pdom = io_pgtable_ops_to_domain(ops);
- unsigned long flags;
-
- spin_lock_irqsave(&pdom->lock, flags);
- amd_iommu_domain_flush_pages(pdom, o_iova, size);
- spin_unlock_irqrestore(&pdom->lock, flags);
- }
-
- if (mapped)
- *mapped += mapped_size;
-
- return ret;
-}
-
-static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- unsigned long unmap_size;
- unsigned long unmapped = 0;
- size_t size = pgcount << __ffs(pgsize);
- u64 *pte;
-
- if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount))
- return 0;
-
- while (unmapped < size) {
- pte = fetch_pte(pgtable, iova, &unmap_size);
- if (!pte)
- return unmapped;
-
- *pte = 0ULL;
-
- iova = (iova & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- return unmapped;
-}
-
-static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long offset_mask, pte_pgsize;
- u64 *pte, __pte;
-
- pte = fetch_pte(pgtable, iova, &pte_pgsize);
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- offset_mask = pte_pgsize - 1;
- __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
- return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v2_free_pgtable(struct io_pgtable *iop)
-{
- struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
-
- if (!pgtable || !pgtable->pgd)
- return;
-
- /* Free page table */
- free_pgtable(pgtable->pgd, get_pgtable_level());
- pgtable->pgd = NULL;
-}
-
-static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
- int ias = IOMMU_IN_ADDR_BIT_SIZE;
-
- pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
- if (!pgtable->pgd)
- return NULL;
-
- if (get_pgtable_level() == PAGE_MODE_5_LEVEL)
- ias = 57;
-
- pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages;
- pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages;
- pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys;
-
- cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
- cfg->ias = ias;
- cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;
-
- return &pgtable->pgtbl;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = {
- .alloc = v2_alloc_pgtable,
- .free = v2_free_pgtable,
-};
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2e1865daa1ce..9f1d56a5e145 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -30,7 +30,6 @@
#include <linux/msi.h>
#include <linux/irqdomain.h>
#include <linux/percpu.h>
-#include <linux/io-pgtable.h>
#include <linux/cc_platform.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
@@ -41,9 +40,9 @@
#include <asm/gart.h>
#include <asm/dma.h>
#include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
#include "amd_iommu.h"
-#include "../dma-iommu.h"
#include "../irq_remapping.h"
#include "../iommu-pages.h"
@@ -60,7 +59,6 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
-static const struct iommu_dirty_ops amd_dirty_ops;
int amd_iommu_max_glx_val = -1;
@@ -70,15 +68,22 @@ int amd_iommu_max_glx_val = -1;
*/
DEFINE_IDA(pdom_ids);
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev);
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+ struct iommu_domain *old);
static void set_dte_entry(struct amd_iommu *iommu,
- struct iommu_dev_data *dev_data);
+ struct iommu_dev_data *dev_data,
+ phys_addr_t top_paddr, unsigned int top_level);
+
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level);
static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid);
static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid);
+static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain);
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
/****************************************************************************
*
@@ -1157,6 +1162,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
*
****************************************************************************/
+static void dump_command_buffer(struct amd_iommu *iommu)
+{
+ struct iommu_cmd *cmd;
+ u32 head, tail;
+ int i;
+
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head),
+ MMIO_CMD_BUFFER_TAIL(tail));
+
+ for (i = 0; i < CMD_BUFFER_ENTRIES; i++) {
+ cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd));
+ pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2],
+ cmd->data[3]);
+ }
+}
+
static int wait_on_sem(struct amd_iommu *iommu, u64 data)
{
int i = 0;
@@ -1167,7 +1191,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data)
}
if (i == LOOP_TIMEOUT) {
- pr_alert("Completion-Wait loop timed out\n");
+
+ pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n",
+ iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid),
+ PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid));
+
+ if (amd_iommu_dump)
+ DO_ONCE_LITE(dump_command_buffer, iommu);
+
return -EIO;
}
@@ -1756,42 +1787,6 @@ static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}
-/* Flush the not present cache if it exists */
-static void domain_flush_np_cache(struct protection_domain *domain,
- dma_addr_t iova, size_t size)
-{
- if (unlikely(amd_iommu_np_cache)) {
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
- amd_iommu_domain_flush_pages(domain, iova, size);
- spin_unlock_irqrestore(&domain->lock, flags);
- }
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- lockdep_assert_held(&domain->lock);
-
- list_for_each_entry(dev_data, &domain->dev_list, list) {
- struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
-
- set_dte_entry(iommu, dev_data);
- clone_aliases(iommu, dev_data->dev);
- }
-
- list_for_each_entry(dev_data, &domain->dev_list, list)
- device_flush_dte(dev_data);
-
- domain_flush_complete(domain);
-}
-
int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
{
struct iommu_dev_data *dev_data;
@@ -2051,7 +2046,8 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
}
static void set_dte_entry(struct amd_iommu *iommu,
- struct iommu_dev_data *dev_data)
+ struct iommu_dev_data *dev_data,
+ phys_addr_t top_paddr, unsigned int top_level)
{
u16 domid;
u32 old_domid;
@@ -2060,19 +2056,36 @@ static void set_dte_entry(struct amd_iommu *iommu,
struct protection_domain *domain = dev_data->domain;
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
+ struct pt_iommu_amdv1_hw_info pt_info;
+
+ make_clear_dte(dev_data, dte, &new);
if (gcr3_info && gcr3_info->gcr3_tbl)
domid = dev_data->gcr3_info.domid;
- else
+ else {
domid = domain->id;
- make_clear_dte(dev_data, dte, &new);
-
- if (domain->iop.mode != PAGE_MODE_NONE)
- new.data[0] |= iommu_virt_to_phys(domain->iop.root);
+ if (domain->domain.type & __IOMMU_DOMAIN_PAGING) {
+ /*
+ * When updating the IO pagetable, the new top and level
+ * are provided as parameters. For other operations i.e.
+ * device attach, retrieve the current pagetable info
+ * via the IOMMU PT API.
+ */
+ if (top_paddr) {
+ pt_info.host_pt_root = top_paddr;
+ pt_info.mode = top_level + 1;
+ } else {
+ WARN_ON(top_paddr || top_level);
+ pt_iommu_amdv1_hw_info(&domain->amdv1,
+ &pt_info);
+ }
- new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
+ new.data[0] |= __sme_set(pt_info.host_pt_root) |
+ (pt_info.mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ }
+ }
new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW;
@@ -2138,7 +2151,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
if (set)
- set_dte_entry(iommu, dev_data);
+ set_dte_entry(iommu, dev_data, 0, 0);
else
clear_dte_entry(iommu, dev_data);
@@ -2156,6 +2169,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
{
struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
int max_pasids = dev_data->max_pasids;
+ struct pt_iommu_x86_64_hw_info pt_info;
int ret = 0;
/*
@@ -2178,7 +2192,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
if (!pdom_is_v2_pgtbl_mode(pdom))
return ret;
- ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
+ pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info);
+ ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true);
if (ret)
free_gcr3_table(&dev_data->gcr3_info);
@@ -2500,94 +2515,240 @@ struct protection_domain *protection_domain_alloc(void)
return domain;
}
-static int pdom_setup_pgtable(struct protection_domain *domain,
- struct device *dev)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
+{
+ if (amd_iommu_hatdis)
+ return false;
+
+ return iommu && (iommu->features & FEATURE_HDSUP);
+}
+
+static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt)
{
- struct io_pgtable_ops *pgtbl_ops;
- enum io_pgtable_fmt fmt;
+ struct protection_domain *pdom =
+ container_of(iommupt, struct protection_domain, iommu);
- switch (domain->pd_mode) {
- case PD_MODE_V1:
- fmt = AMD_IOMMU_V1;
- break;
- case PD_MODE_V2:
- fmt = AMD_IOMMU_V2;
- break;
- case PD_MODE_NONE:
- WARN_ON_ONCE(1);
- return -EPERM;
+ return &pdom->lock;
+}
+
+/*
+ * Update all HW references to the domain with a new pgtable configuration.
+ */
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level)
+{
+ struct protection_domain *pdom =
+ container_of(iommu_table, struct protection_domain, iommu);
+ struct iommu_dev_data *dev_data;
+
+ lockdep_assert_held(&pdom->lock);
+
+ /* Update the DTE for all devices attached to this domain */
+ list_for_each_entry(dev_data, &pdom->dev_list, list) {
+ struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
+
+ /* Update the HW references with the new level and top ptr */
+ set_dte_entry(iommu, dev_data, top_paddr, top_level);
+ clone_aliases(iommu, dev_data->dev);
}
- domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev);
- pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain);
- if (!pgtbl_ops)
- return -ENOMEM;
+ list_for_each_entry(dev_data, &pdom->dev_list, list)
+ device_flush_dte(dev_data);
+
+ domain_flush_complete(pdom);
+}
+
+/*
+ * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to
+ * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non
+ * present caching (like hypervisor shadowing).
+ */
+static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
+ unsigned long iova, size_t size)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ if (likely(!amd_iommu_np_cache))
+ return 0;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ amd_iommu_domain_flush_pages(domain, iova, size);
+ spin_unlock_irqrestore(&domain->lock, flags);
return 0;
}
-static inline u64 dma_max_address(enum protection_domain_mode pgtable)
+static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
{
- if (pgtable == PD_MODE_V1)
- return PM_LEVEL_SIZE(amd_iommu_hpt_level);
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
- /*
- * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page
- * Translation" shows that the V2 table sign extends the top of the
- * address space creating a reserved region in the middle of the
- * translation, just like the CPU does. Further Vasant says the docs are
- * incomplete and this only applies to non-zero PASIDs. If the AMDv2
- * page table is assigned to the 0 PASID then there is no sign extension
- * check.
- *
- * Since the IOMMU must have a fixed geometry, and the core code does
- * not understand sign extended addressing, we have to chop off the high
- * bit to get consistent behavior with attachments of the domain to any
- * PASID.
- */
- return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1);
+ spin_lock_irqsave(&dom->lock, flags);
+ amd_iommu_domain_flush_all(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
}
-static bool amd_iommu_hd_support(struct amd_iommu *iommu)
+static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- if (amd_iommu_hatdis)
- return false;
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
- return iommu && (iommu->features & FEATURE_HDSUP);
+ spin_lock_irqsave(&dom->lock, flags);
+ amd_iommu_domain_flush_pages(dom, gather->start,
+ gather->end - gather->start + 1);
+ spin_unlock_irqrestore(&dom->lock, flags);
+ iommu_put_pages_list(&gather->freelist);
}
-static struct iommu_domain *
-do_iommu_domain_alloc(struct device *dev, u32 flags,
- enum protection_domain_mode pgtable)
+static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = {
+ .get_top_lock = amd_iommu_get_top_lock,
+ .change_top = amd_iommu_change_top,
+};
+
+static const struct iommu_domain_ops amdv1_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1),
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+ .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
+
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1),
+ .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+};
+
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev,
+ u32 flags)
{
- bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
+ struct pt_iommu_amdv1_cfg cfg = {};
struct protection_domain *domain;
int ret;
+ if (amd_iommu_hatdis)
+ return ERR_PTR(-EOPNOTSUPP);
+
domain = protection_domain_alloc();
if (!domain)
return ERR_PTR(-ENOMEM);
- domain->pd_mode = pgtable;
- ret = pdom_setup_pgtable(domain, dev);
+ domain->pd_mode = PD_MODE_V1;
+ domain->iommu.driver_ops = &amd_hw_driver_ops_v1;
+ domain->iommu.nid = dev_to_node(dev);
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ domain->domain.dirty_ops = &amdv1_dirty_ops;
+
+ /*
+ * Someday FORCE_COHERENCE should be set by
+ * amd_iommu_enforce_cache_coherency() like VT-d does.
+ */
+ cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+
+ /*
+ * AMD's IOMMU can flush as many pages as necessary in a single flush.
+ * Unless we run in a virtual machine, which can be inferred according
+ * to whether "non-present cache" is on, it is probably best to prefer
+ * (potentially) too extensive TLB flushing (i.e., more misses) over
+ * multiple TLB flushes (i.e., more flushes). For virtual machines the
+ * hypervisor needs to synchronize the host IOMMU PTEs with those of
+ * the guest, and the trade-off is different: unnecessary TLB flushes
+ * should be avoided.
+ */
+ if (amd_iommu_np_cache)
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+ else
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
+
+ cfg.common.hw_max_vasz_lg2 =
+ min(64, (amd_iommu_hpt_level - 1) * 9 + 21);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.starting_level = 2;
+ domain->domain.ops = &amdv1_ops;
+
+ ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL);
if (ret) {
- pdom_id_free(domain->id);
- kfree(domain);
+ amd_iommu_domain_free(&domain->domain);
return ERR_PTR(ret);
}
- domain->domain.geometry.aperture_start = 0;
- domain->domain.geometry.aperture_end = dma_max_address(pgtable);
- domain->domain.geometry.force_aperture = true;
- domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap;
+ /*
+ * Narrow the supported page sizes to those selected by the kernel
+ * command line.
+ */
+ domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap;
+ return &domain->domain;
+}
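A worked example for the hw_max_vasz_lg2 line above (editorial; it only restates the v1 geometry of 9 translation bits per level stacked on a 21-bit level-1 span):

/*
 * 3-level host page table: (3 - 1) * 9 + 21 = 39-bit IOVA space
 * 4-level host page table: (4 - 1) * 9 + 21 = 48-bit IOVA space
 * 6-level host page table: (6 - 1) * 9 + 21 = 66, clamped to 64 by min()
 */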
- domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
- domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+static const struct iommu_domain_ops amdv2_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+ /*
+ * Note the AMDv2 page table format does not support a Force Coherency
+ * bit, so enforce_cache_coherency should not be set. However VFIO is
+ * not prepared to handle a case where some domains will support
+ * enforcement and others do not. VFIO and iommufd will have to be fixed
+ * before it can fully use the V2 page table. See the comment in
+ * iommufd_hwpt_paging_alloc(). For now leave things as they have
+ * historically been and lie about enforce_cache_coherency.
+ */
+ .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
- if (dirty_tracking)
- domain->domain.dirty_ops = &amd_dirty_ops;
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev,
+ u32 flags)
+{
+ struct pt_iommu_x86_64_cfg cfg = {};
+ struct protection_domain *domain;
+ int ret;
+ if (!amd_iommu_v2_pgtbl_supported())
+ return ERR_PTR(-EOPNOTSUPP);
+
+ domain = protection_domain_alloc();
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ domain->pd_mode = PD_MODE_V2;
+ domain->iommu.nid = dev_to_node(dev);
+
+ cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES);
+ if (amd_iommu_np_cache)
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+ else
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
+
+ /*
+ * The v2 table behaves differently if it is attached to PASID 0 vs a
+ * non-zero PASID. On PASID 0 it has no sign extension and the full
+ * 57/48 bits decode the lower addresses. Otherwise it behaves like a
+ * normal sign extended x86 page table. Since we want the domain to work
+ * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not
+ * set which creates a table that is compatible in both modes.
+ */
+ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
+ cfg.common.hw_max_vasz_lg2 = 56;
+ cfg.top_level = 4;
+ } else {
+ cfg.common.hw_max_vasz_lg2 = 47;
+ cfg.top_level = 3;
+ }
+ cfg.common.hw_max_oasz_lg2 = 52;
+ domain->domain.ops = &amdv2_ops;
+
+ ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL);
+ if (ret) {
+ amd_iommu_domain_free(&domain->domain);
+ return ERR_PTR(ret);
+ }
return &domain->domain;
}
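The v2 limits follow the same pattern (editorial note): an x86 5-level table decodes 57 bits and a 4-level table 48 bits; removing the top (sign-extension) bit so the table stays usable on both PASID 0 and non-zero PASIDs leaves the 56 and 47 values configured above.

/*
 * 5-level guest table: 57 bits - 1 sign bit = 56 -> hw_max_vasz_lg2 = 56, top_level = 4
 * 4-level guest table: 48 bits - 1 sign bit = 47 -> hw_max_vasz_lg2 = 47, top_level = 3
 */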
@@ -2608,15 +2769,27 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
/* Allocate domain with v1 page table for dirty tracking */
if (!amd_iommu_hd_support(iommu))
break;
- return do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
+ return amd_iommu_domain_alloc_paging_v1(dev, flags);
case IOMMU_HWPT_ALLOC_PASID:
/* Allocate domain with v2 page table if IOMMU supports PASID. */
if (!amd_iommu_pasid_supported())
break;
- return do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
- case 0:
+ return amd_iommu_domain_alloc_paging_v2(dev, flags);
+ case 0: {
+ struct iommu_domain *ret;
+
/* If nothing specific is required use the kernel commandline default */
- return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
+ if (amd_iommu_pgtable == PD_MODE_V1) {
+ ret = amd_iommu_domain_alloc_paging_v1(dev, flags);
+ if (ret != ERR_PTR(-EOPNOTSUPP))
+ return ret;
+ return amd_iommu_domain_alloc_paging_v2(dev, flags);
+ }
+ ret = amd_iommu_domain_alloc_paging_v2(dev, flags);
+ if (ret != ERR_PTR(-EOPNOTSUPP))
+ return ret;
+ return amd_iommu_domain_alloc_paging_v1(dev, flags);
+ }
default:
break;
}
@@ -2628,14 +2801,14 @@ void amd_iommu_domain_free(struct iommu_domain *dom)
struct protection_domain *domain = to_pdomain(dom);
WARN_ON(!list_empty(&domain->dev_list));
- if (domain->domain.type & __IOMMU_DOMAIN_PAGING)
- free_io_pgtable_ops(&domain->iop.pgtbl.ops);
+ pt_iommu_deinit(&domain->iommu);
pdom_id_free(domain->id);
kfree(domain);
}
static int blocked_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
@@ -2685,16 +2858,8 @@ void amd_iommu_init_identity_domain(void)
protection_domain_init(&identity_domain);
}
-/* Same as blocked domain except it supports only ops->attach_dev() */
-static struct iommu_domain release_domain = {
- .type = IOMMU_DOMAIN_BLOCKED,
- .ops = &(const struct iommu_domain_ops) {
- .attach_dev = blocked_domain_attach_device,
- }
-};
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+ struct iommu_domain *old)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
struct protection_domain *domain = to_pdomain(dom);
@@ -2734,93 +2899,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
return ret;
}
-static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
- unsigned long iova, size_t size)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-
- if (ops->map_pages)
- domain_flush_np_cache(domain, iova, size);
- return 0;
-}
-
-static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int iommu_prot, gfp_t gfp, size_t *mapped)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
- int prot = 0;
- int ret = -EINVAL;
-
- if ((domain->pd_mode == PD_MODE_V1) &&
- (domain->iop.mode == PAGE_MODE_NONE))
- return -EINVAL;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- if (ops->map_pages) {
- ret = ops->map_pages(ops, iova, paddr, pgsize,
- pgcount, prot, gfp, mapped);
- }
-
- return ret;
-}
-
-static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size)
-{
- /*
- * AMD's IOMMU can flush as many pages as necessary in a single flush.
- * Unless we run in a virtual machine, which can be inferred according
- * to whether "non-present cache" is on, it is probably best to prefer
- * (potentially) too extensive TLB flushing (i.e., more misses) over
- * mutliple TLB flushes (i.e., more flushes). For virtual machines the
- * hypervisor needs to synchronize the host IOMMU PTEs with those of
- * the guest, and the trade-off is different: unnecessary TLB flushes
- * should be avoided.
- */
- if (amd_iommu_np_cache &&
- iommu_iotlb_gather_is_disjoint(gather, iova, size))
- iommu_iotlb_sync(domain, gather);
-
- iommu_iotlb_gather_add_range(gather, iova, size);
-}
-
-static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
- size_t r;
-
- if ((domain->pd_mode == PD_MODE_V1) &&
- (domain->iop.mode == PAGE_MODE_NONE))
- return 0;
-
- r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
-
- if (r)
- amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
-
- return r;
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- dma_addr_t iova)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-
- return ops->iova_to_phys(ops, iova);
-}
-
static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
{
switch (cap) {
@@ -2887,28 +2965,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
return 0;
}
-static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct protection_domain *pdomain = to_pdomain(domain);
- struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops;
- unsigned long lflags;
-
- if (!ops || !ops->read_and_clear_dirty)
- return -EOPNOTSUPP;
-
- spin_lock_irqsave(&pdomain->lock, lflags);
- if (!pdomain->dirty_tracking && dirty->bitmap) {
- spin_unlock_irqrestore(&pdomain->lock, lflags);
- return -EINVAL;
- }
- spin_unlock_irqrestore(&pdomain->lock, lflags);
-
- return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
-}
-
static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head)
{
@@ -2978,28 +3034,6 @@ static bool amd_iommu_is_attach_deferred(struct device *dev)
return dev_data->defer_attach;
}
-static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
-{
- struct protection_domain *dom = to_pdomain(domain);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_all(dom);
- spin_unlock_irqrestore(&dom->lock, flags);
-}
-
-static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather)
-{
- struct protection_domain *dom = to_pdomain(domain);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_pages(dom, gather->start,
- gather->end - gather->start + 1);
- spin_unlock_irqrestore(&dom->lock, flags);
-}
-
static int amd_iommu_def_domain_type(struct device *dev)
{
struct iommu_dev_data *dev_data;
@@ -3034,15 +3068,10 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true;
}
-static const struct iommu_dirty_ops amd_dirty_ops = {
- .set_dirty_tracking = amd_iommu_set_dirty_tracking,
- .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
-};
-
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.blocked_domain = &blocked_domain,
- .release_domain = &release_domain,
+ .release_domain = &blocked_domain,
.identity_domain = &identity_domain.domain,
.domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags,
.domain_alloc_sva = amd_iommu_domain_alloc_sva,
@@ -3053,17 +3082,6 @@ const struct iommu_ops amd_iommu_ops = {
.is_attach_deferred = amd_iommu_is_attach_deferred,
.def_domain_type = amd_iommu_def_domain_type,
.page_response = amd_iommu_page_response,
- .default_domain_ops = &(const struct iommu_domain_ops) {
- .attach_dev = amd_iommu_attach_device,
- .map_pages = amd_iommu_map_pages,
- .unmap_pages = amd_iommu_unmap_pages,
- .iotlb_sync_map = amd_iommu_iotlb_sync_map,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .flush_iotlb_all = amd_iommu_flush_iotlb_all,
- .iotlb_sync = amd_iommu_iotlb_sync,
- .free = amd_iommu_domain_free,
- .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
- }
};
#ifdef CONFIG_IRQ_REMAP
@@ -3354,7 +3372,7 @@ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
struct irte_ga *irte)
{
- bool ret;
+ int ret;
ret = __modify_irte_ga(iommu, devid, index, irte);
if (ret)
@@ -4072,3 +4090,5 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
return 0;
}
#endif
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
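Editorial note on the namespace import just added: the matching export on the library side is not part of this hunk, so the single line below is an assumed sketch of what the generic_pt code presumably does for its entry points. pt_iommu_x86_64_init() is the symbol this file actually calls; whether it is exported exactly like this is an assumption.

EXPORT_SYMBOL_NS_GPL(pt_iommu_x86_64_init, "GENERIC_PT_IOMMU");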
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index 95a4e62b8f63..83a5aabcd15d 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -672,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain,
}
static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret, i;
struct apple_dart_stream_map *stream_map;
@@ -693,7 +694,8 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
}
static int apple_dart_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
struct apple_dart_stream_map *stream_map;
@@ -717,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = {
};
static int apple_dart_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
struct apple_dart_stream_map *stream_map;
@@ -802,6 +805,8 @@ static int apple_dart_of_xlate(struct device *dev,
struct apple_dart *cfg_dart;
int i, sid;
+ put_device(&iommu_pdev->dev);
+
if (args->args_count != 1)
return -EINVAL;
sid = args->args[0];
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 8cd8929bbfdf..93fdadd07431 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -99,6 +99,8 @@ static void arm_smmu_make_nested_domain_ste(
int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
struct arm_smmu_nested_domain *nested_domain)
{
+ unsigned int cfg =
+ FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
struct arm_smmu_vmaster *vmaster;
unsigned long vsid;
int ret;
@@ -107,8 +109,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core,
state->master->dev, &vsid);
- if (ret)
+ /*
+	 * Attaching to a translating nested domain requires a vDEVICE to be
+	 * allocated first, as CD/ATS invalidations and vevents need a vSID to
+	 * work properly. An abort/bypass domain is allowed to attach without a
+	 * vmaster for the GBPA case.
+ */
+ if (ret) {
+ if (cfg == STRTAB_STE_0_CFG_ABORT ||
+ cfg == STRTAB_STE_0_CFG_BYPASS)
+ return 0;
return ret;
+ }
vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL);
if (!vmaster)
@@ -138,14 +149,15 @@ void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master)
}
static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_nested_domain *nested_domain =
to_smmu_nested_domain(domain);
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
struct arm_smmu_attach_state state = {
.master = master,
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
struct arm_smmu_ste ste;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 2a8b46b948f0..d16d35c78c06 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1464,7 +1464,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master)
cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size,
&cd_table->cdtab_dma,
GFP_KERNEL);
- if (!cd_table->l2.l2ptrs) {
+ if (!cd_table->l2.l1tab) {
ret = -ENOMEM;
goto err_free_l2ptrs;
}
@@ -3002,7 +3002,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
master->ats_enabled = state->ats_enabled;
}
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old_domain)
{
int ret = 0;
struct arm_smmu_ste target;
@@ -3010,7 +3011,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
struct arm_smmu_device *smmu;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_attach_state state = {
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
struct arm_smmu_master *master;
@@ -3186,7 +3187,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
/*
* When the last user of the CD table goes away downgrade the STE back
- * to a non-cd_table one.
+ * to a non-cd_table one, by re-attaching its sid_domain.
*/
if (!arm_smmu_ssids_in_use(&master->cd_table)) {
struct iommu_domain *sid_domain =
@@ -3194,12 +3195,14 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
if (sid_domain->type == IOMMU_DOMAIN_IDENTITY ||
sid_domain->type == IOMMU_DOMAIN_BLOCKED)
- sid_domain->ops->attach_dev(sid_domain, dev);
+ sid_domain->ops->attach_dev(sid_domain, dev,
+ sid_domain);
}
return 0;
}
static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
+ struct iommu_domain *old_domain,
struct device *dev,
struct arm_smmu_ste *ste,
unsigned int s1dss)
@@ -3207,7 +3210,7 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
struct arm_smmu_attach_state state = {
.master = master,
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
@@ -3248,14 +3251,16 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
}
static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_ste ste;
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
arm_smmu_master_clear_vmaster(master);
arm_smmu_make_bypass_ste(master->smmu, &ste);
- arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS);
+ arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
+ STRTAB_STE_1_S1DSS_BYPASS);
return 0;
}
@@ -3269,14 +3274,15 @@ static struct iommu_domain arm_smmu_identity_domain = {
};
static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_ste ste;
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
arm_smmu_master_clear_vmaster(master);
arm_smmu_make_abort_ste(&ste);
- arm_smmu_attach_dev_ste(domain, dev, &ste,
+ arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
STRTAB_STE_1_S1DSS_TERMINATE);
return 0;
}
@@ -3582,12 +3588,6 @@ static void arm_smmu_release_device(struct device *dev)
WARN_ON(master->iopf_refcount);
- /* Put the STE back to what arm_smmu_init_strtab() sets */
- if (dev->iommu->require_direct)
- arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev);
- else
- arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev);
-
arm_smmu_disable_pasid(master);
arm_smmu_remove_master(master);
if (arm_smmu_cdtab_allocated(&master->cd_table))
@@ -3678,6 +3678,7 @@ static int arm_smmu_def_domain_type(struct device *dev)
static const struct iommu_ops arm_smmu_ops = {
.identity_domain = &arm_smmu_identity_domain,
.blocked_domain = &arm_smmu_blocked_domain,
+ .release_domain = &arm_smmu_blocked_domain,
.capable = arm_smmu_capable,
.hw_info = arm_smmu_hw_info,
.domain_alloc_sva = arm_smmu_sva_domain_alloc,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 57c097e87613..573085349df3 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -367,6 +367,7 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
{ .compatible = "qcom,adreno" },
{ .compatible = "qcom,adreno-gmu" },
+ { .compatible = "qcom,glymur-mdss" },
{ .compatible = "qcom,mdp4" },
{ .compatible = "qcom,mdss" },
{ .compatible = "qcom,qcm2290-mdss" },
@@ -431,17 +432,19 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
/*
* Some platforms support more than the Arm SMMU architected maximum of
- * 128 stream matching groups. For unknown reasons, the additional
- * groups don't exhibit the same behavior as the architected registers,
- * so limit the groups to 128 until the behavior is fixed for the other
- * groups.
+ * 128 stream matching groups. The additional registers appear to have
+ * the same behavior as the architected registers in the hardware.
+ * However, on some firmware versions, the hypervisor does not
+ * correctly trap and emulate accesses to the additional registers,
+ * resulting in unexpected behavior.
+ *
+ * If there are more than 128 groups, use the last reliable group to
+ * detect if we need to apply the bypass quirk.
*/
- if (smmu->num_mapping_groups > 128) {
- dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
- smmu->num_mapping_groups = 128;
- }
-
- last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
+ if (smmu->num_mapping_groups > 128)
+ last_s2cr = ARM_SMMU_GR0_S2CR(127);
+ else
+ last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
/*
* With some firmware versions writes to S2CR of type FAULT are
@@ -464,6 +467,11 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS);
arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg);
+
+ if (smmu->num_mapping_groups > 128) {
+ dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
+ smmu->num_mapping_groups = 128;
+ }
}
for (i = 0; i < smmu->num_mapping_groups; i++) {
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4ced4b5bee4d..5e690cf85ec9 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1165,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg,
}
}
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
@@ -1234,7 +1235,8 @@ static int arm_smmu_attach_dev_type(struct device *dev,
}
static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS);
}
@@ -1249,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = {
};
static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT);
}
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index c5be95e56031..f69d9276dc55 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -359,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
kfree(qcom_domain);
}
-static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int qcom_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev, struct iommu_domain *old)
{
struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
@@ -388,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
}
static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct qcom_iommu_domain *qcom_domain;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
unsigned int i;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- qcom_domain = to_qcom_iommu_domain(domain);
+ qcom_domain = to_qcom_iommu_domain(old);
if (WARN_ON(!qcom_domain->iommu))
return -EINVAL;
@@ -565,14 +566,14 @@ static int qcom_iommu_of_xlate(struct device *dev,
qcom_iommu = platform_get_drvdata(iommu_pdev);
+ put_device(&iommu_pdev->dev);
+
/* make sure the asid specified in dt is valid, so we don't have
* to sanity check this elsewhere:
*/
if (WARN_ON(asid > qcom_iommu->max_asid) ||
- WARN_ON(qcom_iommu->ctxs[asid] == NULL)) {
- put_device(&iommu_pdev->dev);
+ WARN_ON(qcom_iommu->ctxs[asid] == NULL))
return -EINVAL;
- }
if (!dev_iommu_priv_get(dev)) {
dev_iommu_priv_set(dev, qcom_iommu);
@@ -581,10 +582,8 @@ static int qcom_iommu_of_xlate(struct device *dev,
* multiple different iommu devices. Multiple context
* banks are ok, but multiple devices are not:
*/
- if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) {
- put_device(&iommu_pdev->dev);
+ if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev)))
return -EINVAL;
- }
}
return iommu_fwspec_add_ids(dev, &asid, 1);
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7944a3af4545..c92088855450 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
* as a bus address, __finalise_sg() will copy the dma
* address into the output segment.
*/
- s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
- sg_phys(s));
+ s->dma_address = pci_p2pdma_bus_addr_map(
+ p2pdma_state.mem, sg_phys(s));
sg_dma_len(s) = sg->length;
sg_dma_mark_bus_address(s);
continue;
@@ -2008,7 +2008,7 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev,
end - addr, iovad->granule - iova_start_pad);
if (!dev_is_dma_coherent(dev) &&
- !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
arch_sync_dma_for_cpu(phys, len, dir);
swiotlb_tbl_unmap_single(dev, phys, len, dir, attrs);
@@ -2032,7 +2032,8 @@ static void __iommu_dma_iova_unlink(struct device *dev,
size_t unmapped;
if ((state->__size & DMA_IOVA_USE_SWIOTLB) ||
- (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)))
+ (!dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))))
iommu_dma_iova_unlink_range_slow(dev, addr, size, dir, attrs);
iommu_iotlb_gather_init(&iotlb_gather);
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index b6edd178fe25..b512c6b939ac 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -984,7 +984,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
}
static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
struct exynos_iommu_domain *domain;
@@ -1035,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = {
};
static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
@@ -1044,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
unsigned long flags;
int err;
- err = exynos_iommu_identity_attach(&exynos_identity_domain, dev);
+ err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old);
if (err)
return err;
@@ -1429,8 +1431,6 @@ static void exynos_iommu_release_device(struct device *dev)
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
struct sysmmu_drvdata *data;
- WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev));
-
list_for_each_entry(data, &owner->controllers, owner_node)
device_link_del(data->link);
}
@@ -1446,17 +1446,14 @@ static int exynos_iommu_of_xlate(struct device *dev,
return -ENODEV;
data = platform_get_drvdata(sysmmu);
- if (!data) {
- put_device(&sysmmu->dev);
+ put_device(&sysmmu->dev);
+ if (!data)
return -ENODEV;
- }
if (!owner) {
owner = kzalloc(sizeof(*owner), GFP_KERNEL);
- if (!owner) {
- put_device(&sysmmu->dev);
+ if (!owner)
return -ENOMEM;
- }
INIT_LIST_HEAD(&owner->controllers);
mutex_init(&owner->rpm_lock);
@@ -1476,6 +1473,7 @@ static int exynos_iommu_of_xlate(struct device *dev,
static const struct iommu_ops exynos_iommu_ops = {
.identity_domain = &exynos_identity_domain,
+ .release_domain = &exynos_identity_domain,
.domain_alloc_paging = exynos_iommu_domain_alloc_paging,
.device_group = generic_device_group,
.probe_device = exynos_iommu_probe_device,
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 5f08523f97cb..9664ef9840d2 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
}
static int fsl_pamu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
unsigned long flags;
@@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain,
* switches to what looks like BLOCKING.
*/
static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct fsl_dma_domain *dma_domain;
const u32 *prop;
int len;
@@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
* Hack to keep things working as they always have, only leaving an
* UNMANAGED domain makes it BLOCKING.
*/
- if (domain == platform_domain || !domain ||
- domain->type != IOMMU_DOMAIN_UNMANAGED)
+ if (old == platform_domain || !old ||
+ old->type != IOMMU_DOMAIN_UNMANAGED)
return 0;
- dma_domain = to_fsl_dma_domain(domain);
+ dma_domain = to_fsl_dma_domain(old);
/*
* Use LIODN of the PCI controller while detaching a
diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
new file mode 100644
index 000000000000..52ac9e661ffd
--- /dev/null
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -0,0 +1,14 @@
+CONFIG_KUNIT=y
+CONFIG_GENERIC_PT=y
+CONFIG_DEBUG_GENERIC_PT=y
+CONFIG_IOMMU_PT=y
+CONFIG_IOMMU_PT_AMDV1=y
+CONFIG_IOMMU_PT_VTDSS=y
+CONFIG_IOMMU_PT_X86_64=y
+CONFIG_IOMMU_PT_KUNIT_TEST=y
+
+CONFIG_IOMMUFD=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_FAULT_INJECTION=y
+CONFIG_RUNTIME_TESTING_MENU=y
+CONFIG_IOMMUFD_TEST=y
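For reference, this config is consumed by the standard KUnit tooling, e.g. ./tools/testing/kunit/kunit.py run --kunitconfig=drivers/iommu/generic_pt/.kunitconfig (the usual invocation, not something added by this patch).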
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
new file mode 100644
index 000000000000..ce4fb4786914
--- /dev/null
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig GENERIC_PT
+ bool "Generic Radix Page Table" if COMPILE_TEST
+ help
+ Generic library for building radix tree page tables.
+
+ Generic PT provides a set of HW page table formats and a common
+ set of APIs to work with them.
+
+if GENERIC_PT
+config DEBUG_GENERIC_PT
+ bool "Extra debugging checks for GENERIC_PT"
+ help
+ Enable extra run time debugging checks for GENERIC_PT code. This
+ incurs a runtime cost and should not be enabled for production
+ kernels.
+
+ The kunit tests require this to be enabled to get full coverage.
+
+config IOMMU_PT
+ tristate "IOMMU Page Tables"
+ select IOMMU_API
+ depends on IOMMU_SUPPORT
+ depends on GENERIC_PT
+ help
+ Generic library for building IOMMU page tables
+
+ IOMMU_PT provides an implementation of the page table operations
+ related to struct iommu_domain using GENERIC_PT. It provides a single
+ implementation of the page table operations that can be shared by
+ multiple drivers.
+
+if IOMMU_PT
+config IOMMU_PT_AMDV1
+ tristate "IOMMU page table for 64-bit AMD IOMMU v1"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the AMD v1 page table. AMDv1 is the
+ "host" page table. It supports granular page sizes of almost every
+ power of 2 and decodes the full 64-bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_VTDSS
+ tristate "IOMMU page table for Intel VT-d Second Stage"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+	  iommu_domain implementation for Intel VT-d's 64-bit 3/4/5
+	  level Second Stage page table. It is similar to the X86_64 format with
+ 4K/2M/1G page sizes.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_X86_64
+ tristate "IOMMU page table for x86 64-bit, 4/5 levels"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the x86 64-bit 4/5 level page table.
+ It supports 4K/2M/1G page sizes and can decode a sign-extended
+ portion of the 64-bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_KUNIT_TEST
+ tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
+ depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
+ depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS
+ default KUNIT_ALL_TESTS
+ help
+	  Enable kunit tests for GENERIC_PT and IOMMU_PT that cover all the
+	  enabled page table formats. The tests cover most of the GENERIC_PT
+	  functions provided by the page table format, as well as the
+	  iommu_domain related functions.
+
+endif
+endif
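A note on the IOMMU_PT_KUNIT_TEST dependencies above: the "depends on FOO || !FOO" lines are the usual Kconfig idiom that limits the test to =m whenever any covered format is built as a module, so the kunit module can always link against every format it exercises.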
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
new file mode 100644
index 000000000000..976b49ec97dc
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
+iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
+
+IOMMU_PT_KUNIT_TEST :=
+define create_format
+obj-$(2) += iommu_$(1).o
+iommu_pt_kunit_test-y += kunit_iommu_$(1).o
+CFLAGS_kunit_iommu_$(1).o += -DGENERIC_PT_KUNIT=1
+IOMMU_PT_KUNIT_TEST := iommu_pt_kunit_test.o
+
+endef
+
+$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y)))
+$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m)))
+
+# The kunit objects are constructed by compiling the main source
+# with -DGENERIC_PT_KUNIT
+$(obj)/kunit_iommu_%.o: $(src)/iommu_%.c FORCE
+ $(call rule_mkdir)
+ $(call if_changed_dep,cc_o_c)
+
+obj-$(CONFIG_IOMMU_PT_KUNIT_TEST) += $(IOMMU_PT_KUNIT_TEST)
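To make the generated targets concrete (a worked reading of the create_format macro above, not additional build rules): with CONFIG_IOMMU_PT_AMDV1=y the expansion adds obj-y += iommu_amdv1.o, adds kunit_iommu_amdv1.o to iommu_pt_kunit_test-y, and builds that kunit object from the same iommu_amdv1.c via the pattern rule with -DGENERIC_PT_KUNIT=1, so each format's tests are compiled only when that format is enabled.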
diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h
new file mode 100644
index 000000000000..aa8e1a8ec95f
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/amdv1.h
@@ -0,0 +1,411 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * AMD IOMMU v1 page table
+ *
+ * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
+ * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
+ *
+ * Note the level numbering here matches the core code, so level 0 is the same
+ * as mode 1.
+ *
+ */
+#ifndef __GENERIC_PT_FMT_AMDV1_H
+#define __GENERIC_PT_FMT_AMDV1_H
+
+#include "defs_amdv1.h"
+#include "../pt_defs.h"
+
+#include <asm/page.h>
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/mem_encrypt.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+
+enum {
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ /*
+	 * The IOMMUFD selftest uses the AMDv1 format with some alterations. It
+ * uses a 2k page size to test cases where the CPU page size is not the
+ * same.
+ */
+#ifdef AMDV1_IOMMUFD_SELFTEST
+ PT_MAX_VA_ADDRESS_LG2 = 56,
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 11,
+#else
+ PT_MAX_VA_ADDRESS_LG2 = 64,
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_TOP_LEVEL = 5,
+ PT_GRANULE_LG2SZ = 12,
+#endif
+ PT_TABLEMEM_LG2SZ = 12,
+
+	/* The DTE only has these bits for the top physical address */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* PTE bits */
+enum {
+ AMDV1PT_FMT_PR = BIT(0),
+ AMDV1PT_FMT_D = BIT(6),
+ AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
+ AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
+ AMDV1PT_FMT_FC = BIT_ULL(60),
+ AMDV1PT_FMT_IR = BIT_ULL(61),
+ AMDV1PT_FMT_IW = BIT_ULL(62),
+};
+
+/*
+ * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, so
+ * make these defines to avoid it.
+ */
+#define AMDV1PT_FMT_NL_DEFAULT 0
+#define AMDV1PT_FMT_NL_SIZE 7
+
+static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
+}
+#define pt_table_pa amdv1pt_table_pa
+
+/* Returns the oa for the start of the contiguous entry */
+static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+ pt_oaddr_t oa;
+
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ oa = FIELD_GET(AMDV1PT_FMT_OA, entry);
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
+ unsigned int sz_bits = oaffz(oa);
+
+ oa = oalog2_set_mod(oa, 0, sz_bits);
+ } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
+ AMDV1PT_FMT_NL_DEFAULT))
+ return 0;
+ return oalog2_mul(oa, PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa amdv1pt_entry_oa
+
+static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
+{
+ /*
+ * Table 15: Page Table Level Parameters
+ * The top most level cannot have translation entries
+	 * The topmost level cannot have translation entries
+ return pts->level < PT_MAX_TOP_LEVEL;
+}
+#define pt_can_have_leaf amdv1pt_can_have_leaf
+
+/* Body in pt_fmt_defaults.h */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+static inline unsigned int
+amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ u32 code;
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
+ AMDV1PT_FMT_NL_DEFAULT)
+ return ilog2(1);
+
+ PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
+ AMDV1PT_FMT_NL_SIZE);
+
+ /*
+ * The contiguous size is encoded in the length of a string of 1's in
+ * the low bits of the OA. Reverse the equation:
+ * code = log2_to_int(num_contig_lg2 + item_lg2sz -
+ * PT_GRANULE_LG2SZ - 1) - 1
+ * Which can be expressed as:
+ * num_contig_lg2 = oalog2_ffz(code) + 1 -
+	 *                  item_lg2sz + PT_GRANULE_LG2SZ
+ *
+ * Assume the bit layout is correct and remove the masking. Reorganize
+ * the equation to move all the arithmetic before the ffz.
+ */
+ code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
+ pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
+ return ffz_t(u32, code);
+}
+#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2
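To make the size encoding concrete, here is a worked example derived from the install/decode formulas in this file (my arithmetic, not a quote from the AMD spec):

/*
 * A 2 MiB leaf assembled from 512 contiguous 4 KiB items at level 0:
 *   oasz_lg2 = 21, item_lg2sz = 12, num_contig_lg2 = 9
 *   size code = (1 << (21 - 12 - 1)) - 1 = 0xff, so PTE bits 19:12 are ones
 *   decode: ffz(0xff) = 8  ->  num_contig_lg2 = 8 + 1 - 12 + 12 = 9
 */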
+
+static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
+{
+ /*
+ * Top entry covers bits [63:57] only, this is handled through
+	 * Top entry covers bits [63:57] only; this is handled through
+ */
+ if (PT_WARN_ON(pts->level == 5))
+ return 7;
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 amdv1pt_num_items_lg2
+
+static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!amdv1pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * Table 14: Example Page Size Encodings
+ * Address bits 51:32 can be used to encode page sizes greater than 4
+ * Gbytes. Address bits 63:52 are zero-extended.
+ *
+ * 512GB Pages are not supported due to a hardware bug.
+ * Otherwise every power of two size is supported.
+ */
+ return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
+ isz_lg2) & ~SZ_512G;
+}
+#define pt_possible_sizes amdv1pt_possible_sizes
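As a worked example of the mask above (my arithmetic, not from the spec): at level 0, isz_lg2 = 12 and num_items_lg2 = 9, so the result is GENMASK_ULL(20, 12) – every power-of-two size from 4 KiB up to 1 MiB, expressed as a contiguous run inside one table. At level 3, isz_lg2 = 39 and the ~SZ_512G term clears bit 39, which is exactly the 512 GiB size the comment says the hardware cannot use.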
+
+static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ unsigned int next_level;
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(*tablep);
+ if (!(entry & AMDV1PT_FMT_PR))
+ return PT_ENTRY_EMPTY;
+
+ next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
+ if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
+ next_level == AMDV1PT_FMT_NL_SIZE)
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw amdv1pt_load_entry_raw
+
+static inline void
+amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+
+ if (oasz_lg2 == isz_lg2) {
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_DEFAULT);
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_SIZE) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
+ 1) -
+ 1);
+
+ /* See amdv1pt_clear_entries() */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ memset64(tablep, entry, log2_to_int(num_contig_lg2));
+ }
+ }
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry amdv1pt_install_leaf_entry
+
+static inline bool amdv1pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ /*
+ * IR and IW are ANDed from the table levels along with the PTE. We
+ * always control permissions from the PTE, so always set IR and IW for
+ * tables.
+ */
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ log2_div(table_pa, PT_GRANULE_LG2SZ)) |
+ AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table amdv1pt_install_table
+
+static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits =
+ pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
+}
+#define pt_attr_from_entry amdv1pt_attr_from_entry
+
+static inline void amdv1pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ /*
+ * gcc generates rep stos for the io-pgtable code, and this difference
+ * can show in microbenchmarks with larger contiguous page sizes.
+ * rep is slower for small cases.
+ */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+ } else {
+ memset64(tablep, 0, log2_to_int(num_contig_lg2));
+ }
+}
+#define pt_clear_entries amdv1pt_clear_entries
+
+static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
+ return true;
+ return false;
+}
+#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty
+
+static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
+}
+#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean
+
+static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | AMDV1PT_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_amdv1
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
+ ->amdpt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
+}
+
+static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
+ pte |= AMDV1PT_FMT_FC;
+ if (iommu_prot & IOMMU_READ)
+ pte |= AMDV1PT_FMT_IR;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= AMDV1PT_FMT_IW;
+
+ /*
+ * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+ * control this. For now if the tables use sme_set then so do the ptes.
+ */
+ if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot amdv1pt_iommu_set_prot
+
+static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
+ const struct pt_iommu_amdv1_cfg *cfg)
+{
+ struct pt_amdv1 *table = &iommu_table->amdpt;
+ unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+
+ if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
+ return -EINVAL;
+
+ if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
+ cfg->starting_level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
+ (cfg->starting_level + 1);
+
+ table->common.max_vasz_lg2 =
+ min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ pt_top_set_level(&table->common, cfg->starting_level);
+ return 0;
+}
+#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
+
+#ifndef PT_FMT_VARIANT
+static inline void
+amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_amdv1_hw_info *info)
+{
+ info->host_pt_root = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
+ info->mode = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
+#endif
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
+ /* Matches what io_pgtable does */
+ [0] = { .starting_level = 2 },
+};
+#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = 0 };
+#endif
+
+#endif
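As a usage illustration (a hedged sketch, not code from this patch): the cfg fields below are the ones amdv1pt_iommu_fmt_init() reads above, while pt_iommu_amdv1_init() is assumed to exist by analogy with the pt_iommu_x86_64_init() call used for the v2 table earlier in this change, and example_init_amdv1() is a hypothetical helper.

/* Hedged sketch: pt_iommu_amdv1_init() is assumed, mirroring
 * pt_iommu_x86_64_init(); example_init_amdv1() is hypothetical.
 */
static int example_init_amdv1(struct pt_iommu_amdv1 *table, gfp_t gfp)
{
	struct pt_iommu_amdv1_cfg cfg = {};

	cfg.common.hw_max_vasz_lg2 = 64;	/* AMDv1 decodes the full 64-bit IOVA */
	cfg.common.hw_max_oasz_lg2 = 52;
	cfg.starting_level = 2;			/* 3-level starting table, as the kunit cfg uses */
	return pt_iommu_amdv1_init(table, &cfg, gfp);
}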
diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
new file mode 100644
index 000000000000..0b9614ca6d10
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H
+#define __GENERIC_PT_FMT_DEFS_AMDV1_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct amdv1pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs amdv1pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
new file mode 100644
index 000000000000..4a239bcaae2a
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
+#define __GENERIC_PT_FMT_DEFS_VTDSS_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct vtdss_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs vtdss_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
new file mode 100644
index 000000000000..6f589e1f55d3
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H
+#define __GENERIC_PT_FMT_DEFS_X86_64_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct x86_64_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs x86_64_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
new file mode 100644
index 000000000000..72a2337d0c55
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT amdv1
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \
+ BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+#define PT_FORCE_ENABLED_FEATURES \
+ (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c
new file mode 100644
index 000000000000..74e597cba9d9
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define AMDV1_IOMMUFD_SELFTEST 1
+#define PT_FMT amdv1
+#define PT_FMT_VARIANT mock
+#define PT_SUPPORTED_FEATURES 0
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
new file mode 100644
index 000000000000..d28e86abdf2e
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Template to build the iommu module and kunit from the format and
+ * implementation headers.
+ *
+ * The format should have:
+ * #define PT_FMT <name>
+ * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
+ * And optionally:
+ * #define PT_FORCE_ENABLED_FEATURES ..
+ * #define PT_FMT_VARIANT <suffix>
+ */
+#include <linux/args.h>
+#include <linux/stringify.h>
+
+#ifdef PT_FMT_VARIANT
+#define PTPFX_RAW \
+ CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
+#else
+#define PTPFX_RAW PT_FMT
+#endif
+
+#define PTPFX CONCATENATE(PTPFX_RAW, _)
+
+#define _PT_FMT_H PT_FMT.h
+#define PT_FMT_H __stringify(_PT_FMT_H)
+
+#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
+#define PT_DEFS_H __stringify(_PT_DEFS_H)
+
+#include <linux/generic_pt/common.h>
+#include PT_DEFS_H
+#include "../pt_defs.h"
+#include PT_FMT_H
+#include "../pt_common.h"
+
+#ifndef GENERIC_PT_KUNIT
+#include "../iommu_pt.h"
+#else
+/*
+ * The makefile will compile the .c file twice, once with GENERIC_PT_KUNIT set
+ * which means we are building the kunit module.
+ */
+#include "../kunit_generic_pt.h"
+#include "../kunit_iommu_pt.h"
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
new file mode 100644
index 000000000000..f551711e2a33
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT vtdss
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \
+ BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
new file mode 100644
index 000000000000..5472660c2d71
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT x86_64
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \
+ BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
new file mode 100644
index 000000000000..f5f8981edde7
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Intel VT-d Second Stage 5/4 level page table
+ *
+ * This is described in
+ * Section "3.7 Second-Stage Translation"
+ * Section "9.8 Second-Stage Paging Entries"
+ *
+ * Of the "Intel Virtualization Technology for Directed I/O Architecture
+ * Specification".
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/SS-PTE - 0
+ * Directory/SS-PDE - 1
+ * Directory Ptr/SS-PDPTE - 2
+ * PML4/SS-PML4E - 3
+ * PML5/SS-PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_VTDSS_H
+#define __GENERIC_PT_FMT_VTDSS_H
+
+#include "defs_vtdss.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* SSPTPTR is 4k aligned and limited by HAW */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ VTDSS_FMT_R = BIT(0),
+ VTDSS_FMT_W = BIT(1),
+ VTDSS_FMT_A = BIT(8),
+ VTDSS_FMT_D = BIT(9),
+ VTDSS_FMT_SNP = BIT(11),
+ VTDSS_FMT_OA = GENMASK_ULL(51, 12),
+};
+
+/* PDPTE/PDE */
+enum {
+ VTDSS_FMT_PS = BIT(7),
+};
+
+#define common_to_vtdss_pt(common_ptr) \
+ container_of_const(common_ptr, struct pt_vtdss, common)
+#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)
+
+static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa vtdss_pt_table_pa
+
+static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa vtdss_pt_entry_oa
+
+static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf vtdss_pt_can_have_leaf
+
+static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 vtdss_pt_num_items_lg2
+
+static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!entry)
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw vtdss_pt_load_entry_raw
+
+static inline void
+vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= VTDSS_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry vtdss_pt_install_leaf_entry
+
+static inline bool vtdss_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = VTDSS_FMT_R | VTDSS_FMT_W |
+ FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table vtdss_pt_install_table
+
+static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
+}
+#define pt_attr_from_entry vtdss_pt_attr_from_entry
+
+static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ return READ_ONCE(*tablep) & VTDSS_FMT_D;
+}
+#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty
+
+static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
+}
+#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean
+
+static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | VTDSS_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty
+
+static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
+{
+ return 10;
+}
+#define pt_max_sw_bit vtdss_pt_max_sw_bit
+
+static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
+{
+ if (__builtin_constant_p(bitnr) && bitnr > 10)
+ BUILD_BUG();
+
+ /* Bits marked Ignored in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(10);
+ case 1 ... 9:
+ return BIT_ULL((bitnr - 1) + 52);
+ case 10:
+ return BIT_ULL(63);
+ /* Some bits in 9-3 are available in some entries */
+ default:
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit vtdss_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_vtdss
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->vtdss_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
+ ->iommu;
+}
+
+static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ /*
+ * VTDSS does not have a present bit, so we tell if any entry is present
+ * by checking for R or W.
+ */
+ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+ return -EINVAL;
+
+ if (iommu_prot & IOMMU_READ)
+ pte |= VTDSS_FMT_R;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= VTDSS_FMT_W;
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
+ pte |= VTDSS_FMT_SNP;
+
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
+ !(iommu_prot & IOMMU_WRITE)) {
+ pr_err_ratelimited(
+ "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
+
+static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
+ const struct pt_iommu_vtdss_cfg *cfg)
+{
+ struct pt_vtdss *table = &iommu_table->vtdss_pt;
+
+ if (cfg->top_level > 4 || cfg->top_level < 2)
+ return -EOPNOTSUPP;
+
+ pt_top_set_level(&table->common, cfg->top_level);
+ return 0;
+}
+#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
+
+static inline void
+vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_vtdss_hw_info *info)
+{
+ info->ssptptr = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
+ /*
+ * top_level = 2 = 3 level table aw=1
+ * top_level = 3 = 4 level table aw=2
+ * top_level = 4 = 5 level table aw=3
+ */
+ info->aw = top_range->top_level - 1;
+}
+#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
+ [0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2},
+ [1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3},
+ [2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4},
+};
+#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
+#endif
+#endif
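Note the kunit configs above line up with the level arithmetic for this format: a 3-level table (top_level = 2) resolves 12 + 3 * 9 = 39 IOVA bits, a 4-level table 48 bits, and a 5-level table 57 bits, matching the hw_max_vasz_lg2 values of 39/48/57.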
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
new file mode 100644
index 000000000000..210748d9d6e8
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * x86 page table. Supports the 4 and 5 level variations.
+ *
+ * The 4 and 5 level version is described in:
+ * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software
+ * Developer's Manual Volume 3
+ *
+ * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization
+ * Technology for Directed I/O Architecture Specification"
+ *
+ * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
+ * Virtualization Technology (IOMMU) Specification"
+ *
+ * It is used by x86 CPUs, AMD and VT-d IOMMU HW.
+ *
+ * Note the 3 level format is very similar and could almost be implemented
+ * here. The reserved/ignored layout is different and there are functional bit
+ * differences.
+ *
+ * This format uses PT_FEAT_SIGN_EXTEND to have an upper/non-canonical/lower
+ * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign
+ * extended addressing with this page table format.
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/PTE - 0
+ * Directory/PDE - 1
+ * Directory Ptr/PDPTE - 2
+ * PML4/PML4E - 3
+ * PML5/PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_X86_64_H
+#define __GENERIC_PT_FMT_X86_64_H
+
+#include "defs_x86_64.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+#include <linux/mem_encrypt.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /*
+ * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k
+ * aligned and is limited by the architected HAW
+ */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ X86_64_FMT_P = BIT(0),
+ X86_64_FMT_RW = BIT(1),
+ X86_64_FMT_U = BIT(2),
+ X86_64_FMT_A = BIT(5),
+ X86_64_FMT_D = BIT(6),
+ X86_64_FMT_OA = GENMASK_ULL(51, 12),
+ X86_64_FMT_XD = BIT_ULL(63),
+};
+
+/* PDPTE/PDE */
+enum {
+ X86_64_FMT_PS = BIT(7),
+};
+
+static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa x86_64_pt_table_pa
+
+static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa x86_64_pt_entry_oa
+
+static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf x86_64_pt_can_have_leaf
+
+static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 x86_64_pt_num_items_lg2
+
+static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!(entry & X86_64_FMT_P))
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw x86_64_pt_load_entry_raw
+
+static inline void
+x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = X86_64_FMT_P |
+ FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= X86_64_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry x86_64_pt_install_leaf_entry
+
+static inline bool x86_64_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+ FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table x86_64_pt_install_table
+
+static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+ X86_64_FMT_D | X86_64_FMT_XD);
+}
+#define pt_attr_from_entry x86_64_pt_attr_from_entry
+
+static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common)
+{
+ return 12;
+}
+#define pt_max_sw_bit x86_64_pt_max_sw_bit
+
+static inline u64 x86_64_pt_sw_bit(unsigned int bitnr)
+{
+ if (__builtin_constant_p(bitnr) && bitnr > 12)
+ BUILD_BUG();
+
+ /* Bits marked Ignored/AVL in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(9);
+ case 1:
+ return BIT(11);
+ case 2 ... 12:
+ return BIT_ULL((bitnr - 2) + 52);
+ /* Some bits in 8,6,4,3 are available in some entries */
+ default:
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit x86_64_pt_sw_bit
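+
+/*
+ * For reference, the resulting mapping of SW bit numbers to descriptor bits
+ * is: 0 -> bit 9, 1 -> bit 11, and 2..12 -> bits 52..62. All of these are
+ * Ignored/AVL bits in the formats above; bit 63 (XD) is not available.
+ */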
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_x86_64
+
+/* The struct pt_common is embedded within the per-format page table struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->x86_64_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, x86_64_pt.common)
+ ->iommu;
+}
+
+static inline int x86_64_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte;
+
+ pte = X86_64_FMT_U | X86_64_FMT_A;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= X86_64_FMT_RW | X86_64_FMT_D;
+
+ /*
+ * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+ * control this. For now if the tables use sme_set then so do the ptes.
+ */
+ if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot x86_64_pt_iommu_set_prot
+
+static inline int
+x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table,
+ const struct pt_iommu_x86_64_cfg *cfg)
+{
+ struct pt_x86_64 *table = &iommu_table->x86_64_pt;
+
+ if (cfg->top_level < 3 || cfg->top_level > 4)
+ return -EOPNOTSUPP;
+
+ pt_top_set_level(&table->common, cfg->top_level);
+
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ return 0;
+}
+#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init
+
+static inline void
+x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_x86_64_hw_info *info)
+{
+ info->gcr3_pt = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK);
+ info->levels = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = {
+ [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+ .common.hw_max_vasz_lg2 = 48, .top_level = 3 },
+ [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+ .common.hw_max_vasz_lg2 = 57, .top_level = 4 },
+ /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */
+ [2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 },
+	[3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4 },
+};
+#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND) };
+#endif
+#endif
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
new file mode 100644
index 000000000000..97aeda1ad01c
--- /dev/null
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -0,0 +1,1289 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * "Templated C code" for implementing the iommu operations for page tables.
+ * This is compiled multiple times, over all the page table formats to pick up
+ * the per-format definitions.
+ */
+#ifndef __GENERIC_PT_IOMMU_PT_H
+#define __GENERIC_PT_IOMMU_PT_H
+
+#include "pt_iter.h"
+
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include "../iommu-pages.h"
+#include <linux/cleanup.h>
+#include <linux/dma-mapping.h>
+
+enum {
+ SW_BIT_CACHE_FLUSH_DONE = 0,
+};
+
+static void flush_writes_range(const struct pt_state *pts,
+ unsigned int start_index, unsigned int end_index)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, start_index * PT_ITEM_WORD_SIZE,
+ (end_index - start_index) * PT_ITEM_WORD_SIZE);
+}
+
+static void flush_writes_item(const struct pt_state *pts)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, pts->index * PT_ITEM_WORD_SIZE,
+ PT_ITEM_WORD_SIZE);
+}
+
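+/*
+ * Queue table pages on @free_list to be freed once the IOTLB has been flushed
+ * and record the invalidation in @iotlb_gather. Formats that cannot gather
+ * disjoint ranges (PT_FEAT_FLUSH_RANGE_NO_GAPS) sync immediately instead of
+ * widening the gathered range.
+ */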
+static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
+ struct pt_iommu *iommu_table, pt_vaddr_t iova,
+ pt_vaddr_t len,
+ struct iommu_pages_list *free_list)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(free_list,
+ iommu_table->iommu_device);
+
+ if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
+ iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
+ iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
+ /*
+ * Note that the sync frees the gather's free list, so we must
+ * not have any pages on that list that are covered by iova/len
+ */
+ } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
+ iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
+ }
+
+ iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
+}
+
+#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
+
+static int make_range_ul(struct pt_common *common, struct pt_range *range,
+ unsigned long iova, unsigned long len)
+{
+ unsigned long last;
+
+ if (unlikely(len == 0))
+ return -EINVAL;
+
+ if (check_add_overflow(iova, len - 1, &last))
+ return -EOVERFLOW;
+
+ *range = pt_make_range(common, iova, last);
+ if (sizeof(iova) > sizeof(range->va)) {
+ if (unlikely(range->va != iova || range->last_va != last))
+ return -EOVERFLOW;
+ }
+ return 0;
+}
+
+static __maybe_unused int make_range_u64(struct pt_common *common,
+ struct pt_range *range, u64 iova,
+ u64 len)
+{
+ if (unlikely(iova > ULONG_MAX || len > ULONG_MAX))
+ return -EOVERFLOW;
+ return make_range_ul(common, range, iova, len);
+}
+
+/*
+ * Some APIs use unsigned long, while others use dma_addr_t as the type. Dispatch
+ * to the correct validation based on the type.
+ */
+#define make_range_no_check(common, range, iova, len) \
+ ({ \
+ int ret; \
+ if (sizeof(iova) > sizeof(unsigned long) || \
+ sizeof(len) > sizeof(unsigned long)) \
+ ret = make_range_u64(common, range, iova, len); \
+ else \
+ ret = make_range_ul(common, range, iova, len); \
+ ret; \
+ })
+
+#define make_range(common, range, iova, len) \
+ ({ \
+ int ret = make_range_no_check(common, range, iova, len); \
+ if (!ret) \
+ ret = pt_check_range(range); \
+ ret; \
+ })
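+
+/*
+ * For instance, on a 32-bit kernel with CONFIG_ARCH_DMA_ADDR_T_64BIT the
+ * dma_addr_t iova passed to iova_to_phys() is wider than unsigned long, so it
+ * goes through make_range_u64(), which first checks the value fits in the
+ * unsigned long math used everywhere else.
+ */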
+
+static inline unsigned int compute_best_pgsize(struct pt_state *pts,
+ pt_oaddr_t oa)
+{
+ struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);
+
+ if (!pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * The page size is limited by the domain's bitmap. This allows the core
+ * code to reduce the supported page sizes by changing the bitmap.
+ */
+ return pt_compute_best_pgsize(pt_possible_sizes(pts) &
+ iommu_table->domain.pgsize_bitmap,
+ pts->range->va, pts->range->last_va, oa);
+}
+
+static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ pt_oaddr_t *res = arg;
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY:
+ return -ENOENT;
+ case PT_ENTRY_TABLE:
+ return pt_descend(&pts, arg, descend_fn);
+ case PT_ENTRY_OA:
+ *res = pt_entry_oa_exact(&pts);
+ return 0;
+ }
+ return -ENOENT;
+}
+PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys);
+
+/**
+ * iova_to_phys() - Return the output address for the given IOVA
+ * @domain: Table to query
+ * @iova: IO virtual address to query
+ *
+ * Determine the output address from the given IOVA. @iova may have any
+ * alignment; the returned physical address includes the sub page offset.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Return: The translated physical address, or 0 if there is no translation
+ * for the given iova.
+ */
+phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_range range;
+ pt_oaddr_t res;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return ret;
+
+ ret = pt_walk_range(&range, __iova_to_phys, &res);
+ /* PHYS_ADDR_MAX would be a better error code */
+ if (ret)
+ return 0;
+ return res;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");
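+
+/*
+ * Usage sketch (illustrative, not part of this patch): a caller resolving a
+ * single IOVA through the core API, where a return of 0 means there is no
+ * translation:
+ *
+ *	phys_addr_t pa = iommu_iova_to_phys(domain, iova);
+ *	if (!pa)
+ *		return -ENOENT;
+ */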
+
+struct pt_iommu_dirty_args {
+ struct iommu_dirty_bitmap *dirty;
+ unsigned int flags;
+};
+
+static void record_dirty(struct pt_state *pts,
+ struct pt_iommu_dirty_args *dirty,
+ unsigned int num_contig_lg2)
+{
+ pt_vaddr_t dirty_len;
+
+ if (num_contig_lg2 != ilog2(1)) {
+ unsigned int index = pts->index;
+ unsigned int end_index = log2_set_mod_max_t(
+ unsigned int, pts->index, num_contig_lg2);
+
+ /* Adjust for being contained inside a contiguous page */
+ end_index = min(end_index, pts->end_index);
+ dirty_len = (end_index - index) *
+ log2_to_int(pt_table_item_lg2sz(pts));
+ } else {
+ dirty_len = log2_to_int(pt_table_item_lg2sz(pts));
+ }
+
+ if (dirty->dirty->bitmap)
+ iova_bitmap_set(dirty->dirty->bitmap, pts->range->va,
+ dirty_len);
+
+ if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
+ /*
+ * No write log required because DMA incoherence and atomic
+ * dirty tracking bits can't work together
+ */
+ pt_entry_make_write_clean(pts);
+ iommu_iotlb_gather_add_range(dirty->dirty->gather,
+ pts->range->va, dirty_len);
+ }
+}
+
+static inline int __read_and_clear_dirty(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_dirty_args *dirty = arg;
+ int ret;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ ret = pt_descend(&pts, arg, __read_and_clear_dirty);
+ if (ret)
+ return ret;
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts))
+ record_dirty(&pts, dirty,
+ pt_entry_num_contig_lg2(&pts));
+ }
+ return 0;
+}
+
+/**
+ * read_and_clear_dirty() - Manipulate the HW set write dirty state
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the IOVA
+ * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR
+ * @dirty: Place to store the dirty bits
+ *
+ * Iterate over all the entries in the mapped range and record their write dirty
+ * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is specified then the
+ * entries are left dirty, otherwise they are returned to being not write
+ * dirty.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_iommu_dirty_args dirty_args = {
+ .dirty = dirty,
+ .flags = flags,
+ };
+ struct pt_range range;
+ int ret;
+
+#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
+ return -EOPNOTSUPP;
+#endif
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
+ if (ret)
+ return ret;
+
+ ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
+ PT_WARN_ON(ret);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");
+
+static inline int __set_dirty(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY:
+ return -ENOENT;
+ case PT_ENTRY_TABLE:
+ return pt_descend(&pts, arg, __set_dirty);
+ case PT_ENTRY_OA:
+ if (!pt_entry_make_write_dirty(&pts))
+ return -EAGAIN;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table,
+ dma_addr_t iova)
+{
+ struct pt_range range;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return ret;
+
+ /*
+	 * Note: There is no locking here yet; if the test suite races this it
+ * can crash. It should use RCU locking eventually.
+ */
+ return pt_walk_range(&range, __set_dirty, NULL);
+}
+
+struct pt_iommu_collect_args {
+ struct iommu_pages_list free_list;
+ /* Fail if any OAs are within the range */
+ u8 check_mapped : 1;
+};
+
+static int __collect_tables(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_collect_args *collect = arg;
+ int ret;
+
+ if (!collect->check_mapped && !pt_can_have_table(&pts))
+ return 0;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ iommu_pages_list_add(&collect->free_list, pts.table_lower);
+ ret = pt_descend(&pts, arg, __collect_tables);
+ if (ret)
+ return ret;
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA && collect->check_mapped)
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+enum alloc_mode { ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH };
+
+/* Allocate a table, the empty table will be ready to be installed. */
+static inline struct pt_table_p *_table_alloc(struct pt_common *common,
+ size_t lg2sz, gfp_t gfp,
+ enum alloc_mode mode)
+{
+ struct pt_iommu *iommu_table = iommu_from_common(common);
+ struct pt_table_p *table_mem;
+
+	table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
+					      log2_to_int(lg2sz));
+	if (!table_mem)
+		return ERR_PTR(-ENOMEM);
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ mode == ALLOC_NORMAL) {
+ int ret = iommu_pages_start_incoherent(
+ table_mem, iommu_table->iommu_device);
+ if (ret) {
+ iommu_free_pages(table_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ return table_mem;
+}
+
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+ uintptr_t top_of_table,
+ gfp_t gfp,
+ enum alloc_mode mode)
+{
+	/*
+	 * The top table doesn't need the free list or the rest of the
+	 * iommu-pages machinery, so technically it doesn't have to use the
+	 * iommu pages API. Use it anyhow to keep things simple, as the top is
+	 * usually not smaller than PAGE_SIZE.
+	 */
+ return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
+ gfp, mode);
+}
+
+/* Allocate an interior table */
+static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
+ gfp_t gfp, enum alloc_mode mode)
+{
+ struct pt_state child_pts =
+ pt_init(parent_pts->range, parent_pts->level - 1, NULL);
+
+ return _table_alloc(parent_pts->range->common,
+ pt_num_items_lg2(&child_pts) +
+ ilog2(PT_ITEM_WORD_SIZE),
+ gfp, mode);
+}
+
+static inline int pt_iommu_new_table(struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ struct pt_table_p *table_mem;
+ phys_addr_t phys;
+
+ /* Given PA/VA/length can't be represented */
+ if (PT_WARN_ON(!pt_can_have_table(pts)))
+ return -ENXIO;
+
+ table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
+ if (IS_ERR(table_mem))
+ return PTR_ERR(table_mem);
+
+ phys = virt_to_phys(table_mem);
+ if (!pt_install_table(pts, phys, attrs)) {
+ iommu_pages_free_incoherent(
+ table_mem,
+ iommu_from_common(pts->range->common)->iommu_device);
+ return -EAGAIN;
+ }
+
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) {
+ flush_writes_item(pts);
+ pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+ /*
+		 * The format may not be able to store this physical table
+		 * address; check by reading the entry back. This can happen
+		 * when kunit testing formats outside their normal environment,
+		 * where the CPU's address width may be more limited.
+ */
+ pt_load_single_entry(pts);
+ if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
+ pt_clear_entries(pts, ilog2(1));
+ iommu_pages_free_incoherent(
+ table_mem, iommu_from_common(pts->range->common)
+ ->iommu_device);
+ return -EINVAL;
+ }
+ }
+
+ pts->table_lower = table_mem;
+ return 0;
+}
+
+struct pt_iommu_map_args {
+ struct iommu_iotlb_gather *iotlb_gather;
+ struct pt_write_attrs attrs;
+ pt_oaddr_t oa;
+ unsigned int leaf_pgsize_lg2;
+ unsigned int leaf_level;
+};
+
+/*
+ * This will recursively check any tables in the block to validate they are
+ * empty and then free them through the gather.
+ */
+static int clear_contig(const struct pt_state *start_pts,
+ struct iommu_iotlb_gather *iotlb_gather,
+ unsigned int step, unsigned int pgsize_lg2)
+{
+ struct pt_iommu *iommu_table =
+ iommu_from_common(start_pts->range->common);
+ struct pt_range range = *start_pts->range;
+ struct pt_state pts =
+ pt_init(&range, start_pts->level, start_pts->table);
+ struct pt_iommu_collect_args collect = { .check_mapped = true };
+ int ret;
+
+ pts.index = start_pts->index;
+ pts.end_index = start_pts->index + step;
+ for (; _pt_iter_load(&pts); pt_next_entry(&pts)) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ collect.free_list =
+ IOMMU_PAGES_LIST_INIT(collect.free_list);
+ ret = pt_walk_descend_all(&pts, __collect_tables,
+ &collect);
+ if (ret)
+ return ret;
+
+ /*
+ * The table item must be cleared before we can update
+ * the gather
+ */
+ pt_clear_entries(&pts, ilog2(1));
+ flush_writes_item(&pts);
+
+ iommu_pages_list_add(&collect.free_list,
+ pt_table_ptr(&pts));
+ gather_range_pages(
+ iotlb_gather, iommu_table, range.va,
+ log2_to_int(pt_table_item_lg2sz(&pts)),
+ &collect.free_list);
+ } else if (pts.type != PT_ENTRY_EMPTY) {
+ return -EADDRINUSE;
+ }
+ }
+ return 0;
+}
+
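+/*
+ * Install leaf entries of size map->leaf_pgsize_lg2 across a single table at
+ * map->leaf_level. Empty lower tables covered by a new contiguous entry are
+ * freed through the gather; anything still mapped fails with -EADDRINUSE.
+ */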
+static int __map_range_leaf(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
+ unsigned int start_index;
+ pt_oaddr_t oa = map->oa;
+ unsigned int step;
+ bool need_contig;
+ int ret = 0;
+
+ PT_WARN_ON(map->leaf_level != level);
+ PT_WARN_ON(!pt_can_have_leaf(&pts));
+
+ step = log2_to_int_t(unsigned int,
+ leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts));
+ need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts);
+
+ _pt_iter_first(&pts);
+ start_index = pts.index;
+ do {
+ pts.type = pt_load_entry_raw(&pts);
+ if (pts.type != PT_ENTRY_EMPTY || need_contig) {
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ ret = clear_contig(&pts, map->iotlb_gather, step,
+ leaf_pgsize_lg2);
+ if (ret)
+ break;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+ pt_index_to_va(&pts);
+ PT_WARN_ON(compute_best_pgsize(&pts, oa) !=
+ leaf_pgsize_lg2);
+ }
+ pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs);
+
+ oa += log2_to_int(leaf_pgsize_lg2);
+ pts.index += step;
+ } while (pts.index < pts.end_index);
+
+ flush_writes_range(&pts, start_index, pts.index);
+
+ map->oa = oa;
+ return ret;
+}
+
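+/*
+ * Walk the non-leaf levels towards map->leaf_level, installing missing child
+ * tables along the way. Tables can be shared with concurrent maps, so a lost
+ * race to install a table simply reloads the entry and retries.
+ */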
+static int __map_range(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ int ret;
+
+ PT_WARN_ON(map->leaf_level == level);
+ PT_WARN_ON(!pt_can_have_table(&pts));
+
+ _pt_iter_first(&pts);
+
+ /* Descend to a child table */
+ do {
+ pts.type = pt_load_entry_raw(&pts);
+
+ if (pts.type != PT_ENTRY_TABLE) {
+ if (pts.type != PT_ENTRY_EMPTY)
+ return -EADDRINUSE;
+ ret = pt_iommu_new_table(&pts, &map->attrs);
+ if (ret) {
+ /*
+ * Racing with another thread installing a table
+ */
+ if (ret == -EAGAIN)
+ continue;
+ return ret;
+ }
+ } else {
+ pts.table_lower = pt_table_ptr(&pts);
+ /*
+ * Racing with a shared pt_iommu_new_table()? The other
+ * thread is still flushing the cache, so we have to
+ * also flush it to ensure that when our thread's map
+ * completes all the table items leading to our mapping
+ * are visible.
+ *
+			 * This requires the pt_set_sw_bit_release() to be a
+ * release of the cache flush so that this can acquire
+ * visibility at the iommu.
+ */
+ if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) &&
+ !pt_test_sw_bit_acquire(&pts,
+ SW_BIT_CACHE_FLUSH_DONE))
+ flush_writes_item(&pts);
+ }
+
+ /*
+ * The already present table can possibly be shared with another
+ * concurrent map.
+ */
+ if (map->leaf_level == level - 1)
+ ret = pt_descend(&pts, arg, __map_range_leaf);
+ else
+ ret = pt_descend(&pts, arg, __map_range);
+ if (ret)
+ return ret;
+
+ pts.index++;
+ pt_index_to_va(&pts);
+ if (pts.index >= pts.end_index)
+ break;
+ } while (true);
+ return 0;
+}
+
+/*
+ * Fast path for the easy case of mapping a 4k page to an already allocated
+ * table. This is a common workload. If it returns -EAGAIN run the full
+ * algorithm instead.
+ */
+static __always_inline int __do_map_single_page(struct pt_range *range,
+ void *arg, unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+
+ pts.type = pt_load_single_entry(&pts);
+ if (level == 0) {
+ if (pts.type != PT_ENTRY_EMPTY)
+ return -EADDRINUSE;
+ pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
+ &map->attrs);
+ /* No flush, not used when incoherent */
+ map->oa += PAGE_SIZE;
+ return 0;
+ }
+ if (pts.type == PT_ENTRY_TABLE)
+ return pt_descend(&pts, arg, descend_fn);
+ /* Something else, use the slow path */
+ return -EAGAIN;
+}
+PT_MAKE_LEVELS(__map_single_page, __do_map_single_page);
+
+/*
+ * Add a table to the top, increasing the top level as much as necessary to
+ * encompass the range.
+ */
+static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
+ struct pt_iommu_map_args *map)
+{
+ struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
+ struct pt_common *common = common_from_iommu(iommu_table);
+ uintptr_t top_of_table = READ_ONCE(common->top_of_table);
+ uintptr_t new_top_of_table = top_of_table;
+ struct pt_table_p *table_mem;
+ unsigned int new_level;
+ spinlock_t *domain_lock;
+ unsigned long flags;
+ int ret;
+
+ while (true) {
+ struct pt_range top_range =
+ _pt_top_range(common, new_top_of_table);
+ struct pt_state pts = pt_init_top(&top_range);
+
+ top_range.va = range->va;
+ top_range.last_va = range->last_va;
+
+ if (!pt_check_range(&top_range) &&
+ map->leaf_level <= pts.level) {
+ new_level = pts.level;
+ break;
+ }
+
+ pts.level++;
+ if (pts.level > PT_MAX_TOP_LEVEL ||
+ pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
+ ret = -ERANGE;
+ goto err_free;
+ }
+
+ table_mem =
+ table_alloc_top(common, _pt_top_set(NULL, pts.level),
+ map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH);
+ if (IS_ERR(table_mem)) {
+ ret = PTR_ERR(table_mem);
+ goto err_free;
+ }
+ iommu_pages_list_add(&free_list, table_mem);
+
+ /* The new table links to the lower table always at index 0 */
+ top_range.va = 0;
+ top_range.top_level = pts.level;
+ pts.table_lower = pts.table;
+ pts.table = table_mem;
+ pt_load_single_entry(&pts);
+ PT_WARN_ON(pts.index != 0);
+ pt_install_table(&pts, virt_to_phys(pts.table_lower),
+ &map->attrs);
+ new_top_of_table = _pt_top_set(pts.table, pts.level);
+ }
+
+ /*
+	 * Avoid double flushing; flush once after all the pt_install_table()
+	 * calls.
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ ret = iommu_pages_start_incoherent_list(
+ &free_list, iommu_table->iommu_device);
+ if (ret)
+ goto err_free;
+ }
+
+ /*
+ * top_of_table is write locked by the spinlock, but readers can use
+ * READ_ONCE() to get the value. Since we encode both the level and the
+	 * pointer in one quantum the lockless reader will always see something
+ * valid. The HW must be updated to the new level under the spinlock
+ * before top_of_table is updated so that concurrent readers don't map
+ * into the new level until it is fully functional. If another thread
+ * already updated it while we were working then throw everything away
+ * and try again.
+ */
+ domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table);
+ spin_lock_irqsave(domain_lock, flags);
+ if (common->top_of_table != top_of_table ||
+ top_of_table == new_top_of_table) {
+ spin_unlock_irqrestore(domain_lock, flags);
+ ret = -EAGAIN;
+ goto err_free;
+ }
+
+ /*
+ * We do not issue any flushes for change_top on the expectation that
+ * any walk cache will not become a problem by adding another layer to
+ * the tree. Misses will rewalk from the updated top pointer, hits
+ * continue to be correct. Negative caching is fine too since all the
+ * new IOVA added by the new top is non-present.
+ */
+ iommu_table->driver_ops->change_top(
+ iommu_table, virt_to_phys(table_mem), new_level);
+ WRITE_ONCE(common->top_of_table, new_top_of_table);
+ spin_unlock_irqrestore(domain_lock, flags);
+ return 0;
+
+err_free:
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&free_list,
+ iommu_table->iommu_device);
+ iommu_put_pages_list(&free_list);
+ return ret;
+}
+
+static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range,
+ struct pt_iommu_map_args *map)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ int ret;
+
+ do {
+ ret = pt_check_range(range);
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ return ret;
+
+ if (!ret && map->leaf_level <= range->top_level)
+ break;
+
+ ret = increase_top(iommu_table, range, map);
+ if (ret && ret != -EAGAIN)
+ return ret;
+
+ /* Reload the new top */
+ *range = pt_make_range(common, range->va, range->last_va);
+ } while (ret);
+ PT_WARN_ON(pt_check_range(range));
+ return 0;
+}
+
+static int do_map(struct pt_range *range, struct pt_common *common,
+ bool single_page, struct pt_iommu_map_args *map)
+{
+ /*
+ * The __map_single_page() fast path does not support DMA_INCOHERENT
+ * flushing to keep its .text small.
+ */
+ if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ int ret;
+
+ ret = pt_walk_range(range, __map_single_page, map);
+ if (ret != -EAGAIN)
+ return ret;
+ /* EAGAIN falls through to the full path */
+ }
+
+ if (map->leaf_level == range->top_level)
+ return pt_walk_range(range, __map_range_leaf, map);
+ return pt_walk_range(range, __map_range, map);
+}
+
+/**
+ * map_pages() - Install translation for an IOVA range
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @paddr: Physical/Output address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
+ * @gfp: GFP flags for any memory allocations
+ * @mapped: Total bytes successfully mapped
+ *
+ * The range starting at IOVA will have paddr installed into it. The caller
+ * must specify a valid pgsize and pgcount to segment the range into compatible
+ * blocks.
+ *
+ * On error the caller will probably want to invoke unmap on the range from iova
+ * up to the amount indicated by @mapped to return the table back to an
+ * unchanged state.
+ *
+ * Context: The caller must hold a write range lock that includes the whole
+ * range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that
+ * were mapped is added to @mapped; @mapped is not zeroed first.
+ */
+int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct iommu_iotlb_gather iotlb_gather;
+ pt_vaddr_t len = pgsize * pgcount;
+ struct pt_iommu_map_args map = {
+ .iotlb_gather = &iotlb_gather,
+ .oa = paddr,
+ .leaf_pgsize_lg2 = vaffs(pgsize),
+ };
+ bool single_page = false;
+ struct pt_range range;
+ int ret;
+
+ iommu_iotlb_gather_init(&iotlb_gather);
+
+ if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
+ return -EINVAL;
+
+ /* Check the paddr doesn't exceed what the table can store */
+ if ((sizeof(pt_oaddr_t) < sizeof(paddr) &&
+ (pt_vaddr_t)paddr > PT_VADDR_MAX) ||
+ (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
+ oalog2_div(paddr, common->max_oasz_lg2)))
+ return -ERANGE;
+
+ ret = pt_iommu_set_prot(common, &map.attrs, prot);
+ if (ret)
+ return ret;
+ map.attrs.gfp = gfp;
+
+ ret = make_range_no_check(common, &range, iova, len);
+ if (ret)
+ return ret;
+
+ /* Calculate target page size and level for the leaves */
+ if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
+ pgcount == 1) {
+ PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
+ if (log2_mod(iova | paddr, PAGE_SHIFT))
+ return -ENXIO;
+ map.leaf_pgsize_lg2 = PAGE_SHIFT;
+ map.leaf_level = 0;
+ single_page = true;
+ } else {
+ map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
+ pgsize_bitmap, range.va, range.last_va, paddr);
+ if (!map.leaf_pgsize_lg2)
+ return -ENXIO;
+ map.leaf_level =
+ pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
+ }
+
+ ret = check_map_range(iommu_table, &range, &map);
+ if (ret)
+ return ret;
+
+ PT_WARN_ON(map.leaf_level > range.top_level);
+
+ ret = do_map(&range, common, single_page, &map);
+
+ /*
+ * Table levels were freed and replaced with large items, flush any walk
+ * cache that may refer to the freed levels.
+ */
+ if (!iommu_pages_list_empty(&iotlb_gather.freelist))
+ iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);
+
+ /* Bytes successfully mapped */
+ PT_WARN_ON(!ret && map.oa - paddr != len);
+ *mapped += map.oa - paddr;
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");
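+
+/*
+ * Usage sketch (illustrative, not part of this patch): mapping a physically
+ * contiguous region as 16 pages of 4K through the iommu_domain_ops and undoing
+ * the partial work on failure:
+ *
+ *	size_t mapped = 0;
+ *	int ret;
+ *
+ *	ret = domain->ops->map_pages(domain, iova, paddr, SZ_4K, 16,
+ *				     IOMMU_READ | IOMMU_WRITE, GFP_KERNEL,
+ *				     &mapped);
+ *	if (ret && mapped)
+ *		domain->ops->unmap_pages(domain, iova, SZ_4K, mapped / SZ_4K,
+ *					 &gather);
+ */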
+
+struct pt_unmap_args {
+ struct iommu_pages_list free_list;
+ pt_vaddr_t unmapped;
+};
+
+static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_unmap_args *unmap = arg;
+ unsigned int num_oas = 0;
+ unsigned int start_index;
+ int ret = 0;
+
+ _pt_iter_first(&pts);
+ start_index = pts.index;
+ pts.type = pt_load_entry_raw(&pts);
+ /*
+ * A starting index is in the middle of a contiguous entry
+ *
+ * The IOMMU API does not require drivers to support unmapping parts of
+ * large pages. Long ago VFIO would try to split maps but the current
+ * version never does.
+ *
+ * Instead when unmap reaches a partial unmap of the start of a large
+ * IOPTE it should remove the entire IOPTE and return that size to the
+ * caller.
+ */
+ if (pts.type == PT_ENTRY_OA) {
+ if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
+ return -EINVAL;
+ /* Micro optimization */
+ goto start_oa;
+ }
+
+ do {
+ if (pts.type != PT_ENTRY_OA) {
+ bool fully_covered;
+
+ if (pts.type != PT_ENTRY_TABLE) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ pts.table_lower = pt_table_ptr(&pts);
+
+ fully_covered = pt_entry_fully_covered(
+ &pts, pt_table_item_lg2sz(&pts));
+
+ ret = pt_descend(&pts, arg, __unmap_range);
+ if (ret)
+ break;
+
+ /*
+ * If the unmapping range fully covers the table then we
+ * can free it as well. The clear is delayed until we
+ * succeed in clearing the lower table levels.
+ */
+ if (fully_covered) {
+ iommu_pages_list_add(&unmap->free_list,
+ pts.table_lower);
+ pt_clear_entries(&pts, ilog2(1));
+ }
+ pts.index++;
+ } else {
+ unsigned int num_contig_lg2;
+start_oa:
+ /*
+			 * If the caller requested a last address that falls
+			 * within a single entry then the entire entry is
+			 * unmapped and the length returned will be larger
+			 * than requested.
+ */
+ num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
+ pt_clear_entries(&pts, num_contig_lg2);
+ num_oas += log2_to_int(num_contig_lg2);
+ pts.index += log2_to_int(num_contig_lg2);
+ }
+ if (pts.index >= pts.end_index)
+ break;
+ pts.type = pt_load_entry_raw(&pts);
+ } while (true);
+
+ unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
+ flush_writes_range(&pts, start_index, pts.index);
+
+ return ret;
+}
+
+/**
+ * unmap_pages() - Make a range of IOVA empty/not present
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @iotlb_gather: Gather struct that must be flushed on return
+ *
+ * unmap_pages() will remove a translation created by map_pages(). It cannot
+ * subdivide a mapping created by map_pages(), so it should be called with IOVA
+ * ranges that match those passed to map_pages(). The IOVA range can aggregate
+ * contiguous map_pages() calls so long as no individual range is split.
+ *
+ * Context: The caller must hold a write range lock that includes
+ * the whole range.
+ *
+ * Returns: Number of bytes of VA unmapped. iova + res will be the point
+ * unmapping stopped.
+ */
+size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
+ unmap.free_list) };
+ pt_vaddr_t len = pgsize * pgcount;
+ struct pt_range range;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
+ if (ret)
+ return 0;
+
+ pt_walk_range(&range, __unmap_range, &unmap);
+
+ gather_range_pages(iotlb_gather, iommu_table, iova, len,
+ &unmap.free_list);
+
+ return unmap.unmapped;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU");
+
+static void NS(get_info)(struct pt_iommu *iommu_table,
+ struct pt_iommu_info *info)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_top_range(common);
+ struct pt_state pts = pt_init_top(&range);
+ pt_vaddr_t pgsize_bitmap = 0;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
+ for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
+ pts.level++) {
+ if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
+ break;
+ pgsize_bitmap |= pt_possible_sizes(&pts);
+ }
+ } else {
+ for (pts.level = 0; pts.level <= range.top_level; pts.level++)
+ pgsize_bitmap |= pt_possible_sizes(&pts);
+ }
+
+ /* Hide page sizes larger than the maximum OA */
+ info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
+}
+
+static void NS(deinit)(struct pt_iommu *iommu_table)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_all_range(common);
+ struct pt_iommu_collect_args collect = {
+ .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+ };
+
+ iommu_pages_list_add(&collect.free_list, range.top_table);
+ pt_walk_range(&range, __collect_tables, &collect);
+
+ /*
+ * The driver has to already have fenced the HW access to the page table
+ * and invalidated any caching referring to this memory.
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&collect.free_list,
+ iommu_table->iommu_device);
+ iommu_put_pages_list(&collect.free_list);
+}
+
+static const struct pt_iommu_ops NS(ops) = {
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \
+ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
+ .set_dirty = NS(set_dirty),
+#endif
+ .get_info = NS(get_info),
+ .deinit = NS(deinit),
+};
+
+static int pt_init_common(struct pt_common *common)
+{
+ struct pt_range top_range = pt_top_range(common);
+
+ if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
+ return -EINVAL;
+
+ if (top_range.top_level == PT_MAX_TOP_LEVEL ||
+ common->max_vasz_lg2 == top_range.max_vasz_lg2)
+ common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);
+
+ if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
+ common->features |= BIT(PT_FEAT_FULL_VA);
+
+ /* Requested features must match features compiled into this format */
+ if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
+ (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
+ (common->features & PT_FORCE_ENABLED_FEATURES) !=
+ PT_FORCE_ENABLED_FEATURES))
+ return -EOPNOTSUPP;
+
+ /*
+ * Check if the top level of the page table is too small to hold the
+	 * specified maximum VA size.
+ */
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ top_range.top_level != PT_MAX_TOP_LEVEL) {
+ struct pt_state pts = { .range = &top_range,
+ .level = top_range.top_level };
+
+ if (common->max_vasz_lg2 >
+ pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts))
+ return -EOPNOTSUPP;
+ }
+
+ if (common->max_oasz_lg2 == 0)
+ common->max_oasz_lg2 = pt_max_oa_lg2(common);
+ else
+ common->max_oasz_lg2 = min(common->max_oasz_lg2,
+ pt_max_oa_lg2(common));
+ return 0;
+}
+
+static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
+ struct iommu_domain *domain)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_iommu_info info;
+ struct pt_range range;
+
+ NS(get_info)(iommu_table, &info);
+
+ domain->type = __IOMMU_DOMAIN_PAGING;
+ domain->pgsize_bitmap = info.pgsize_bitmap;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ range = _pt_top_range(common,
+ _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
+ else
+ range = pt_top_range(common);
+
+ /* A 64-bit high address space table on a 32-bit system cannot work. */
+ domain->geometry.aperture_start = (unsigned long)range.va;
+ if ((pt_vaddr_t)domain->geometry.aperture_start != range.va)
+ return -EOVERFLOW;
+
+ /*
+ * The aperture is limited to what the API can do after considering all
+ * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
+ * to store a VA. Set the aperture to something that is valid for all
+ * cases. Saturate instead of truncate the end if the types are smaller
+ * than the top range. aperture_end should be called aperture_last.
+ */
+ domain->geometry.aperture_end = (unsigned long)range.last_va;
+ if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
+ domain->geometry.aperture_end = ULONG_MAX;
+ domain->pgsize_bitmap &= ULONG_MAX;
+ }
+ domain->geometry.force_aperture = true;
+
+ return 0;
+}
+
+static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_iommu cfg = *iommu_table;
+
+ static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
+ memset_after(fmt_table, 0, iommu.domain);
+
+ /* The caller can initialize some of these values */
+ iommu_table->iommu_device = cfg.iommu_device;
+ iommu_table->driver_ops = cfg.driver_ops;
+ iommu_table->nid = cfg.nid;
+}
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+ const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_table_p *table_mem;
+ int ret;
+
+ if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
+ !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
+ return -EINVAL;
+
+ pt_iommu_zero(fmt_table);
+ common->features = cfg->common.features;
+ common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+ common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
+ ret = pt_iommu_fmt_init(fmt_table, cfg);
+ if (ret)
+ return ret;
+
+ if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common))
+ return -EINVAL;
+
+ ret = pt_init_common(common);
+ if (ret)
+ return ret;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ WARN_ON(!iommu_table->driver_ops ||
+ !iommu_table->driver_ops->change_top ||
+ !iommu_table->driver_ops->get_top_lock))
+ return -EINVAL;
+
+ if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
+ (pt_feature(common, PT_FEAT_FULL_VA) ||
+ pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
+ return -EINVAL;
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ WARN_ON(!iommu_table->iommu_device))
+ return -EINVAL;
+
+ ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
+ if (ret)
+ return ret;
+
+ table_mem = table_alloc_top(common, common->top_of_table, gfp,
+ ALLOC_NORMAL);
+ if (IS_ERR(table_mem))
+ return PTR_ERR(table_mem);
+ pt_top_set(common, table_mem, pt_top_get_level(common));
+
+ /* Must be last, see pt_iommu_deinit() */
+ iommu_table->ops = &NS(ops);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");
+
+#ifdef pt_iommu_fmt_hw_info
+#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
+#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
+void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
+ struct pt_iommu_table_hw_info *info)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range top_range = pt_top_range(common);
+
+ pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
+MODULE_IMPORT_NS("GENERIC_PT");
+/* For iommu_dirty_bitmap_record() */
+MODULE_IMPORT_NS("IOMMUFD");
+
+#endif /* __GENERIC_PT_IOMMU_PT_H */
diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h
new file mode 100644
index 000000000000..68278bf15cfe
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_generic_pt.h
@@ -0,0 +1,823 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Test the format API directly.
+ *
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ int ret;
+
+ KUNIT_ASSERT_EQ(test, len, (size_t)len);
+
+ ret = iommu_map(&priv->domain, va, pa, len, IOMMU_READ | IOMMU_WRITE,
+ GFP_KERNEL);
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "map_pages", ret);
+}
+
+#define KUNIT_ASSERT_PT_LOAD(test, pts, entry) \
+ ({ \
+ pt_load_entry(pts); \
+ KUNIT_ASSERT_EQ(test, (pts)->type, entry); \
+ })
+
+struct check_levels_arg {
+ struct kunit *test;
+ void *fn_arg;
+ void (*fn)(struct kunit *test, struct pt_state *pts, void *arg);
+};
+
+static int __check_all_levels(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct check_levels_arg *chk = arg;
+ struct kunit *test = chk->test;
+ int ret;
+
+ _pt_iter_first(&pts);
+
+ /*
+ * If we were able to use the full VA space this should always be the
+ * last index in each table.
+ */
+ if (!(IS_32BIT && range->max_vasz_lg2 > 32)) {
+ if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND) &&
+ pts.level == pts.range->top_level)
+ KUNIT_ASSERT_EQ(test, pts.index,
+ log2_to_int(range->max_vasz_lg2 - 1 -
+ pt_table_item_lg2sz(&pts)) -
+ 1);
+ else
+ KUNIT_ASSERT_EQ(test, pts.index,
+ log2_to_int(pt_table_oa_lg2sz(&pts) -
+ pt_table_item_lg2sz(&pts)) -
+ 1);
+ }
+
+ if (pt_can_have_table(&pts)) {
+ pt_load_single_entry(&pts);
+ KUNIT_ASSERT_EQ(test, pts.type, PT_ENTRY_TABLE);
+ ret = pt_descend(&pts, arg, __check_all_levels);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* Index 0 is used by the test */
+ if (IS_32BIT && !pts.index)
+ return 0;
+ KUNIT_ASSERT_NE(chk->test, pts.index, 0);
+ }
+
+ /*
+	 * A format should not create a table with only one entry; at the very
+	 * least, this test approach won't work if it does.
+ */
+ KUNIT_ASSERT_GT(chk->test, pts.end_index, 1);
+
+ /*
+ * For increase top we end up using index 0 for the original top's tree,
+ * so use index 1 for testing instead.
+ */
+ pts.index = 0;
+ pt_index_to_va(&pts);
+ pt_load_single_entry(&pts);
+ if (pts.type == PT_ENTRY_TABLE && pts.end_index > 2) {
+ pts.index = 1;
+ pt_index_to_va(&pts);
+ }
+ (*chk->fn)(chk->test, &pts, chk->fn_arg);
+ return 0;
+}
+
+/*
+ * Call fn for each level in the table with a pts setup to index 0 in a table
+ * for that level. This allows writing tests that run on every level.
+ * The test can use every index in the table except the last one.
+ */
+static void check_all_levels(struct kunit *test,
+ void (*fn)(struct kunit *test,
+ struct pt_state *pts, void *arg),
+ void *fn_arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct check_levels_arg chk = {
+ .test = test,
+ .fn = fn,
+ .fn_arg = fn_arg,
+ };
+ int ret;
+
+ if (pt_feature(priv->common, PT_FEAT_DYNAMIC_TOP) &&
+ priv->common->max_vasz_lg2 > range.max_vasz_lg2)
+ range.last_va = fvalog2_set_mod_max(range.va,
+ priv->common->max_vasz_lg2);
+
+ /*
+ * Map a page at the highest VA, this will populate all the levels so we
+ * can then iterate over them. Index 0 will be used for testing.
+ */
+ if (IS_32BIT && range.max_vasz_lg2 > 32)
+ range.last_va = (u32)range.last_va;
+ range.va = range.last_va - (priv->smallest_pgsz - 1);
+ do_map(test, range.va, 0, priv->smallest_pgsz);
+
+ range = pt_make_range(priv->common, range.va, range.last_va);
+ ret = pt_walk_range(&range, __check_all_levels, &chk);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+}
+
+static void test_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ /* Fixture does the setup */
+ KUNIT_ASSERT_NE(test, priv->info.pgsize_bitmap, 0);
+}
+
+/*
+ * Basic check that the log2_* functions are working, especially at the integer
+ * limits.
+ */
+static void test_bitops(struct kunit *test)
+{
+ int i;
+
+ KUNIT_ASSERT_EQ(test, fls_t(u32, 0), 0);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, 1), 1);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, BIT(2)), 3);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, U32_MAX), 32);
+
+ KUNIT_ASSERT_EQ(test, fls_t(u64, 0), 0);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, 1), 1);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, BIT(2)), 3);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, U64_MAX), 64);
+
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, 1), 0);
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(2)), 2);
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(31)), 31);
+
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, 1), 0);
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT(2)), 2);
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT_ULL(63)), 63);
+
+ for (i = 0; i != 31; i++)
+		KUNIT_ASSERT_EQ(test, ffz_t(u32, BIT(i) - 1), i);
+
+ for (i = 0; i != 63; i++)
+ KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i);
+
+ for (i = 0; i != 32; i++) {
+ u64 val = get_random_u64();
+
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffs_t(u32, val)), 0);
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffs_t(u64, val)), 0);
+
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffz_t(u32, val)),
+ log2_to_max_int_t(u32, ffz_t(u32, val)));
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffz_t(u64, val)),
+ log2_to_max_int_t(u64, ffz_t(u64, val)));
+ }
+}
+
+static unsigned int ref_best_pgsize(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va,
+ pt_vaddr_t last_va, pt_oaddr_t oa)
+{
+ pt_vaddr_t pgsz_lg2;
+
+ /* Brute force the constraints described in pt_compute_best_pgsize() */
+ for (pgsz_lg2 = PT_VADDR_MAX_LG2 - 1; pgsz_lg2 != 0; pgsz_lg2--) {
+ if ((pgsz_bitmap & log2_to_int(pgsz_lg2)) &&
+ log2_mod(va, pgsz_lg2) == 0 &&
+ oalog2_mod(oa, pgsz_lg2) == 0 &&
+ va + log2_to_int(pgsz_lg2) - 1 <= last_va &&
+ log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2) &&
+ oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2))
+ return pgsz_lg2;
+ }
+ return 0;
+}
+
+/* Check that the bit logic in pt_compute_best_pgsize() works. */
+static void test_best_pgsize(struct kunit *test)
+{
+ unsigned int a_lg2;
+ unsigned int b_lg2;
+ unsigned int c_lg2;
+
+ /* Try random prefixes with every suffix combination */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+
+ /* 0 prefix, every suffix */
+ for (c_lg2 = 1; c_lg2 != PT_VADDR_MAX_LG2 - 1; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = 0;
+ pt_oaddr_t oa = 0;
+ pt_vaddr_t last_va = log2_set_mod_max(0, c_lg2);
+
+ KUNIT_ASSERT_EQ(test,
+ pt_compute_best_pgsize(pgsz_bitmap, va, last_va,
+ oa),
+ ref_best_pgsize(pgsz_bitmap, va, last_va, oa));
+ }
+
+ /* 1's prefix, every suffix */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = PT_VADDR_MAX << a_lg2;
+ pt_oaddr_t oa = PT_VADDR_MAX << b_lg2;
+ pt_vaddr_t last_va = PT_VADDR_MAX;
+
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+
+ /* pgsize_bitmap is always 0 */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = 0;
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ 0);
+ }
+ }
+ }
+
+ if (sizeof(pt_vaddr_t) <= 4)
+ return;
+
+ /* over 32 bit page sizes */
+ for (a_lg2 = 32; a_lg2 != 42; a_lg2++) {
+ for (b_lg2 = 32; b_lg2 != 42; b_lg2++) {
+ for (c_lg2 = 32; c_lg2 != 42; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+}
+
+/*
+ * Check that pt_install_table() and pt_table_pa() match
+ */
+static void test_lvl_table_ptr(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ struct pt_write_attrs attrs = {};
+
+ if (!pt_can_have_table(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ pt_load_single_entry(pts);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ /* A second install should pass because install updates pts->entry. */
+ KUNIT_ASSERT_EQ(test, pt_install_table(pts, paddr, &attrs), true);
+
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_TABLE);
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entries(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static void test_table_ptr(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_table_ptr, NULL);
+}
+
+struct lvl_radix_arg {
+ pt_vaddr_t vbits;
+};
+
+/*
+ * Check pt_table_oa_lg2sz() and pt_table_item_lg2sz(); together they need to
+ * decode a continuous run of VA bits across all the levels that covers the
+ * entire advertised VA space.
+ */
+static void test_lvl_radix(struct kunit *test, struct pt_state *pts, void *arg)
+{
+ unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct lvl_radix_arg *radix = arg;
+
+ /* Every bit below us is decoded */
+ KUNIT_ASSERT_EQ(test, log2_set_mod_max(0, isz_lg2), radix->vbits);
+
+ /* We are not decoding bits someone else is */
+ KUNIT_ASSERT_EQ(test, log2_div(radix->vbits, isz_lg2), 0);
+
+ /* Can't decode past the pt_vaddr_t size */
+ KUNIT_ASSERT_LE(test, table_lg2sz, PT_VADDR_MAX_LG2);
+ KUNIT_ASSERT_EQ(test, fvalog2_div(table_lg2sz, PT_MAX_VA_ADDRESS_LG2),
+ 0);
+
+ radix->vbits = fvalog2_set_mod_max(0, table_lg2sz);
+}
+
+static void test_max_va(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+
+ KUNIT_ASSERT_GE(test, priv->common->max_vasz_lg2, range.max_vasz_lg2);
+}
+
+static void test_table_radix(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct lvl_radix_arg radix = { .vbits = priv->smallest_pgsz - 1 };
+ struct pt_range range;
+
+ check_all_levels(test, test_lvl_radix, &radix);
+
+ range = pt_top_range(priv->common);
+ if (range.max_vasz_lg2 == PT_VADDR_MAX_LG2) {
+ KUNIT_ASSERT_EQ(test, radix.vbits, PT_VADDR_MAX);
+ } else {
+ if (!IS_32BIT)
+ KUNIT_ASSERT_EQ(test,
+ log2_set_mod_max(0, range.max_vasz_lg2),
+ radix.vbits);
+ KUNIT_ASSERT_EQ(test, log2_div(radix.vbits, range.max_vasz_lg2),
+ 0);
+ }
+}
+
+static unsigned int safe_pt_num_items_lg2(const struct pt_state *pts)
+{
+ struct pt_range top_range = pt_top_range(pts->range->common);
+ struct pt_state top_pts = pt_init_top(&top_range);
+
+ /*
+ * Avoid calling pt_num_items_lg2() on the top, instead we can derive
+ * the size of the top table from the top range.
+ */
+ if (pts->level == top_range.top_level)
+ return ilog2(pt_range_to_end_index(&top_pts));
+ return pt_num_items_lg2(pts);
+}
+
+static void test_lvl_possible_sizes(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ unsigned int num_items_lg2 = safe_pt_num_items_lg2(pts);
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!pt_can_have_leaf(pts)) {
+ KUNIT_ASSERT_EQ(test, pgsize_bitmap, 0);
+ return;
+ }
+
+ /* No bits for sizes that would be outside this table */
+ KUNIT_ASSERT_EQ(test, log2_mod(pgsize_bitmap, isz_lg2), 0);
+ KUNIT_ASSERT_EQ(
+ test, fvalog2_div(pgsize_bitmap, num_items_lg2 + isz_lg2), 0);
+
+ /*
+	 * Non-contiguous entries must be supported. AMDv1 has a HW bug where
+	 * it does not support them on one of the levels.
+ */
+ if ((u64)pgsize_bitmap != 0xff0000000000ULL ||
+ strcmp(__stringify(PTPFX_RAW), "amdv1") != 0)
+ KUNIT_ASSERT_TRUE(test, pgsize_bitmap & log2_to_int(isz_lg2));
+ else
+ KUNIT_ASSERT_NE(test, pgsize_bitmap, 0);
+
+ /* A contiguous entry should not span the whole table */
+ if (num_items_lg2 + isz_lg2 != PT_VADDR_MAX_LG2)
+ KUNIT_ASSERT_FALSE(
+ test,
+ pgsize_bitmap & log2_to_int(num_items_lg2 + isz_lg2));
+}
+
+static void test_entry_possible_sizes(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_possible_sizes, NULL);
+}
+
+static void sweep_all_pgsizes(struct kunit *test, struct pt_state *pts,
+ struct pt_write_attrs *attrs,
+ pt_oaddr_t test_oaddr)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ unsigned int len_lg2;
+
+ if (pts->index != 0)
+ return;
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ struct pt_state sub_pts = *pts;
+ pt_oaddr_t oaddr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ oaddr = log2_set_mod(test_oaddr, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, attrs);
+ /* Verify that every contiguous item translates correctly */
+ for (sub_pts.index = 0;
+ sub_pts.index != log2_to_int(len_lg2 - isz_lg2);
+ sub_pts.index++) {
+ KUNIT_ASSERT_PT_LOAD(test, &sub_pts, PT_ENTRY_OA);
+ KUNIT_ASSERT_EQ(test, pt_item_oa(&sub_pts),
+ oaddr + sub_pts.index *
+ oalog2_mul(1, isz_lg2));
+ KUNIT_ASSERT_EQ(test, pt_entry_oa(&sub_pts), oaddr);
+ KUNIT_ASSERT_EQ(test, pt_entry_num_contig_lg2(&sub_pts),
+ len_lg2 - isz_lg2);
+ }
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+/*
+ * Check that pt_install_leaf_entry() and pt_entry_oa() match.
+ * Check that pt_clear_entries() works.
+ */
+static void test_lvl_entry_oa(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ unsigned int max_oa_lg2 = pts->range->common->max_oasz_lg2;
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ sweep_all_pgsizes(test, pts, &attrs, priv->test_oa);
+
+ /* Check that the table can store the boundary OAs */
+ sweep_all_pgsizes(test, pts, &attrs, 0);
+ if (max_oa_lg2 == PT_OADDR_MAX_LG2)
+ sweep_all_pgsizes(test, pts, &attrs, PT_OADDR_MAX);
+ else
+ sweep_all_pgsizes(test, pts, &attrs,
+ oalog2_to_max_int(max_oa_lg2));
+}
+
+static void test_entry_oa(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_entry_oa, NULL);
+}
+
+/* Test pt_attr_from_entry() */
+static void test_lvl_attr_from_entry(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct kunit_iommu_priv *priv = test->priv;
+ unsigned int len_lg2;
+ unsigned int prot;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+ for (prot = 0; prot <= (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE |
+ IOMMU_NOEXEC | IOMMU_MMIO);
+ prot++) {
+ pt_oaddr_t oaddr;
+ struct pt_write_attrs attrs = {};
+ u64 good_entry;
+
+			/*
+			 * If the format doesn't support this combination of
+			 * prot bits, skip it.
+			 */
+ if (pt_iommu_set_prot(pts->range->common, &attrs,
+ prot)) {
+ /* But RW has to be supported */
+ KUNIT_ASSERT_NE(test, prot,
+ IOMMU_READ | IOMMU_WRITE);
+ continue;
+ }
+
+ oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ good_entry = pts->entry;
+
+ memset(&attrs, 0, sizeof(attrs));
+ pt_attr_from_entry(pts, &attrs);
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+			/*
+			 * The descriptor produced by pt_attr_from_entry()
+			 * must produce an identical entry value when
+			 * re-written.
+			 */
+ KUNIT_ASSERT_EQ(test, good_entry, pts->entry);
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ }
+ }
+}
+
+static void test_attr_from_entry(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_attr_from_entry, NULL);
+}
+
+static void test_lvl_dirty(struct kunit *test, struct pt_state *pts, void *arg)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct kunit_iommu_priv *priv = test->priv;
+ unsigned int start_idx = pts->index;
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ | IOMMU_WRITE));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
+ pt_oaddr_t oaddr;
+ unsigned int i;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ pt_load_entry(pts);
+ pt_entry_make_write_clean(pts);
+ pt_load_entry(pts);
+ KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
+
+ for (i = 0; i != log2_to_int(len_lg2 - isz_lg2); i++) {
+			/* dirty every item of the contiguous entry */
+ pts->index = start_idx + i;
+ pt_load_entry(pts);
+ KUNIT_ASSERT_TRUE(test, pt_entry_make_write_dirty(pts));
+ pts->index = start_idx;
+ pt_load_entry(pts);
+ KUNIT_ASSERT_TRUE(test, pt_entry_is_write_dirty(pts));
+
+ pt_entry_make_write_clean(pts);
+ pt_load_entry(pts);
+ KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
+ }
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ }
+}
+
+static __maybe_unused void test_dirty(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!pt_dirty_supported(priv->common))
+ kunit_skip(test,
+ "Page table features do not support dirty tracking");
+
+ check_all_levels(test, test_lvl_dirty, NULL);
+}
+
+static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ struct pt_write_attrs new_attrs = {};
+ unsigned int bitnr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ pt_install_leaf_entry(pts, paddr, len_lg2, &attrs);
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++) {
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr);
+
+ /* SW bits didn't leak into the attrs */
+ pt_attr_from_entry(pts, &new_attrs);
+ KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs));
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+static __maybe_unused void test_sw_bit_leaf(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_leaf, NULL);
+}
+
+static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ unsigned int bitnr;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) {
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entries(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static __maybe_unused void test_sw_bit_table(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_table, NULL);
+}
+
+static struct kunit_case generic_pt_test_cases[] = {
+ KUNIT_CASE_FMT(test_init),
+ KUNIT_CASE_FMT(test_bitops),
+ KUNIT_CASE_FMT(test_best_pgsize),
+ KUNIT_CASE_FMT(test_table_ptr),
+ KUNIT_CASE_FMT(test_max_va),
+ KUNIT_CASE_FMT(test_table_radix),
+ KUNIT_CASE_FMT(test_entry_possible_sizes),
+ KUNIT_CASE_FMT(test_entry_oa),
+ KUNIT_CASE_FMT(test_attr_from_entry),
+#ifdef pt_entry_is_write_dirty
+ KUNIT_CASE_FMT(test_dirty),
+#endif
+#ifdef pt_sw_bit
+ KUNIT_CASE_FMT(test_sw_bit_leaf),
+ KUNIT_CASE_FMT(test_sw_bit_table),
+#endif
+ {},
+};
+
+static int pt_kunit_generic_pt_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv;
+ int ret;
+
+ priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ ret = pt_kunit_priv_init(test, priv);
+ if (ret) {
+ kunit_kfree(test, priv);
+ return ret;
+ }
+ test->priv = priv;
+ return 0;
+}
+
+static void pt_kunit_generic_pt_exit(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!test->priv)
+ return;
+
+ pt_iommu_deinit(priv->iommu);
+ kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(generic_pt_suite) = {
+ .name = __stringify(NS(fmt_test)),
+ .init = pt_kunit_generic_pt_init,
+ .exit = pt_kunit_generic_pt_exit,
+ .test_cases = generic_pt_test_cases,
+};
+kunit_test_suites(&NS(generic_pt_suite));
diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h
new file mode 100644
index 000000000000..22c9e4c4dd97
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_KUNIT_IOMMU_H
+#define __GENERIC_PT_KUNIT_IOMMU_H
+
+#define GENERIC_PT_KUNIT 1
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include "../iommu-pages.h"
+#include "pt_iter.h"
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+ const struct pt_iommu_table_cfg *cfg, gfp_t gfp);
+
+/* The format can provide a list of configurations it would like to test */
+#ifdef kunit_fmt_cfgs
+static const void *kunit_pt_gen_params_cfg(struct kunit *test, const void *prev,
+ char *desc)
+{
+ uintptr_t cfg_id = (uintptr_t)prev;
+
+ cfg_id++;
+ if (cfg_id >= ARRAY_SIZE(kunit_fmt_cfgs) + 1)
+ return NULL;
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s_cfg_%u",
+ __stringify(PTPFX_RAW), (unsigned int)(cfg_id - 1));
+ return (void *)cfg_id;
+}
+#define KUNIT_CASE_FMT(test_name) \
+ KUNIT_CASE_PARAM(test_name, kunit_pt_gen_params_cfg)
+#else
+#define KUNIT_CASE_FMT(test_name) KUNIT_CASE(test_name)
+#endif
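+
+/*
+ * Illustrative sketch only (not part of this API): a format that wants to
+ * exercise multiple configurations could define something like the following
+ * before including this header. The field values shown are hypothetical.
+ *
+ *	static const struct pt_iommu_table_cfg kunit_fmt_cfgs[] = {
+ *		{ .common = { .hw_max_vasz_lg2 = 48, .hw_max_oasz_lg2 = 48 } },
+ *		{ .common = { .hw_max_vasz_lg2 = 57, .hw_max_oasz_lg2 = 52 } },
+ *	};
+ *	#define KUNIT_FMT_FEATURES 0
+ */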
+
+#define KUNIT_ASSERT_NO_ERRNO(test, ret) \
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, KUNIT_SUBSUBTEST_INDENT "errno %pe", \
+ ERR_PTR(ret))
+
+#define KUNIT_ASSERT_NO_ERRNO_FN(test, fn, ret) \
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, \
+ KUNIT_SUBSUBTEST_INDENT "errno %pe from %s", \
+ ERR_PTR(ret), fn)
+
+/*
+ * When the test is run on a 32 bit system unsigned long can be 32 bits. This
+ * causes the iommu op signatures to be restricted to 32 bits, meaning the test
+ * has to be mindful not to create any VAs over the 32 bit limit. Reduce the
+ * scope of the testing; the main purpose of checking on full 32 bit is to
+ * look for 32-bitisms in the core code. Run the test on i386 with X86_PAE=y to
+ * get full coverage when dma_addr_t & phys_addr_t are 8 bytes.
+ */
+#define IS_32BIT (sizeof(unsigned long) == 4)
+
+struct kunit_iommu_priv {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu_table fmt_table;
+ };
+ spinlock_t top_lock;
+ struct device *dummy_dev;
+ struct pt_iommu *iommu;
+ struct pt_common *common;
+ struct pt_iommu_table_cfg cfg;
+ struct pt_iommu_info info;
+ unsigned int smallest_pgsz_lg2;
+ pt_vaddr_t smallest_pgsz;
+ unsigned int largest_pgsz_lg2;
+ pt_oaddr_t test_oa;
+ pt_vaddr_t safe_pgsize_bitmap;
+ unsigned long orig_nr_secondary_pagetable;
+
+};
+PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain);
+
+static void pt_kunit_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ iommu_put_pages_list(&gather->freelist);
+}
+
+#define IOMMU_PT_DOMAIN_OPS1(x) IOMMU_PT_DOMAIN_OPS(x)
+static const struct iommu_domain_ops kunit_pt_ops = {
+ IOMMU_PT_DOMAIN_OPS1(PTPFX_RAW),
+ .iotlb_sync = &pt_kunit_iotlb_sync,
+};
+
+static void pt_kunit_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level)
+{
+}
+
+static spinlock_t *pt_kunit_get_top_lock(struct pt_iommu *iommu_table)
+{
+ struct kunit_iommu_priv *priv = container_of(
+ iommu_table, struct kunit_iommu_priv, fmt_table.iommu);
+
+ return &priv->top_lock;
+}
+
+static const struct pt_iommu_driver_ops pt_kunit_driver_ops = {
+ .change_top = &pt_kunit_change_top,
+ .get_top_lock = &pt_kunit_get_top_lock,
+};
+
+static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv)
+{
+ unsigned int va_lg2sz;
+ int ret;
+
+ /* Enough so the memory allocator works */
+ priv->dummy_dev = kunit_device_register(test, "pt_kunit_dev");
+ if (IS_ERR(priv->dummy_dev))
+ return PTR_ERR(priv->dummy_dev);
+ set_dev_node(priv->dummy_dev, NUMA_NO_NODE);
+
+ spin_lock_init(&priv->top_lock);
+
+#ifdef kunit_fmt_cfgs
+ priv->cfg = kunit_fmt_cfgs[((uintptr_t)test->param_value) - 1];
+	/*
+	 * The format can set a list of features that the kunit_fmt_cfgs
+	 * controls; other features default to on.
+	 */
+ priv->cfg.common.features |= PT_SUPPORTED_FEATURES &
+ (~KUNIT_FMT_FEATURES);
+#else
+ priv->cfg.common.features = PT_SUPPORTED_FEATURES;
+#endif
+
+ /* Defaults, for the kunit */
+ if (!priv->cfg.common.hw_max_vasz_lg2)
+ priv->cfg.common.hw_max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+ if (!priv->cfg.common.hw_max_oasz_lg2)
+ priv->cfg.common.hw_max_oasz_lg2 = pt_max_oa_lg2(NULL);
+
+ priv->fmt_table.iommu.nid = NUMA_NO_NODE;
+ priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops;
+ priv->fmt_table.iommu.iommu_device = priv->dummy_dev;
+ priv->domain.ops = &kunit_pt_ops;
+ ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL);
+ if (ret) {
+ if (ret == -EOVERFLOW)
+ kunit_skip(
+ test,
+ "This configuration cannot be tested on 32 bit");
+ return ret;
+ }
+
+ priv->iommu = &priv->fmt_table.iommu;
+ priv->common = common_from_iommu(&priv->fmt_table.iommu);
+ priv->iommu->ops->get_info(priv->iommu, &priv->info);
+
+	/*
+	 * size_t is used to pass the mapping length and it can be 32 bits;
+	 * truncate the page sizes so we don't use large sizes.
+	 */
+ priv->info.pgsize_bitmap = (size_t)priv->info.pgsize_bitmap;
+
+ priv->smallest_pgsz_lg2 = vaffs(priv->info.pgsize_bitmap);
+ priv->smallest_pgsz = log2_to_int(priv->smallest_pgsz_lg2);
+ priv->largest_pgsz_lg2 =
+ vafls((dma_addr_t)priv->info.pgsize_bitmap) - 1;
+
+ priv->test_oa =
+ oalog2_mod(0x74a71445deadbeef, priv->common->max_oasz_lg2);
+
+	/*
+	 * We run out of VA space if the mappings get too big; make something
+	 * smaller that can safely pass through the dma_addr_t API.
+	 */
+ va_lg2sz = priv->common->max_vasz_lg2;
+ if (IS_32BIT && va_lg2sz > 32)
+ va_lg2sz = 32;
+ priv->safe_pgsize_bitmap =
+ log2_mod(priv->info.pgsize_bitmap, va_lg2sz - 1);
+
+ return 0;
+}
+
+#endif
diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h
new file mode 100644
index 000000000000..e8a63c8ea850
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len);
+
+struct count_valids {
+ u64 per_size[PT_VADDR_MAX_LG2];
+};
+
+static int __count_valids(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct count_valids *valids = arg;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ pt_descend(&pts, arg, __count_valids);
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA) {
+ valids->per_size[pt_entry_oa_lg2sz(&pts)]++;
+ continue;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Number of valid table entries. This counts a contiguous entry as a single
+ * valid entry.
+ */
+static unsigned int count_valids(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ u64 total = 0;
+ unsigned int i;
+
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+
+ for (i = 0; i != ARRAY_SIZE(valids.per_size); i++)
+ total += valids.per_size[i];
+ return total;
+}
+
+/* Only a single page size is present, count the number of valid entries */
+static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ u64 total = 0;
+ unsigned int i;
+
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+
+ for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) {
+ if ((1ULL << i) == pgsz)
+ total = valids.per_size[i];
+ else
+ KUNIT_ASSERT_EQ(test, valids.per_size[i], 0);
+ }
+ return total;
+}
+
+static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ size_t ret;
+
+ ret = iommu_unmap(&priv->domain, va, len);
+ KUNIT_ASSERT_EQ(test, ret, len);
+}
+
+static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2);
+ pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2);
+
+ for (; pfn != end_pfn; pfn++) {
+ phys_addr_t res = iommu_iova_to_phys(&priv->domain,
+ pfn * priv->smallest_pgsz);
+
+ KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa);
+ if (res != pa)
+ break;
+ pa += priv->smallest_pgsz;
+ }
+}
+
+static void test_increase_level(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_common *common = priv->common;
+
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format");
+
+ if (IS_32BIT)
+ kunit_skip(test, "Unable to test on 32bit");
+
+ KUNIT_ASSERT_GT(test, common->max_vasz_lg2,
+ pt_top_range(common).max_vasz_lg2);
+
+ /* Add every possible level to the max */
+ while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) {
+ struct pt_range top_range = pt_top_range(common);
+
+ if (top_range.va == 0)
+ do_map(test, top_range.last_va + 1, 0,
+ priv->smallest_pgsz);
+ else
+ do_map(test, top_range.va - priv->smallest_pgsz, 0,
+ priv->smallest_pgsz);
+
+ KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level,
+ top_range.top_level + 1);
+ KUNIT_ASSERT_GE(test, common->max_vasz_lg2,
+ pt_top_range(common).max_vasz_lg2);
+ }
+}
+
+static void test_map_simple(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+ unsigned int pgsz_lg2;
+ pt_vaddr_t cur_va;
+
+ /* Map every reported page size */
+ cur_va = range.va + priv->smallest_pgsz * 256;
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+ u64 len = log2_to_int(pgsz_lg2);
+
+ if (!(pgsize_bitmap & len))
+ continue;
+
+ cur_va = ALIGN(cur_va, len);
+ do_map(test, cur_va, paddr, len);
+ if (len <= SZ_2G)
+ check_iova(test, cur_va, paddr, len);
+ cur_va += len;
+ }
+
+ /* The read interface reports that every page size was created */
+ range = pt_top_range(priv->common);
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ if (pgsize_bitmap & (1ULL << pgsz_lg2))
+ KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1);
+ else
+ KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0);
+ }
+
+ /* Unmap works */
+ range = pt_top_range(priv->common);
+ cur_va = range.va + priv->smallest_pgsz * 256;
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ u64 len = log2_to_int(pgsz_lg2);
+
+ if (!(pgsize_bitmap & len))
+ continue;
+ cur_va = ALIGN(cur_va, len);
+ do_unmap(test, cur_va, len);
+ cur_va += len;
+ }
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+}
+
+/*
+ * Test to convert a table pointer into an OA by mapping something small,
+ * unmapping it so as to leave behind a table pointer, then mapping something
+ * larger that will convert the table into an OA.
+ */
+static void test_map_table_to_oa(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t limited_pgbitmap =
+ priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G);
+ struct pt_range range = pt_top_range(priv->common);
+ unsigned int pgsz_lg2;
+ pt_vaddr_t max_pgsize;
+ pt_vaddr_t cur_va;
+
+ max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1);
+ KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize);
+
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+ u64 len = log2_to_int(pgsz_lg2);
+ pt_vaddr_t offset;
+
+ if (!(priv->info.pgsize_bitmap & len))
+ continue;
+ if (len > max_pgsize)
+ break;
+
+ cur_va = ALIGN(range.va + priv->smallest_pgsz * 256,
+ max_pgsize);
+ for (offset = 0; offset != max_pgsize; offset += len)
+ do_map(test, cur_va + offset, paddr + offset, len);
+ check_iova(test, cur_va, paddr, max_pgsize);
+ KUNIT_ASSERT_EQ(test, count_valids_single(test, len),
+ log2_div(max_pgsize, pgsz_lg2));
+
+ if (len == max_pgsize) {
+ do_unmap(test, cur_va, max_pgsize);
+ } else {
+ do_unmap(test, cur_va, max_pgsize / 2);
+ for (offset = max_pgsize / 2; offset != max_pgsize;
+ offset += len)
+ do_unmap(test, cur_va + offset, len);
+ }
+
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+ }
+}
+
+/*
+ * Test unmapping a small page at the start of a large page. This always unmaps
+ * the large page.
+ */
+static void test_unmap_split(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+ unsigned int pgsz_lg2;
+ unsigned int count = 0;
+
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_vaddr_t base_len = log2_to_int(pgsz_lg2);
+ unsigned int next_pgsz_lg2;
+
+ if (!(pgsize_bitmap & base_len))
+ continue;
+
+ for (next_pgsz_lg2 = pgsz_lg2 + 1;
+ next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) {
+ pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2);
+ pt_vaddr_t vaddr = top_range.va;
+ pt_oaddr_t paddr = 0;
+ size_t gnmapped;
+
+ if (!(pgsize_bitmap & next_len))
+ continue;
+
+ do_map(test, vaddr, paddr, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+ /* Make sure unmap doesn't keep going */
+ do_map(test, vaddr, paddr, next_len);
+ do_map(test, vaddr + next_len, paddr, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr + next_len,
+ next_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+ count++;
+ }
+ }
+
+ if (count == 0)
+ kunit_skip(test, "Test needs two page sizes");
+}
+
+static void unmap_collisions(struct kunit *test, struct maple_tree *mt,
+ pt_vaddr_t start, pt_vaddr_t last)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ MA_STATE(mas, mt, start, last);
+ void *entry;
+
+ mtree_lock(mt);
+ mas_for_each(&mas, entry, last) {
+ pt_vaddr_t mas_start = mas.index;
+ pt_vaddr_t len = (mas.last - mas_start) + 1;
+ pt_oaddr_t paddr;
+
+ mas_erase(&mas);
+ mas_pause(&mas);
+ mtree_unlock(mt);
+
+ paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2);
+ check_iova(test, mas_start, paddr, len);
+ do_unmap(test, mas_start, len);
+ mtree_lock(mt);
+ }
+ mtree_unlock(mt);
+}
+
+static void clamp_range(struct kunit *test, struct pt_range *range)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (range->last_va - range->va > SZ_1G)
+ range->last_va = range->va + SZ_1G;
+ KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX);
+ if (range->va <= MAPLE_RESERVED_RANGE)
+ range->va =
+ ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz);
+}
+
+/*
+ * Randomly map and unmap ranges that can use large physical pages. If a random
+ * range overlaps with existing ranges then unmap them. This hits all the
+ * special cases.
+ */
+static void test_random_map(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range upper_range = pt_upper_range(priv->common);
+ struct pt_range top_range = pt_top_range(priv->common);
+ struct maple_tree mt;
+ unsigned int iter;
+
+ mt_init(&mt);
+
+ /*
+ * Shrink the range so randomization is more likely to have
+ * intersections
+ */
+ clamp_range(test, &top_range);
+ clamp_range(test, &upper_range);
+
+ for (iter = 0; iter != 1000; iter++) {
+ struct pt_range *range = &top_range;
+ pt_oaddr_t paddr;
+ pt_vaddr_t start;
+ pt_vaddr_t end;
+ int ret;
+
+ if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) &&
+ ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1))
+ range = &upper_range;
+
+ start = get_random_u32_below(
+ min(U32_MAX, range->last_va - range->va));
+ end = get_random_u32_below(
+ min(U32_MAX, range->last_va - start));
+
+ start = ALIGN_DOWN(start, priv->smallest_pgsz);
+ end = ALIGN(end, priv->smallest_pgsz);
+ start += range->va;
+ end += start;
+ if (start < range->va || end > range->last_va + 1 ||
+ start >= end)
+ continue;
+
+ /* Try overmapping to test the failure handling */
+ paddr = oalog2_mod(start, priv->common->max_oasz_lg2);
+ ret = iommu_map(&priv->domain, start, paddr, end - start,
+ IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
+ if (ret) {
+ KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE);
+ unmap_collisions(test, &mt, start, end - 1);
+ do_map(test, start, paddr, end - start);
+ }
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range",
+ mtree_insert_range(&mt, start, end - 1,
+ XA_ZERO_ENTRY,
+ GFP_KERNEL));
+
+ check_iova(test, start, paddr, end - start);
+ if (iter % 100)
+ cond_resched();
+ }
+
+ unmap_collisions(test, &mt, 0, PT_VADDR_MAX);
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+
+ mtree_destroy(&mt);
+}
+
+/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */
+static void test_pgsize_boundary(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+
+ if (top_range.va != 0 || top_range.last_va < 0xfef9ffff ||
+ priv->smallest_pgsz != SZ_4K)
+ kunit_skip(test, "Format does not have the required range");
+
+ do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1);
+}
+
+/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */
+static void test_mixed(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ u64 start = 0x3fe400ULL << 12;
+ u64 end = 0x4c0600ULL << 12;
+ pt_vaddr_t len = end - start;
+ pt_oaddr_t oa = start;
+
+ if (top_range.last_va <= start || sizeof(unsigned long) == 4)
+ kunit_skip(test, "range is too small");
+ if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21)))
+ kunit_skip(test, "incompatible psize");
+
+ do_map(test, start, oa, len);
+ /* 14 2M, 3 1G, 3 2M */
+ KUNIT_ASSERT_EQ(test, count_valids(test), 20);
+ check_iova(test, start, oa, len);
+}
+
+static struct kunit_case iommu_test_cases[] = {
+ KUNIT_CASE_FMT(test_increase_level),
+ KUNIT_CASE_FMT(test_map_simple),
+ KUNIT_CASE_FMT(test_map_table_to_oa),
+ KUNIT_CASE_FMT(test_unmap_split),
+ KUNIT_CASE_FMT(test_random_map),
+ KUNIT_CASE_FMT(test_pgsize_boundary),
+ KUNIT_CASE_FMT(test_mixed),
+ {},
+};
+
+static int pt_kunit_iommu_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv;
+ int ret;
+
+ priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->orig_nr_secondary_pagetable =
+ global_node_page_state(NR_SECONDARY_PAGETABLE);
+ ret = pt_kunit_priv_init(test, priv);
+ if (ret) {
+ kunit_kfree(test, priv);
+ return ret;
+ }
+ test->priv = priv;
+ return 0;
+}
+
+static void pt_kunit_iommu_exit(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!test->priv)
+ return;
+
+ pt_iommu_deinit(priv->iommu);
+	/*
+	 * Look for memory leaks; assumes kunit is running isolated and nothing
+	 * else is using secondary page tables.
+	 */
+ KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable,
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
+ kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(iommu_suite) = {
+ .name = __stringify(NS(iommu_test)),
+ .init = pt_kunit_iommu_init,
+ .exit = pt_kunit_iommu_exit,
+ .test_cases = iommu_test_cases,
+};
+kunit_test_suites(&NS(iommu_suite));
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kunit for generic page table");
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h
new file mode 100644
index 000000000000..e1123d35c907
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_common.h
@@ -0,0 +1,389 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included after the format. It contains definitions
+ * that build on the format definitions to create the basic format API.
+ *
+ * The format API is listed here, with kdocs. The functions without bodies are
+ * implemented in the format using the pattern:
+ * static inline FMTpt_XXX(..) {..}
+ * #define pt_XXX FMTpt_XXX
+ *
+ * If the format doesn't implement a function then pt_fmt_defaults.h can provide
+ * a generic version.
+ *
+ * The routines marked "@pts: Entry to query" operate on the entire contiguous
+ * entry and can be called with a pts->index pointing to any sub item that makes
+ * up that entry.
+ *
+ * The header order is:
+ * pt_defs.h
+ * FMT.h
+ * pt_common.h
+ */
+#ifndef __GENERIC_PT_PT_COMMON_H
+#define __GENERIC_PT_PT_COMMON_H
+
+#include "pt_defs.h"
+#include "pt_fmt_defaults.h"
+
+/**
+ * pt_attr_from_entry() - Convert the permission bits back to attrs
+ * @pts: Entry to convert from
+ * @attrs: Resulting attrs
+ *
+ * Fill in the attrs with the permission bits encoded in the current leaf entry.
+ * The attrs should be usable with pt_install_leaf_entry() to reconstruct the
+ * same entry.
+ */
+static inline void pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs);
+
+/**
+ * pt_can_have_leaf() - True if the current level can have an OA entry
+ * @pts: The current level
+ *
+ * True if the current level can support pt_install_leaf_entry(). A leaf
+ * entry produces an OA.
+ */
+static inline bool pt_can_have_leaf(const struct pt_state *pts);
+
+/**
+ * pt_can_have_table() - True if the current level can have a lower table
+ * @pts: The current level
+ *
+ * Every level except 0 is allowed to have a lower table.
+ */
+static inline bool pt_can_have_table(const struct pt_state *pts)
+{
+ /* No further tables at level 0 */
+ return pts->level > 0;
+}
+
+/**
+ * pt_clear_entries() - Make entries empty (non-present)
+ * @pts: Starting table index
+ * @num_contig_lg2: Number of contiguous items to clear
+ *
+ * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY
+ * and does not have any effect on table walking. The starting index must be
+ * aligned to num_contig_lg2.
+ */
+static inline void pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2);
+
+/**
+ * pt_entry_make_write_dirty() - Make an entry dirty
+ * @pts: Table entry to change
+ *
+ * Make pt_entry_is_write_dirty() return true for this entry. This can be called
+ * asynchronously with any other table manipulation under an RCU lock and must
+ * not corrupt the table.
+ */
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts);
+
+/**
+ * pt_entry_make_write_clean() - Make the entry write clean
+ * @pts: Table entry to change
+ *
+ * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will
+ * eventually be notified of this change via a TLB flush, which is the point
+ * that the HW must become synchronized. Any "write dirty" prior to the TLB
+ * flush can be lost, but once the TLB flush completes all writes must make
+ * their entries write dirty.
+ *
+ * The format should alter the entry in a way that is compatible with any
+ * concurrent update from HW. The entire contiguous entry is changed.
+ */
+static inline void pt_entry_make_write_clean(struct pt_state *pts);
+
+/**
+ * pt_entry_is_write_dirty() - True if the entry has been written to
+ * @pts: Entry to query
+ *
+ * "write dirty" means that the HW has written to the OA translated
+ * by this entry. If the entry is contiguous then the consolidated
+ * "write dirty" for all the items must be returned.
+ */
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts);
+
+/**
+ * pt_dirty_supported() - True if the page table supports dirty tracking
+ * @common: Page table to query
+ */
+static inline bool pt_dirty_supported(struct pt_common *common);
+
+/**
+ * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the number of contiguous items this leaf entry spans. If the entry
+ * is a single item it returns ilog2(1).
+ */
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa() - Output Address for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the output address for the start of the entry. If the entry
+ * is contiguous this returns the same value for each sub-item. I.e.::
+ *
+ * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0
+ *
+ * See pt_item_oa(). The format should implement one of these two functions
+ * depending on how it stores the OAs in the table.
+ */
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa_lg2sz() - Return the size of an OA entry
+ * @pts: Entry to query
+ *
+ * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
+ * it returns the total VA/OA size of the entire contiguous entry.
+ */
+static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
+{
+ return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
+}
+
+/**
+ * pt_entry_oa_exact() - Return the complete OA for an entry
+ * @pts: Entry to query
+ *
+ * During iteration the first entry could have a VA with an offset from the
+ * natural start of the entry. Return the exact OA including the pts's VA
+ * offset.
+ */
+static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts)
+{
+ return _pt_entry_oa_fast(pts) |
+ log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts));
+}
+
+/**
+ * pt_full_va_prefix() - The top bits of the VA
+ * @common: Page table to query
+ *
+ * This is usually 0, but some formats have their VA space going downward from
+ * PT_VADDR_MAX, and will return that instead. This value must always be
+ * adjusted by struct pt_common max_vasz_lg2.
+ */
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common);
+
+/**
+ * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry
+ * @common: Page table to query
+ *
+ * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT).
+ * This is useful to create optimized paths for common cases of PAGE_SIZE
+ * mappings.
+ */
+static inline bool pt_has_system_page_size(const struct pt_common *common);
+
+/**
+ * pt_install_leaf_entry() - Write a leaf entry to the table
+ * @pts: Table index to change
+ * @oa: Output Address for this leaf
+ * @oasz_lg2: Size in VA/OA for this leaf
+ * @attrs: Attributes to modify the entry
+ *
+ * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates
+ * the VA indicated by pts to the given OA.
+ *
+ * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz().
+ * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2.
+ *
+ * This must not be called if pt_can_have_leaf() == false. Contiguous sizes
+ * not indicated by pt_possible_sizes() must not be specified.
+ */
+static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs);
+
+/**
+ * pt_install_table() - Write a table entry to the table
+ * @pts: Table index to change
+ * @table_pa: CPU physical address of the lower table's memory
+ * @attrs: Attributes to modify the table index
+ *
+ * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa
+ * is the table at pts->level - 1. This is done by cmpxchg so pts must have the
+ * current entry loaded. The pts is updated with the installed entry.
+ *
+ * This must not be called if pt_can_have_table() == false.
+ *
+ * Returns: true if the table was installed successfully.
+ */
+static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs);
+
+/**
+ * pt_item_oa() - Output Address for this leaf item
+ * @pts: Item to query
+ *
+ * Return the output address for this item. If the item is part of a contiguous
+ * entry it returns the value of the OA for this individual sub item.
+ *
+ * See pt_entry_oa(). The format should implement one of these two functions
+ * depending on how it stores the OAs in the table.
+ */
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts);
+
+/**
+ * pt_load_entry_raw() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Return the type of entry that was loaded. pts->entry will be filled in with
+ * the entry's content. See pt_load_entry()
+ */
+static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts);
+
+/**
+ * pt_max_oa_lg2() - Return the maximum OA the table format can hold
+ * @common: Page table to query
+ *
+ * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the
+ * OA. This is the absolute maximum address the table can hold. struct pt_common
+ * max_oasz_lg2 sets a lower dynamic maximum based on HW capability.
+ */
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common);
+
+/**
+ * pt_num_items_lg2() - Return the number of items in this table level
+ * @pts: The current level
+ *
+ * The number of items in a table level defines the number of bits this level
+ * decodes from the VA. This function is not called for the top level,
+ * so it does not need to compute a special value for the top case. The
+ * result for the top is based on pt_common max_vasz_lg2.
+ *
+ * The value is used as part of determining the table indexes via the
+ * equation::
+ *
+ * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2())
+ */
+static inline unsigned int pt_num_items_lg2(const struct pt_state *pts);
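+
+/*
+ * Worked example (illustrative, assuming a 4 KiB granule and 512-item
+ * tables): at level 0, pt_table_item_lg2sz() == 12 and pt_num_items_lg2()
+ * == 9, so the level 0 index of a VA is log2_mod(log2_div(VA, 12), 9),
+ * i.e. VA bits 20:12.
+ */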
+
+/**
+ * pt_pgsz_lg2_to_level - Return the level that maps the page size
+ * @common: Page table to query
+ * @pgsize_lg2: Log2 page size
+ *
+ * Returns the table level that will map the given page size. The page
+ * size must be part of the pt_possible_sizes() for some level.
+ */
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+ unsigned int pgsize_lg2);
+
+/**
+ * pt_possible_sizes() - Return a bitmap of possible output sizes at this level
+ * @pts: The current level
+ *
+ * Each level has a list of possible output sizes that can be installed as
+ * leaf entries. If pt_can_have_leaf() is false returns zero.
+ *
+ * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating
+ * that a non-contiguous single item leaf entry is supported. The next
+ * pt_num_items_lg2() bits may be set to indicate which contiguous entry
+ * sizes are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must
+ * not be set; contiguous entries cannot span the entire table.
+ *
+ * The OR of pt_possible_sizes() of all levels is the typical bitmask of all
+ * supported sizes in the entire table.
+ */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts);
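+
+/*
+ * Illustrative example (hypothetical format): a level with 2 MiB items
+ * (pt_table_item_lg2sz() == 21) that also supports 16-item contiguous
+ * entries would return BIT(21) | BIT(25), i.e. 2 MiB single-item leaves
+ * and 32 MiB contiguous leaves.
+ */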
+
+/**
+ * pt_table_item_lg2sz() - Size of a single item entry in this table level
+ * @pts: The current level
+ *
+ * The size of the item specifies how much VA and OA a single item occupies.
+ *
+ * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous
+ * entries.
+ */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+/**
+ * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table
+ * @pts: The current level
+ *
+ * Return the size of VA decoded by the entire table level.
+ */
+static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts)
+{
+ if (pts->range->top_level == pts->level)
+ return pts->range->max_vasz_lg2;
+ return min_t(unsigned int, pts->range->common->max_vasz_lg2,
+ pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts));
+}
+
+/**
+ * pt_table_pa() - Return the CPU physical address of the table entry
+ * @pts: Entry to query
+ *
+ * This is only ever called on PT_ENTRY_TABLE entries. Must return the same
+ * value passed to pt_install_table().
+ */
+static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts);
+
+/**
+ * pt_table_ptr() - Return a CPU pointer for a table item
+ * @pts: Entry to query
+ *
+ * Same as pt_table_pa() but returns a CPU pointer.
+ */
+static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts)
+{
+ return __va(pt_table_pa(pts));
+}
+
+/**
+ * pt_max_sw_bit() - Return the maximum software bit usable for any level and
+ * entry
+ * @common: Page table
+ *
+ * The swbit can be passed as bitnr to the other sw_bit functions.
+ */
+static inline unsigned int pt_max_sw_bit(struct pt_common *common);
+
+/**
+ * pt_test_sw_bit_acquire() - Read a software bit in an item
+ * @pts: Entry to read
+ * @bitnr: Bit to read
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a test bit and acquire operation.
+ */
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr);
+
+/**
+ * pt_set_sw_bit_release() - Set a software bit in an item
+ * @pts: Entry to set
+ * @bitnr: Bit to set
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a set bit and release operation.
+ */
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr);
+
+/**
+ * pt_load_entry() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Set the type of entry that was loaded. pts->entry and pts->table_lower
+ * will be filled in with the entry's content.
+ */
+static inline void pt_load_entry(struct pt_state *pts)
+{
+ pts->type = pt_load_entry_raw(pts);
+ if (pts->type == PT_ENTRY_TABLE)
+ pts->table_lower = pt_table_ptr(pts);
+}
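+
+/*
+ * Typical caller pattern (illustrative sketch only, not a required API):
+ *
+ *	pt_load_entry(pts);
+ *	switch (pts->type) {
+ *	case PT_ENTRY_EMPTY:
+ *		break;		// nothing mapped at this index
+ *	case PT_ENTRY_TABLE:
+ *		break;		// descend via pts->table_lower
+ *	case PT_ENTRY_OA:
+ *		break;		// translate via pt_entry_oa()/pt_item_oa()
+ *	}
+ */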
+#endif
diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h
new file mode 100644
index 000000000000..c25544d72f97
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_defs.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included before the format. It contains definitions
+ * that are required to compile the format. The header order is:
+ * pt_defs.h
+ * fmt_XX.h
+ * pt_common.h
+ */
+#ifndef __GENERIC_PT_DEFS_H
+#define __GENERIC_PT_DEFS_H
+
+#include <linux/generic_pt/common.h>
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/bits.h>
+#include <linux/limits.h>
+#include <linux/bug.h>
+#include <linux/kconfig.h>
+#include "pt_log2.h"
+
+/* Header self-compile default defines */
+#ifndef pt_write_attrs
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+#endif
+
+struct pt_table_p;
+
+enum {
+ PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX,
+ PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32,
+ PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX,
+ PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32,
+};
+
+/*
+ * The format instantiation can have features wired off or on to optimize the
+ * code gen. Supported features are just a reflection of what the current set of
+ * kernel users want to use.
+ */
+#ifndef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES 0
+#endif
+
+/*
+ * When in debug mode we compile all formats with all features. This allows the
+ * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or
+ * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have.
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+enum {
+ PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
+ PT_DEBUG_SUPPORTED_FEATURES =
+ UINT_MAX &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ?
+ 0 :
+ BIT(PT_FEAT_DMA_INCOHERENT))) &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
+ BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
+ BIT(PT_FEAT_SIGN_EXTEND)),
+};
+#undef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES
+#endif
+
+#ifndef PT_FORCE_ENABLED_FEATURES
+#define PT_FORCE_ENABLED_FEATURES 0
+#endif
+
+/**
+ * DOC: Generic Page Table Language
+ *
+ * Language used in Generic Page Table
+ * VA
+ * The input address to the page table, often the virtual address.
+ * OA
+ * The output address from the page table, often the physical address.
+ * leaf
+ * An entry that results in an output address.
+ * start/end
+ * A half-open range, e.g. [0,0) refers to no VA.
+ * start/last
+ * An inclusive closed range, e.g. [0,0] refers to the VA 0
+ * common
+ * The generic page table container struct pt_common
+ * level
+ * Level 0 is always a table of only leaves with no further table pointers.
+ * Increasing levels increase the size of the table items. The least
+ * significant VA bits used to index page tables are used to index the Level
+ * 0 table. The various labels for table levels used by HW descriptions are
+ * not used.
+ * top_level
+ * The inclusive highest level of the table. A two-level table
+ * has a top level of 1.
+ * table
+ * A linear array of translation items for that level.
+ * index
+ * The position in a table of an element: item = table[index]
+ * item
+ * A single index in a table
+ * entry
+ * A single logical element in a table. If contiguous pages are not
+ * supported then item and entry are the same thing, otherwise entry refers
+ * to all the items that comprise a single contiguous translation.
+ * item/entry_size
+ * The number of bytes of VA the table index translates for.
+ * If the item is a table entry then the next table covers
+ * this size. If the entry translates to an output address then the
+ * full OA is: OA | (VA % entry_size)
+ * contig_count
+ * The number of consecutive items fused into a single entry.
+ * item_size * contig_count is the size of that entry's translation.
+ * lg2
+ * Indicates the value is encoded as log2, i.e. 1<<x is the actual value.
+ * Normally the compiler can optimize divide and mod by log2 values
+ * automatically when inlining, but it can't if the values are not constant
+ * expressions. So we do it by hand; we want to avoid 64-bit
+ * divmod.
+ */
+
+/* Returned by pt_load_entry() and for_each_pt_level_entry() */
+enum pt_entry_type {
+ PT_ENTRY_EMPTY,
+ /* Entry is valid and points to a lower table level */
+ PT_ENTRY_TABLE,
+ /* Entry is valid and returns an output address */
+ PT_ENTRY_OA,
+};
+
+struct pt_range {
+ struct pt_common *common;
+ struct pt_table_p *top_table;
+ pt_vaddr_t va;
+ pt_vaddr_t last_va;
+ u8 top_level;
+ u8 max_vasz_lg2;
+};
+
+/*
+ * Similar to xa_state, this records information about an in-progress parse at a
+ * single level.
+ */
+struct pt_state {
+ struct pt_range *range;
+ struct pt_table_p *table;
+ struct pt_table_p *table_lower;
+ u64 entry;
+ enum pt_entry_type type;
+ unsigned short index;
+ unsigned short end_index;
+ u8 level;
+};
+
+#define pt_cur_table(pts, type) ((type *)((pts)->table))
+
+/*
+ * Try to install a new table pointer. The locking methodology requires this to
+ * be atomic (multiple threads can race to install a pointer). The losing
+ * threads will fail the atomic and return false. They should free any memory
+ * and reparse the table level again.
+ */
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry)
+{
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ bool ret;
+
+ /*
+ * Ensure the zero'd table content itself is visible before its PTE can
+ * be. release is a NOP on !SMP, but the HW is still doing an acquire.
+ */
+ if (!IS_ENABLED(CONFIG_SMP))
+ dma_wmb();
+ ret = try_cmpxchg64_release(entryp, &old_entry, table_entry);
+ if (ret)
+ pts->entry = table_entry;
+ return ret;
+}
+#endif
+
+static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry)
+{
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ bool ret;
+
+ /*
+ * Ensure the zero'd table content itself is visible before its PTE can
+ * be. release is a NOP on !SMP, but the HW is still doing an acquire.
+ */
+ if (!IS_ENABLED(CONFIG_SMP))
+ dma_wmb();
+ ret = try_cmpxchg_release(entryp, &old_entry, table_entry);
+ if (ret)
+ pts->entry = table_entry;
+ return ret;
+}
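+
+/*
+ * Hedged usage sketch of the race-retry pattern described above; names such
+ * as new_table_pa and free_unused_table() are illustrative only:
+ *
+ *	pt_load_entry(pts);
+ *	if (pts->type == PT_ENTRY_EMPTY &&
+ *	    !pt_install_table(pts, new_table_pa, &attrs)) {
+ *		// Lost the race: another thread installed a pointer first.
+ *		free_unused_table(new_table_mem);
+ *		pt_load_entry(pts);
+ *	}
+ */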
+
+#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr))
+
+static inline bool pt_feature(const struct pt_common *common,
+ unsigned int feature_nr)
+{
+ if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr))
+ return true;
+ if (!PT_SUPPORTED_FEATURE(feature_nr))
+ return false;
+ return common->features & BIT(feature_nr);
+}
+
+static inline bool pts_feature(const struct pt_state *pts,
+ unsigned int feature_nr)
+{
+ return pt_feature(pts->range->common, feature_nr);
+}
+
+/*
+ * PT_WARN_ON is used for invariants that the kunit should verify can't
+ * happen.
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+#define PT_WARN_ON WARN_ON
+#else
+static inline bool PT_WARN_ON(bool condition)
+{
+ return false;
+}
+#endif
+
+/* These all work on the VA type */
+#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2)
+#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2)
+#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2)
+#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2)
+#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2)
+#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2)
+#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2)
+#define vaffs(a) ffs_t(pt_vaddr_t, a)
+#define vafls(a) fls_t(pt_vaddr_t, a)
+#define vaffz(a) ffz_t(pt_vaddr_t, a)
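+
+/*
+ * For example (illustrative): with b_lg2 == 12, log2_div(a, 12) is a >> 12,
+ * log2_mod(a, 12) is a & 0xfff, and log2_to_int(12) is 0x1000, all evaluated
+ * in the pt_vaddr_t type.
+ */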
+
+/*
+ * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and
+ * generate a useful defined result. The non-fva versions will malfunction at
+ * this extreme.
+ */
+static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return 0;
+ return log2_div_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return a;
+ return log2_mod_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b,
+ unsigned int c_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2)
+ return true;
+ return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val,
+ unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return val;
+ return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return PT_VADDR_MAX;
+ return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2);
+}
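+
+/*
+ * For example (illustrative): when PT_FEAT_FULL_VA is supported,
+ * fvalog2_div(a, PT_VADDR_MAX_LG2) returns 0 and
+ * fvalog2_set_mod_max(0, PT_VADDR_MAX_LG2) returns PT_VADDR_MAX, whereas
+ * the plain log2 helpers would shift by the full width of pt_vaddr_t.
+ */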
+
+/* These all work on the OA type */
+#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2)
+#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2)
+#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2)
+#define oaffs(a) ffs_t(pt_oaddr_t, a)
+#define oafls(a) fls_t(pt_oaddr_t, a)
+#define oaffz(a) ffz_t(pt_oaddr_t, a)
+
+static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem,
+ unsigned int top_level)
+{
+ return top_level | (uintptr_t)table_mem;
+}
+
+static inline void pt_top_set(struct pt_common *common,
+ struct pt_table_p *table_mem,
+ unsigned int top_level)
+{
+ WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level));
+}
+
+static inline void pt_top_set_level(struct pt_common *common,
+ unsigned int top_level)
+{
+ pt_top_set(common, NULL, top_level);
+}
+
+static inline unsigned int pt_top_get_level(const struct pt_common *common)
+{
+ return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS);
+}
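+
+/*
+ * The top level is packed into the low PT_TOP_LEVEL_BITS bits of
+ * top_of_table; table memory is aligned so those bits are otherwise zero.
+ */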
+
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+ pt_oaddr_t oa,
+ unsigned int oasz_lg2);
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h
new file mode 100644
index 000000000000..69fb7c2314ca
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Default definitions for formats that don't define these functions.
+ */
+#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
+#define __GENERIC_PT_PT_FMT_DEFAULTS_H
+
+#include "pt_defs.h"
+#include <linux/log2.h>
+
+/* Header self-compile default defines */
+#ifndef pt_load_entry_raw
+#include "fmt/amdv1.h"
+#endif
+
+/*
+ * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and
+ * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top.
+ */
+#ifndef pt_table_item_lg2sz
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts)
+{
+ return PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level;
+}
+#endif
+
+#ifndef pt_pgsz_lg2_to_level
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+ unsigned int pgsize_lg2)
+{
+ return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) /
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
+}
+#endif
+
+/*
+ * If not supplied by the format then contiguous pages are not supported.
+ *
+ * If contiguous pages are supported then the format must also provide
+ * pt_contig_count_lg2() if it supports a single contiguous size per level,
+ * or pt_possible_sizes() if it supports multiple sizes per level.
+ */
+#ifndef pt_entry_num_contig_lg2
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ return ilog2(1);
+}
+
+/*
+ * Return the log2 of the number of contiguous OA items that form an entry at
+ * this table level
+ */
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts)
+{
+ return ilog2(1);
+}
+#endif
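+
+/*
+ * A format that does support contiguous entries (for example 16 adjacent
+ * items presented as one entry, as some HW contiguous-page hints allow)
+ * would instead supply pt_contig_count_lg2() returning ilog2(16) for the
+ * levels where that applies; the value above is only the no-support default.
+ */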
+
+/* If not supplied by the format then dirty tracking is not supported */
+#ifndef pt_entry_is_write_dirty
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ return false;
+}
+
+static inline void pt_entry_make_write_clean(struct pt_state *pts)
+{
+}
+
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+ return false;
+}
+#else
+/* If pt_dirty_supported() is not supplied then dirty tracking is always supported */
+#ifndef pt_dirty_supported
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+ return true;
+}
+#endif
+#endif
+
+#ifndef pt_entry_make_write_dirty
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ return false;
+}
+#endif
+
+/*
+ * Format supplies either:
+ * pt_entry_oa - OA is at the start of a contiguous entry
+ * or
+ * pt_item_oa - OA is adjusted for every item in a contiguous entry
+ *
+ * Build the missing one
+ *
+ * The internal helper _pt_entry_oa_fast() allows generating an efficient
+ * pt_entry_oa_exact() regardless of which option the format selected.
+ */
+#ifdef pt_entry_oa
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts)
+{
+ return pt_entry_oa(pts) |
+ log2_mul(pts->index, pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_entry_oa
+#endif
+
+#ifdef pt_item_oa
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts)
+{
+ return log2_set_mod(pt_item_oa(pts), 0,
+ pt_entry_num_contig_lg2(pts) +
+ pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_item_oa
+#endif
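+
+/*
+ * Example of the relationship (numbers are illustrative only): for a 16 item
+ * contiguous entry of 4KiB items (num_contig_lg2 == 4, item_lg2sz == 12), an
+ * item reporting pt_item_oa() == 0x213000 belongs to the entry whose
+ * pt_entry_oa() is that value with the low 4 + 12 bits cleared, i.e.
+ * 0x210000.
+ */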
+
+/*
+ * If not supplied by the format then use the constant
+ * PT_MAX_OUTPUT_ADDRESS_LG2.
+ */
+#ifndef pt_max_oa_lg2
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common)
+{
+ return PT_MAX_OUTPUT_ADDRESS_LG2;
+}
+#endif
+
+#ifndef pt_has_system_page_size
+static inline bool pt_has_system_page_size(const struct pt_common *common)
+{
+ return PT_GRANULE_LG2SZ == PAGE_SHIFT;
+}
+#endif
+
+/*
+ * If not supplied by the format then assume only one contiguous size determined
+ * by pt_contig_count_lg2()
+ */
+#ifndef pt_possible_sizes
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts);
+
+/* Return a bitmap of possible leaf page sizes at this level */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!pt_can_have_leaf(pts))
+ return 0;
+ return log2_to_int(isz_lg2) |
+ log2_to_int(pt_contig_count_lg2(pts) + isz_lg2);
+}
+#endif
+
+/* If not supplied by the format then use 0. */
+#ifndef pt_full_va_prefix
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common)
+{
+ return 0;
+}
+#endif
+
+/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */
+#ifndef pt_clear_entries
+static inline void pt_clear_entries64(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries32(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u32 *tablep = pt_cur_table(pts, u32) + pts->index;
+ u32 *end = tablep + log2_to_int(num_contig_lg2);
+
+ PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ if (PT_ITEM_WORD_SIZE == sizeof(u32))
+ pt_clear_entries32(pts, num_contig_lg2);
+ else
+ pt_clear_entries64(pts, num_contig_lg2);
+}
+#define pt_clear_entries pt_clear_entries
+#endif
+
+/* If not supplied then SW bits are not supported */
+#ifdef pt_sw_bit
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ /* Acquire, pairs with pt_set_sw_bit_release() */
+ smp_mb();
+ /* For a contiguous entry the sw bit is only stored in the first item. */
+ return pts->entry & pt_sw_bit(bitnr);
+}
+#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+ if (PT_ITEM_WORD_SIZE == sizeof(u64)) {
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ u64 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ return;
+ }
+#endif
+ if (PT_ITEM_WORD_SIZE == sizeof(u32)) {
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ u32 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ } else
+ BUILD_BUG();
+}
+#define pt_set_sw_bit_release pt_set_sw_bit_release
+#else
+static inline unsigned int pt_max_sw_bit(struct pt_common *common)
+{
+ return 0;
+}
+
+extern void __pt_no_sw_bit(void);
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+ return false;
+}
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+}
+#endif
+
+/*
+ * The format can call this from pt_install_leaf_entry() to check that the
+ * arguments are all aligned correctly.
+ */
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+ pt_oaddr_t oa,
+ unsigned int oasz_lg2)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2)))
+ return false;
+
+#ifdef pt_possible_sizes
+ if (PT_WARN_ON(isz_lg2 > oasz_lg2 ||
+ oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts)))
+ return false;
+#else
+ if (PT_WARN_ON(oasz_lg2 != isz_lg2 &&
+ oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts)))
+ return false;
+#endif
+
+ if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2)))
+ return false;
+ return true;
+}
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h
new file mode 100644
index 000000000000..c0d8617cce29
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_iter.h
@@ -0,0 +1,636 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Iterators for Generic Page Table
+ */
+#ifndef __GENERIC_PT_PT_ITER_H
+#define __GENERIC_PT_PT_ITER_H
+
+#include "pt_common.h"
+
+#include <linux/errno.h>
+
+/*
+ * Used to mangle symbols so that backtraces and the symbol table are
+ * understandable. Any non-inlined function should get mangled like this.
+ */
+#define NS(fn) CONCATENATE(PTPFX, fn)
+
+/**
+ * pt_check_range() - Validate the range can be iterated
+ * @range: Range to validate
+ *
+ * Check that VA and last_va fall within the permitted range of VAs. If the
+ * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension
+ * is correct.
+ */
+static inline int pt_check_range(struct pt_range *range)
+{
+ pt_vaddr_t prefix;
+
+ PT_WARN_ON(!range->max_vasz_lg2);
+
+ if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) {
+ PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2);
+ prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ?
+ PT_VADDR_MAX :
+ 0;
+ } else {
+ prefix = pt_full_va_prefix(range->common);
+ }
+
+ if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) ||
+ !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2))
+ return -ERANGE;
+ return 0;
+}
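+
+/*
+ * Example with PT_FEAT_SIGN_EXTEND and max_vasz_lg2 == 48: bit 47 selects the
+ * prefix, so 0x00007ffffffff000 (prefix 0) and 0xffff800000000000 (prefix all
+ * ones) are both valid, while 0x0001000000000000 is rejected with -ERANGE. A
+ * range whose va and last_va straddle the canonical hole also fails because
+ * the two addresses imply different prefixes.
+ */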
+
+/**
+ * pt_index_to_va() - Update range->va to the current pts->index
+ * @pts: Iteration State
+ *
+ * Adjust range->va to match the current index. This is done in a lazy manner
+ * since computing the VA takes several instructions and is rarely required.
+ */
+static inline void pt_index_to_va(struct pt_state *pts)
+{
+ pt_vaddr_t lower_va;
+
+ lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts));
+ pts->range->va = fvalog2_set_mod(pts->range->va, lower_va,
+ pt_table_oa_lg2sz(pts));
+}
+
+/*
+ * Advance pts's index to the start of the next block of 2^index_count_lg2
+ * items, even if the index is currently in the middle of such a block.
+ */
+static inline void _pt_advance(struct pt_state *pts,
+ unsigned int index_count_lg2)
+{
+ pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
+ index_count_lg2);
+}
+
+/**
+ * pt_entry_fully_covered() - Check if the item or entry is entirely contained
+ * within pts->range
+ * @pts: Iteration State
+ * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or
+ * pt_entry_oa_lg2sz()
+ *
+ * Returns: true if the item is fully enclosed by the pts->range.
+ */
+static inline bool pt_entry_fully_covered(const struct pt_state *pts,
+ unsigned int oasz_lg2)
+{
+ struct pt_range *range = pts->range;
+
+ /* Range begins at the start of the entry */
+ if (log2_mod(pts->range->va, oasz_lg2))
+ return false;
+
+ /* Range ends past the end of the entry */
+ if (!log2_div_eq(range->va, range->last_va, oasz_lg2))
+ return true;
+
+ /* Range ends at the end of the entry */
+ return log2_mod_eq_max(range->last_va, oasz_lg2);
+}
+
+/**
+ * pt_range_to_index() - Starting index for an iteration
+ * @pts: Iteration State
+ *
+ * Return: the starting index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_index(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ PT_WARN_ON(pts->level > pts->range->top_level);
+ if (pts->range->top_level == pts->level)
+ return log2_div(fvalog2_mod(pts->range->va,
+ pts->range->max_vasz_lg2),
+ isz_lg2);
+ return log2_mod(log2_div(pts->range->va, isz_lg2),
+ pt_num_items_lg2(pts));
+}
+
+/**
+ * pt_range_to_end_index() - Ending index for an iteration
+ * @pts: Iteration State
+ *
+ * Return: one past the last index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_range *range = pts->range;
+ unsigned int num_entries_lg2;
+
+ if (range->va == range->last_va)
+ return pts->index + 1;
+
+ if (pts->range->top_level == pts->level)
+ return log2_div(fvalog2_mod(pts->range->last_va,
+ pts->range->max_vasz_lg2),
+ isz_lg2) +
+ 1;
+
+ num_entries_lg2 = pt_num_items_lg2(pts);
+
+ /* last_va falls within this table */
+ if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2))
+ return log2_mod(log2_div(pts->range->last_va, isz_lg2),
+ num_entries_lg2) +
+ 1;
+
+ return log2_to_int(num_entries_lg2);
+}
+
+static inline void _pt_iter_first(struct pt_state *pts)
+{
+ pts->index = pt_range_to_index(pts);
+ pts->end_index = pt_range_to_end_index(pts);
+ PT_WARN_ON(pts->index > pts->end_index);
+}
+
+static inline bool _pt_iter_load(struct pt_state *pts)
+{
+ if (pts->index >= pts->end_index)
+ return false;
+ pt_load_entry(pts);
+ return true;
+}
+
+/**
+ * pt_next_entry() - Advance pts to the next entry
+ * @pts: Iteration State
+ *
+ * Update pts to go to the next index at this level. If pts is pointing at a
+ * contiguous entry then the index may advance by more than one.
+ */
+static inline void pt_next_entry(struct pt_state *pts)
+{
+ if (pts->type == PT_ENTRY_OA &&
+ !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0))
+ _pt_advance(pts, pt_entry_num_contig_lg2(pts));
+ else
+ pts->index++;
+ pt_index_to_va(pts);
+}
+
+/**
+ * for_each_pt_level_entry() - For loop wrapper over entries in the range
+ * @pts: Iteration State
+ *
+ * This is the basic iteration primitive. It iterates over all the entries in
+ * pts->range that fall within the pts's current table level. Each step does
+ * pt_load_entry(pts).
+ */
+#define for_each_pt_level_entry(pts) \
+ for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts))
+
+/**
+ * pt_load_single_entry() - Version of pt_load_entry() usable within a walker
+ * @pts: Iteration State
+ *
+ * Alternative to for_each_pt_level_entry() if the walker function uses only a
+ * single entry.
+ */
+static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts)
+{
+ pts->index = pt_range_to_index(pts);
+ pt_load_entry(pts);
+ return pts->type;
+}
+
+static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
+ uintptr_t top_of_table)
+{
+ struct pt_range range = {
+ .common = common,
+ .top_table =
+ (struct pt_table_p *)(top_of_table &
+ ~(uintptr_t)PT_TOP_LEVEL_MASK),
+ .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),
+ };
+ struct pt_state pts = { .range = &range, .level = range.top_level };
+ unsigned int max_vasz_lg2;
+
+ max_vasz_lg2 = common->max_vasz_lg2;
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ pts.level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2,
+ pt_num_items_lg2(&pts) +
+ pt_table_item_lg2sz(&pts));
+
+ /*
+	 * With PT_FEAT_SIGN_EXTEND the top range defaults to the lower half only.
+ */
+ range.max_vasz_lg2 = max_vasz_lg2;
+ if (pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ max_vasz_lg2--;
+
+ range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2);
+ range.last_va =
+ fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2);
+ return range;
+}
+
+/**
+ * pt_top_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static __always_inline struct pt_range pt_top_range(struct pt_common *common)
+{
+ /*
+ * The top pointer can change without locking. We capture the value and
+	 * its level here and are safe to walk it so long as both values are
+ * captured without tearing.
+ */
+ return _pt_top_range(common, READ_ONCE(common->top_of_table));
+}
+
+/**
+ * pt_all_range() - Return a range that spans the entire page table
+ * @common: Table
+ *
+ * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND
+ * is supported range->va and range->last_va will be incorrect during the
+ * iteration and must not be accessed.
+ */
+static inline struct pt_range pt_all_range(struct pt_common *common)
+{
+ struct pt_range range = pt_top_range(common);
+
+ if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ return range;
+
+ /*
+ * Pretend the table is linear from 0 without a sign extension. This
+ * generates the correct indexes for iteration.
+ */
+ range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2);
+ return range;
+}
+
+/**
+ * pt_upper_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static inline struct pt_range pt_upper_range(struct pt_common *common)
+{
+ struct pt_range range = pt_top_range(common);
+
+ if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ return range;
+
+ range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1);
+ range.last_va = PT_VADDR_MAX;
+ return range;
+}
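+
+/*
+ * Example: a 48-bit table using PT_FEAT_SIGN_EXTEND (such as an x86 style
+ * first-stage format) exposes two apertures. pt_top_range() covers
+ * 0x0 - 0x00007fffffffffff and pt_upper_range() covers
+ * 0xffff800000000000 - 0xffffffffffffffff, while pt_all_range() pretends the
+ * table is linear so that every top level index is visited exactly once.
+ */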
+
+/**
+ * pt_make_range() - Return a range that spans part of the table
+ * @common: Table
+ * @va: Start address
+ * @last_va: Last address
+ *
+ * The caller must validate the range with pt_check_range() before using it.
+ */
+static __always_inline struct pt_range
+pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va)
+{
+ struct pt_range range =
+ _pt_top_range(common, READ_ONCE(common->top_of_table));
+
+ range.va = va;
+ range.last_va = last_va;
+
+ return range;
+}
+
+/*
+ * Span a slice of the table starting at a lower table level from an active
+ * walk.
+ */
+static __always_inline struct pt_range
+pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va,
+ pt_vaddr_t last_va)
+{
+ struct pt_range range = *parent;
+
+ range.va = va;
+ range.last_va = last_va;
+
+ PT_WARN_ON(last_va < va);
+ PT_WARN_ON(pt_check_range(&range));
+
+ return range;
+}
+
+/**
+ * pt_init() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ * @level: Table level for the state
+ * @table: Pointer to the table memory at level
+ *
+ * Helper to initialize the on-stack pt_state from walker arguments.
+ */
+static __always_inline struct pt_state
+pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = {
+ .range = range,
+ .table = table,
+ .level = level,
+ };
+ return pts;
+}
+
+/**
+ * pt_init_top() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ *
+ * The pt_state points to the top most level.
+ */
+static __always_inline struct pt_state pt_init_top(struct pt_range *range)
+{
+ return pt_init(range, range->top_level, range->top_table);
+}
+
+typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table);
+
+/**
+ * pt_descend() - Recursively invoke the walker for the lower level
+ * @pts: Iteration State
+ * @arg: Value to pass to the function
+ * @fn: Walker function to call
+ *
+ * pts must point to a table item. Invoke fn as a walker on the table
+ * pts points to.
+ */
+static __always_inline int pt_descend(struct pt_state *pts, void *arg,
+ pt_level_fn_t fn)
+{
+ int ret;
+
+ if (PT_WARN_ON(!pts->table_lower))
+ return -EINVAL;
+
+ ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower);
+ return ret;
+}
+
+/**
+ * pt_walk_range() - Walk over a VA range
+ * @range: Range pointer
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * Walk over a VA range. The caller should have done a validity check, at
+ * least calling pt_check_range(), when building range. The walk will
+ * start at the top most table.
+ */
+static __always_inline int pt_walk_range(struct pt_range *range,
+ pt_level_fn_t fn, void *arg)
+{
+ return fn(range, arg, range->top_level, range->top_table);
+}
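+
+/*
+ * Minimal usage sketch (the helper and its name are hypothetical, not part of
+ * this API): scan whichever level the walker is invoked on and report if any
+ * OA entry is present:
+ *
+ *	static int __any_oa(struct pt_range *range, void *arg,
+ *			    unsigned int level, struct pt_table_p *table)
+ *	{
+ *		struct pt_state pts = pt_init(range, level, table);
+ *
+ *		for_each_pt_level_entry(&pts)
+ *			if (pts.type == PT_ENTRY_OA)
+ *				return 1;
+ *		return 0;
+ *	}
+ *
+ *	struct pt_range range = pt_top_range(common);
+ *	int any = pt_walk_range(&range, __any_oa, NULL);
+ */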
+
+/*
+ * pt_walk_descend() - Recursively invoke the walker for a slice of a lower
+ * level
+ * @pts: Iteration State
+ * @va: Start address
+ * @last_va: Last address
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and walk over a slice
+ * of the lower table. The caller must ensure that va/last_va are within the
+ * table item. This creates a new walk and does not alter pts or pts->range.
+ */
+static __always_inline int pt_walk_descend(const struct pt_state *pts,
+ pt_vaddr_t va, pt_vaddr_t last_va,
+ pt_level_fn_t fn, void *arg)
+{
+ struct pt_range range = pt_make_child_range(pts->range, va, last_va);
+
+ if (PT_WARN_ON(!pt_can_have_table(pts)) ||
+ PT_WARN_ON(!pts->table_lower))
+ return -EINVAL;
+
+ return fn(&range, arg, pts->level - 1, pts->table_lower);
+}
+
+/*
+ * pt_walk_descend_all() - Recursively invoke the walker for a table item
+ * @parent_pts: Iteration State
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and walk over the
+ * entire lower table. This creates a new walk and does not alter pts or
+ * pts->range.
+ */
+static __always_inline int
+pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
+ void *arg)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts);
+
+ return pt_walk_descend(parent_pts,
+ log2_set_mod(parent_pts->range->va, 0, isz_lg2),
+ log2_set_mod_max(parent_pts->range->va, isz_lg2),
+ fn, arg);
+}
+
+/**
+ * pt_range_slice() - Return a range that spans indexes
+ * @pts: Iteration State
+ * @start_index: Starting index within pts
+ * @end_index: Ending index within pts
+ *
+ * Create a range that spans an index range of the current table level
+ * pt_state points at.
+ */
+static inline struct pt_range pt_range_slice(const struct pt_state *pts,
+ unsigned int start_index,
+ unsigned int end_index)
+{
+ unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+ pt_vaddr_t last_va;
+ pt_vaddr_t va;
+
+ va = fvalog2_set_mod(pts->range->va,
+ log2_mul(start_index, pt_table_item_lg2sz(pts)),
+ table_lg2sz);
+ last_va = fvalog2_set_mod(
+ pts->range->va,
+ log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz);
+ return pt_make_child_range(pts->range, va, last_va);
+}
+
+/**
+ * pt_top_memsize_lg2() - Log2 allocation size of the top table
+ * @common: Table
+ * @top_of_table: Top of table value from _pt_top_set()
+ *
+ * Compute the log2 allocation size of the top table. For PT_FEAT_DYNAMIC_TOP
+ * this will compute the top size assuming the table will grow.
+ */
+static inline unsigned int pt_top_memsize_lg2(struct pt_common *common,
+ uintptr_t top_of_table)
+{
+ struct pt_range range = _pt_top_range(common, top_of_table);
+ struct pt_state pts = pt_init_top(&range);
+ unsigned int num_items_lg2;
+
+ num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts);
+ if (range.top_level != PT_MAX_TOP_LEVEL &&
+ pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts));
+
+ /* Round up the allocation size to the minimum alignment */
+ return max(ffs_t(u64, PT_TOP_PHYS_MASK),
+ num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE));
+}
+
+/**
+ * pt_compute_best_pgsize() - Determine the best page size for leaf entries
+ * @pgsz_bitmap: Permitted page sizes
+ * @va: Starting virtual address for the leaf entry
+ * @last_va: Last virtual address for the leaf entry, sets the max page size
+ * @oa: Starting output address for the leaf entry
+ *
+ * Compute the largest page size for va, last_va, and oa together and return it
+ * in lg2. The largest page size depends on the format's supported page sizes at
+ * this level, and the relative alignment of the VA and OA addresses. 0 means
+ * the OA cannot be stored with the provided pgsz_bitmap.
+ */
+static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
+ pt_vaddr_t va,
+ pt_vaddr_t last_va,
+ pt_oaddr_t oa)
+{
+ unsigned int best_pgsz_lg2;
+ unsigned int pgsz_lg2;
+ pt_vaddr_t len = last_va - va + 1;
+ pt_vaddr_t mask;
+
+ if (PT_WARN_ON(va >= last_va))
+ return 0;
+
+ /*
+ * Given a VA/OA pair the best page size is the largest page size
+ * where:
+ *
+ * 1) VA and OA start at the page. Bitwise this is the count of least
+ * significant 0 bits.
+ * This also implies that last_va/oa has the same prefix as va/oa.
+ */
+ mask = va | oa;
+
+ /*
+ * 2) The page size is not larger than the last_va (length). Since page
+ * sizes are always power of two this can't be larger than the
+ * largest power of two factor of the length.
+ */
+ mask |= log2_to_int(vafls(len) - 1);
+
+ best_pgsz_lg2 = vaffs(mask);
+
+ /* Choose the highest bit <= best_pgsz_lg2 */
+ if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1)
+ pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1);
+
+ pgsz_lg2 = vafls(pgsz_bitmap);
+ if (!pgsz_lg2)
+ return 0;
+
+ pgsz_lg2--;
+
+ PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0);
+ PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0);
+ PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va);
+ PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+ PT_WARN_ON(
+ !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+ return pgsz_lg2;
+}
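+
+/*
+ * Worked example: va == 0x2200000, last_va == 0x25fffff (4MiB long),
+ * oa == 0x4600000 and pgsz_bitmap == (SZ_4K | SZ_2M). Both addresses are
+ * 2MiB aligned and the length permits it, so the result is 21 (2MiB). If oa
+ * were 0x4601000 the common alignment drops to 4KiB and the result is 12.
+ */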
+
+#define _PT_MAKE_CALL_LEVEL(fn) \
+ static __always_inline int fn(struct pt_range *range, void *arg, \
+ unsigned int level, \
+ struct pt_table_p *table) \
+ { \
+ static_assert(PT_MAX_TOP_LEVEL <= 5); \
+ if (level == 0) \
+ return CONCATENATE(fn, 0)(range, arg, 0, table); \
+ if (level == 1 || PT_MAX_TOP_LEVEL == 1) \
+ return CONCATENATE(fn, 1)(range, arg, 1, table); \
+ if (level == 2 || PT_MAX_TOP_LEVEL == 2) \
+ return CONCATENATE(fn, 2)(range, arg, 2, table); \
+ if (level == 3 || PT_MAX_TOP_LEVEL == 3) \
+ return CONCATENATE(fn, 3)(range, arg, 3, table); \
+ if (level == 4 || PT_MAX_TOP_LEVEL == 4) \
+ return CONCATENATE(fn, 4)(range, arg, 4, table); \
+ return CONCATENATE(fn, 5)(range, arg, 5, table); \
+ }
+
+static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg,
+ unsigned int unused_level,
+ struct pt_table_p *table)
+{
+ static_assert(PT_MAX_TOP_LEVEL <= 5);
+ return -EPROTOTYPE;
+}
+
+#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \
+ static inline int fn(struct pt_range *range, void *arg, \
+ unsigned int unused_level, \
+ struct pt_table_p *table) \
+ { \
+ return do_fn(range, arg, level, table, descend_fn); \
+ }
+
+/**
+ * PT_MAKE_LEVELS() - Build an unwound walker
+ * @fn: Name of the walker function
+ * @do_fn: Function to call at each level
+ *
+ * This builds a function call tree that can be fully inlined.
+ * The caller must provide a function body in an __always_inline function::
+ *
+ * static __always_inline int do_fn(struct pt_range *range, void *arg,
+ * unsigned int level, struct pt_table_p *table,
+ * pt_level_fn_t descend_fn)
+ *
+ * An inline function will be created for each table level that calls do_fn with
+ * a compile time constant for level and a pointer to the next lower function.
+ * This generates an optimally inlined walk where each of the functions sees a
+ * constant level and can codegen the exact constants/etc for that level.
+ *
+ * Note this can produce a lot of code!
+ */
+#define PT_MAKE_LEVELS(fn, do_fn) \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \
+ do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \
+ _PT_MAKE_CALL_LEVEL(fn)
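+
+/*
+ * Hypothetical usage sketch (names invented for illustration): the walker
+ * provides one __always_inline body and PT_MAKE_LEVELS() stamps out a copy
+ * per level with a constant level argument:
+ *
+ *	static __always_inline int __do_walk(struct pt_range *range, void *arg,
+ *					     unsigned int level,
+ *					     struct pt_table_p *table,
+ *					     pt_level_fn_t descend_fn)
+ *	{
+ *		struct pt_state pts = pt_init(range, level, table);
+ *
+ *		for_each_pt_level_entry(&pts) {
+ *			if (pt_can_have_table(&pts) && pts.table_lower) {
+ *				int ret = pt_descend(&pts, arg, descend_fn);
+ *
+ *				if (ret)
+ *					return ret;
+ *			}
+ *		}
+ *		return 0;
+ *	}
+ *	PT_MAKE_LEVELS(walk_fn, __do_walk);
+ *
+ * pt_walk_range(&range, walk_fn, arg) then dispatches to the right top level
+ * function and each pt_descend() call recurses into the next lower, fully
+ * inlined, constant-level copy.
+ */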
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h
new file mode 100644
index 000000000000..6dbbed119238
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_log2.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Helper macros for working with log2 values
+ *
+ */
+#ifndef __GENERIC_PT_LOG2_H
+#define __GENERIC_PT_LOG2_H
+#include <linux/bitops.h>
+#include <linux/limits.h>
+
+/* Compute a */
+#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2)))
+static_assert(log2_to_int_t(unsigned int, 0) == 1);
+
+/* Compute a - 1 (aka all low bits set) */
+#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1))
+
+/* Compute a / b */
+#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2)))
+static_assert(log2_div_t(unsigned int, 4, 2) == 1);
+
+/*
+ * Compute:
+ * a / c == b / c
+ * aka the high bits are equal
+ */
+#define log2_div_eq_t(type, a, b, c_lg2) \
+ (log2_div_t(type, (a) ^ (b), c_lg2) == 0)
+static_assert(log2_div_eq_t(unsigned int, 1, 1, 2));
+
+/* Compute a % b */
+#define log2_mod_t(type, a, b_lg2) \
+ ((type)(((type)a) & log2_to_max_int_t(type, b_lg2)))
+static_assert(log2_mod_t(unsigned int, 1, 2) == 1);
+
+/*
+ * Compute:
+ * a % b == b - 1
+ * aka the low bits are all 1s
+ */
+#define log2_mod_eq_max_t(type, a, b_lg2) \
+ (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2))
+static_assert(log2_mod_eq_max_t(unsigned int, 3, 2));
+
+/*
+ * Return a value such that:
+ * a / b == ret / b
+ * ret % b == val
+ * aka set the low bits to val. val must be < b
+ */
+#define log2_set_mod_t(type, a, val, b_lg2) \
+ ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val)))
+static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1);
+
+/*
+ * Return a value such that:
+ *  a / b == ret / b
+ *  ret % b == b - 1
+ * aka set the low bits to all 1s
+ */
+#define log2_set_mod_max_t(type, a, b_lg2) \
+ (((type)(a)) | log2_to_max_int_t(type, b_lg2))
+static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3);
+
+/* Compute a * b */
+#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2)))
+static_assert(log2_mul_t(unsigned int, 2, 2) == 8);
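+
+/*
+ * Worked example: with b_lg2 == 21 (a 2MiB block) and a == 0x2345678:
+ *  log2_div_t(u64, a, 21) == 0x11 (which 2MiB block)
+ *  log2_mod_t(u64, a, 21) == 0x145678 (offset within the block)
+ *  log2_set_mod_t(u64, a, 0, 21) == 0x2200000 (start of the block)
+ *  log2_set_mod_max_t(u64, a, 21) == 0x23fffff (last byte of the block)
+ */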
+
+#define _dispatch_sz(type, fn, a) \
+ (sizeof(type) == 4 ? fn##32((u32)a) : fn##64(a))
+
+/*
+ * Return the highest value such that:
+ * fls_t(u32, 0) == 0
+ * fls_t(u32, 1) == 1
+ * a >= log2_to_int(ret - 1)
+ * aka find last set bit
+ */
+static inline unsigned int fls32(u32 a)
+{
+ return fls(a);
+}
+#define fls_t(type, a) _dispatch_sz(type, fls, a)
+
+/*
+ * Return the highest value such that:
+ * ffs_t(u32, 0) == UNDEFINED
+ * ffs_t(u32, 1) == 0
+ * log2_mod(a, ret) == 0
+ * aka find first set bit
+ */
+static inline unsigned int __ffs32(u32 a)
+{
+ return __ffs(a);
+}
+#define ffs_t(type, a) _dispatch_sz(type, __ffs, a)
+
+/*
+ * Return the highest value such that:
+ * ffz_t(u32, U32_MAX) == UNDEFINED
+ * ffz_t(u32, 0) == 0
+ * ffz_t(u32, 1) == 1
+ * log2_mod(a, ret) == log2_to_max_int(ret)
+ * aka find first zero bit
+ */
+static inline unsigned int ffz32(u32 a)
+{
+ return ffz(a);
+}
+static inline unsigned int ffz64(u64 a)
+{
+ if (sizeof(u64) == sizeof(unsigned long))
+ return ffz(a);
+
+ if ((u32)a == U32_MAX)
+ return ffz32(a >> 32) + 32;
+ return ffz32(a);
+}
+#define ffz_t(type, a) _dispatch_sz(type, ffz, a)
+
+#endif
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index f2f538c70650..5471f814e073 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -13,6 +13,10 @@ config INTEL_IOMMU
bool "Support for Intel IOMMU using DMA Remapping Devices"
depends on PCI_MSI && ACPI && X86
select IOMMU_API
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_X86_64
+ select IOMMU_PT_VTDSS
select IOMMU_IOVA
select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
@@ -66,7 +70,7 @@ config INTEL_IOMMU_DEFAULT_ON
config INTEL_IOMMU_FLOPPY_WA
def_bool y
- depends on X86
+ depends on X86 && BLK_DEV_FD
help
Floppy disk drivers are known to bypass DMA API calls
thereby failing to work when IOMMU is enabled. This
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e236c7ec221f..4e888867e85c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -45,16 +45,9 @@
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
-#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
-#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
-
-/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
- to match. That way, we can use 'unsigned long' for PFNs with impunity. */
-#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
- __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
-#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-
static void __init check_tylersburg_isoch(void);
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
static int rwbf_quirk;
#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap))
@@ -217,7 +210,6 @@ static int disable_igfx_iommu;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
-static const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
@@ -285,13 +277,6 @@ static int __init intel_iommu_setup(char *str)
}
__setup("intel_iommu=", intel_iommu_setup);
-static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
-{
- int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-
- return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
-}
-
/*
* Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
@@ -353,23 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}
-/* Return the super pagesize bitmap if supported. */
-static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
-{
- unsigned long bitmap = 0;
-
- /*
- * 1-level super page supports page size of 2MiB, 2-level super page
- * supports page size of both 2MiB and 1GiB.
- */
- if (domain->iommu_superpage == 1)
- bitmap |= SZ_2M;
- else if (domain->iommu_superpage == 2)
- bitmap |= SZ_2M | SZ_1G;
-
- return bitmap;
-}
-
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
u8 devfn, int alloc)
{
@@ -556,13 +524,6 @@ out:
return iommu;
}
-static void domain_flush_cache(struct dmar_domain *domain,
- void *addr, int size)
-{
- if (!domain->iommu_coherency)
- clflush_cache_range(addr, size);
-}
-
static void free_context_table(struct intel_iommu *iommu)
{
struct context_entry *context;
@@ -707,280 +668,6 @@ pgtable_walk:
}
#endif
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
- unsigned long pfn, int *target_level,
- gfp_t gfp)
-{
- struct dma_pte *parent, *pte;
- int level = agaw_to_level(domain->agaw);
- int offset;
-
- if (!domain_pfn_supported(domain, pfn))
- /* Address beyond IOMMU's addressing capabilities. */
- return NULL;
-
- parent = domain->pgd;
-
- while (1) {
- void *tmp_page;
-
- offset = pfn_level_offset(pfn, level);
- pte = &parent[offset];
- if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
- break;
- if (level == *target_level)
- break;
-
- if (!dma_pte_present(pte)) {
- uint64_t pteval, tmp;
-
- tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
- SZ_4K);
-
- if (!tmp_page)
- return NULL;
-
- domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
- pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
- DMA_PTE_WRITE;
- if (domain->use_first_level)
- pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
-
- tmp = 0ULL;
- if (!try_cmpxchg64(&pte->val, &tmp, pteval))
- /* Someone else set it while we were thinking; use theirs. */
- iommu_free_pages(tmp_page);
- else
- domain_flush_cache(domain, pte, sizeof(*pte));
- }
- if (level == 1)
- break;
-
- parent = phys_to_virt(dma_pte_addr(pte));
- level--;
- }
-
- if (!*target_level)
- *target_level = level;
-
- return pte;
-}
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
- unsigned long pfn,
- int level, int *large_page)
-{
- struct dma_pte *parent, *pte;
- int total = agaw_to_level(domain->agaw);
- int offset;
-
- parent = domain->pgd;
- while (level <= total) {
- offset = pfn_level_offset(pfn, total);
- pte = &parent[offset];
- if (level == total)
- return pte;
-
- if (!dma_pte_present(pte)) {
- *large_page = total;
- break;
- }
-
- if (dma_pte_superpage(pte)) {
- *large_page = total;
- return pte;
- }
-
- parent = phys_to_virt(dma_pte_addr(pte));
- total--;
- }
- return NULL;
-}
-
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned int large_page;
- struct dma_pte *first_pte, *pte;
-
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- do {
- large_page = 1;
- first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
- if (!pte) {
- start_pfn = align_to_level(start_pfn + 1, large_page + 1);
- continue;
- }
- do {
- dma_clear_pte(pte);
- start_pfn += lvl_to_nr_pages(large_page);
- pte++;
- } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
-
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
-
- } while (start_pfn && start_pfn <= last_pfn);
-}
-
-static void dma_pte_free_level(struct dmar_domain *domain, int level,
- int retain_level, struct dma_pte *pte,
- unsigned long pfn, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
- struct dma_pte *level_pte;
-
- if (!dma_pte_present(pte) || dma_pte_superpage(pte))
- goto next;
-
- level_pfn = pfn & level_mask(level);
- level_pte = phys_to_virt(dma_pte_addr(pte));
-
- if (level > 2) {
- dma_pte_free_level(domain, level - 1, retain_level,
- level_pte, level_pfn, start_pfn,
- last_pfn);
- }
-
- /*
- * Free the page table if we're below the level we want to
- * retain and the range covers the entire table.
- */
- if (level < retain_level && !(start_pfn > level_pfn ||
- last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
- domain_flush_cache(domain, pte, sizeof(*pte));
- iommu_free_pages(level_pte);
- }
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-}
-
-/*
- * clear last level (leaf) ptes and free page table pages below the
- * level we wish to keep intact.
- */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn,
- int retain_level)
-{
- dma_pte_clear_range(domain, start_pfn, last_pfn);
-
- /* We don't need lock here; nobody else touches the iova range */
- dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
- domain->pgd, 0, start_pfn, last_pfn);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_free_pages(domain->pgd);
- domain->pgd = NULL;
- }
-}
-
-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
-static void dma_pte_list_pagetables(struct dmar_domain *domain,
- int level, struct dma_pte *parent_pte,
- struct iommu_pages_list *freelist)
-{
- struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
-
- iommu_pages_list_add(freelist, pte);
-
- if (level == 1)
- return;
-
- do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
- pte++;
- } while (!first_pte_in_page(pte));
-}
-
-static void dma_pte_clear_level(struct dmar_domain *domain, int level,
- struct dma_pte *pte, unsigned long pfn,
- unsigned long start_pfn, unsigned long last_pfn,
- struct iommu_pages_list *freelist)
-{
- struct dma_pte *first_pte = NULL, *last_pte = NULL;
-
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn = pfn & level_mask(level);
-
- if (!dma_pte_present(pte))
- goto next;
-
- /* If range covers entire pagetable, free it */
- if (start_pfn <= level_pfn &&
- last_pfn >= level_pfn + level_size(level) - 1) {
- /* These suborbinate page tables are going away entirely. Don't
- bother to clear them; we're just going to *free* them. */
- if (level > 1 && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-
- dma_clear_pte(pte);
- if (!first_pte)
- first_pte = pte;
- last_pte = pte;
- } else if (level > 1) {
- /* Recurse down into a level that isn't *entirely* obsolete */
- dma_pte_clear_level(domain, level - 1,
- phys_to_virt(dma_pte_addr(pte)),
- level_pfn, start_pfn, last_pfn,
- freelist);
- }
-next:
- pfn = level_pfn + level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-
- if (first_pte)
- domain_flush_cache(domain, first_pte,
- (void *)++last_pte - (void *)first_pte);
-}
-
-/* We can't just free the pages because the IOMMU may still be walking
- the page tables, and may have cached the intermediate levels. The
- pages can only be freed after the IOTLB flush has been done. */
-static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
- unsigned long last_pfn,
- struct iommu_pages_list *freelist)
-{
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
- domain->pgd, 0, start_pfn, last_pfn, freelist);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_pages_list_add(freelist, domain->pgd);
- domain->pgd = NULL;
- }
-}
-
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
@@ -1460,13 +1147,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
domain_lookup_dev_info(domain, iommu, bus, devfn);
u16 did = domain_id_iommu(domain, iommu);
int translation = CONTEXT_TT_MULTI_LEVEL;
- struct dma_pte *pgd = domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
struct context_entry *context;
int ret;
if (WARN_ON(!intel_domain_is_ss_paging(domain)))
return -EINVAL;
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+
pr_debug("Set context mapping for %02x:%02x.%d\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1489,8 +1178,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
else
translation = CONTEXT_TT_MULTI_LEVEL;
- context_set_address_root(context, virt_to_phys(pgd));
- context_set_address_width(context, domain->agaw);
+ context_set_address_root(context, pt_info.ssptptr);
+ context_set_address_width(context, pt_info.aw);
context_set_translation_type(context, translation);
context_set_fault_enable(context);
context_set_present(context);
@@ -1537,177 +1226,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev)
return 0;
}
-/* Return largest possible superpage level for a given mapping */
-static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phy_pfn, unsigned long pages)
-{
- int support, level = 1;
- unsigned long pfnmerge;
-
- support = domain->iommu_superpage;
-
- /* To use a large page, the virtual *and* physical addresses
- must be aligned to 2MiB/1GiB/etc. Lower bits set in either
- of them will mean we have to use smaller pages. So just
- merge them and check both at once. */
- pfnmerge = iov_pfn | phy_pfn;
-
- while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
- pages >>= VTD_STRIDE_SHIFT;
- if (!pages)
- break;
- pfnmerge >>= VTD_STRIDE_SHIFT;
- level++;
- support--;
- }
- return level;
-}
-
-/*
- * Ensure that old small page tables are removed to make room for superpage(s).
- * We're going to add new large pages, so make sure we don't remove their parent
- * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
- */
-static void switch_to_super_page(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long end_pfn, int level)
-{
- unsigned long lvl_pages = lvl_to_nr_pages(level);
- struct dma_pte *pte = NULL;
-
- if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
- !IS_ALIGNED(end_pfn + 1, lvl_pages)))
- return;
-
- while (start_pfn <= end_pfn) {
- if (!pte)
- pte = pfn_to_dma_pte(domain, start_pfn, &level,
- GFP_ATOMIC);
-
- if (dma_pte_present(pte)) {
- dma_pte_free_pagetable(domain, start_pfn,
- start_pfn + lvl_pages - 1,
- level + 1);
-
- cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
- end_pfn << VTD_PAGE_SHIFT, 0);
- }
-
- pte++;
- start_pfn += lvl_pages;
- if (first_pte_in_page(pte))
- pte = NULL;
- }
-}
-
-static int
-__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phys_pfn, unsigned long nr_pages, int prot,
- gfp_t gfp)
-{
- struct dma_pte *first_pte = NULL, *pte = NULL;
- unsigned int largepage_lvl = 0;
- unsigned long lvl_pages = 0;
- phys_addr_t pteval;
- u64 attr;
-
- if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
- return -EINVAL;
-
- if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
- return -EINVAL;
-
- if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
- pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
- return -EINVAL;
- }
-
- attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
- if (domain->use_first_level) {
- attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
- if (prot & DMA_PTE_WRITE)
- attr |= DMA_FL_PTE_DIRTY;
- }
-
- domain->has_mappings = true;
-
- pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
-
- while (nr_pages > 0) {
- uint64_t tmp;
-
- if (!pte) {
- largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
- phys_pfn, nr_pages);
-
- pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
- gfp);
- if (!pte)
- return -ENOMEM;
- first_pte = pte;
-
- lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
- /* It is large page*/
- if (largepage_lvl > 1) {
- unsigned long end_pfn;
- unsigned long pages_to_remove;
-
- pteval |= DMA_PTE_LARGE_PAGE;
- pages_to_remove = min_t(unsigned long,
- round_down(nr_pages, lvl_pages),
- nr_pte_to_next_page(pte) * lvl_pages);
- end_pfn = iov_pfn + pages_to_remove - 1;
- switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
- } else {
- pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
- }
-
- }
- /* We don't need lock here, nobody else
- * touches the iova range
- */
- tmp = 0ULL;
- if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
- static int dumps = 5;
- pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
- iov_pfn, tmp, (unsigned long long)pteval);
- if (dumps) {
- dumps--;
- debug_dma_dump_mappings(NULL);
- }
- WARN_ON(1);
- }
-
- nr_pages -= lvl_pages;
- iov_pfn += lvl_pages;
- phys_pfn += lvl_pages;
- pteval += lvl_pages * VTD_PAGE_SIZE;
-
- /* If the next PTE would be the first in a new page, then we
- * need to flush the cache on the entries we've just written.
- * And then we'll need to recalculate 'pte', so clear it and
- * let it get set again in the if (!pte) block above.
- *
- * If we're done (!nr_pages) we need to flush the cache too.
- *
- * Also if we've been setting superpages, we may need to
- * recalculate 'pte' and switch back to smaller pages for the
- * end of the mapping, if the trailing size is not enough to
- * use another superpage (i.e. nr_pages < lvl_pages).
- */
- pte++;
- if (!nr_pages || first_pte_in_page(pte) ||
- (largepage_lvl > 1 && nr_pages < lvl_pages)) {
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
- pte = NULL;
- }
- }
-
- return 0;
-}
-
static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
{
struct intel_iommu *iommu = info->iommu;
@@ -1769,22 +1287,26 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
struct device *dev,
u32 pasid, struct iommu_domain *old)
{
- struct dma_pte *pgd = domain->pgd;
- int level, flags = 0;
+ struct pt_iommu_x86_64_hw_info pt_info;
+ unsigned int flags = 0;
- level = agaw_to_level(domain->agaw);
- if (level != 4 && level != 5)
+ pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info);
+ if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5))
return -EINVAL;
- if (level == 5)
+ if (pt_info.levels == 5)
flags |= PASID_FLAG_FL5LP;
if (domain->force_snooping)
flags |= PASID_FLAG_PAGE_SNOOP;
+ if (!(domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ flags |= PASID_FLAG_PWSNP;
+
return __domain_setup_first_level(iommu, dev, pasid,
domain_id_iommu(domain, iommu),
- __pa(pgd), flags, old);
+ pt_info.gcr3_pt, flags, old);
}
static int dmar_domain_attach_device(struct dmar_domain *domain,
@@ -3230,7 +2752,8 @@ void device_block_translation(struct device *dev)
}
static int blocking_domain_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
@@ -3251,23 +2774,9 @@ static struct iommu_domain blocking_domain = {
}
};
-static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
-{
- if (!intel_iommu_superpage)
- return 0;
-
- if (first_stage)
- return cap_fl1gp_support(iommu->cap) ? 2 : 1;
-
- return fls(cap_super_page_val(iommu->cap));
-}
-
-static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
+static struct dmar_domain *paging_domain_alloc(void)
{
- struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct intel_iommu *iommu = info->iommu;
struct dmar_domain *domain;
- int addr_width;
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
@@ -3282,56 +2791,38 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st
INIT_LIST_HEAD(&domain->s1_domains);
spin_lock_init(&domain->s1_lock);
- domain->nid = dev_to_node(dev);
- domain->use_first_level = first_stage;
-
- domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
-
- /* calculate the address width */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
- domain->gaw = addr_width;
- domain->agaw = iommu->agaw;
- domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
-
- /* iommu memory access coherency */
- domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
+ return domain;
+}
- /* pagesize bitmap */
- domain->domain.pgsize_bitmap = SZ_4K;
- domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
- domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
+static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu,
+ unsigned int *top_level)
+{
+ unsigned int mgaw = cap_mgaw(iommu->cap);
/*
- * IOVA aperture: First-level translation restricts the input-address
- * to a canonical address (i.e., address bits 63:N have the same value
- * as address bit [N-1], where N is 48-bits with 4-level paging and
- * 57-bits with 5-level paging). Hence, skip bit [N-1].
+ * Spec 3.6 First-Stage Translation:
+ *
+ * Software must limit addresses to less than the minimum of MGAW
+ * and the lower canonical address width implied by FSPM (i.e.,
+ * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
*/
- domain->domain.geometry.force_aperture = true;
- domain->domain.geometry.aperture_start = 0;
- if (first_stage)
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
- else
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
-
- /* always allocate the top pgd */
- domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
- if (!domain->pgd) {
- kfree(domain);
- return ERR_PTR(-ENOMEM);
+ if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) {
+ *top_level = 4;
+ return min(57, mgaw);
}
- domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
- return domain;
+ /* Four level is always supported */
+ *top_level = 3;
+ return min(48, mgaw);
}
static struct iommu_domain *
intel_iommu_domain_alloc_first_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_x86_64_cfg cfg = {};
struct dmar_domain *dmar_domain;
+ int ret;
if (flags & ~IOMMU_HWPT_ALLOC_PASID)
return ERR_PTR(-EOPNOTSUPP);
@@ -3340,10 +2831,20 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
- dmar_domain = paging_domain_alloc(dev, true);
+ dmar_domain = paging_domain_alloc();
if (IS_ERR(dmar_domain))
return ERR_CAST(dmar_domain);
+ cfg.common.hw_max_vasz_lg2 =
+ compute_vasz_lg2_fs(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
+ BIT(PT_FEAT_FLUSH_RANGE);
+ /* First stage always uses scalable mode */
+ if (!ecap_smpwc(iommu->ecap))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
/*
* iotlb sync for map is only needed for legacy implementations that
@@ -3353,14 +2854,58 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
if (rwbf_required(iommu))
dmar_domain->iotlb_sync_map = true;
+ ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ if (!cap_fl1gp_support(iommu->cap))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
+
return &dmar_domain->domain;
}
+static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu,
+ unsigned int *top_level)
+{
+ unsigned int sagaw = cap_sagaw(iommu->cap);
+ unsigned int mgaw = cap_mgaw(iommu->cap);
+
+ /*
+ * Find the largest table size that both the mgaw and sagaw support.
+ * This sets the valid range of IOVA and the top starting level.
+ * Some HW may only support a 4 or 5 level walk but must limit IOVA to
+ * 3 levels.
+ */
+ if (mgaw > 48 && sagaw >= BIT(3)) {
+ *top_level = 4;
+ return min(57, mgaw);
+ } else if (mgaw > 39 && sagaw >= BIT(2)) {
+ *top_level = 3 + ffs(sagaw >> 3);
+ return min(48, mgaw);
+ } else if (mgaw > 30 && sagaw >= BIT(1)) {
+ *top_level = 2 + ffs(sagaw >> 2);
+ return min(39, mgaw);
+ }
+ return 0;
+}
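+
+/*
+ * Example of the sagaw decode above, per the VT-d capability SAGAW encoding:
+ * bit 1 = 3-level (39-bit), bit 2 = 4-level (48-bit), bit 3 = 5-level
+ * (57-bit) walks. An IOMMU with mgaw == 48 and sagaw == BIT(2) gets
+ * top_level == 3 and a 48-bit IOVA space; one with mgaw == 48 but only
+ * sagaw == BIT(3) still limits IOVA to 48 bits while walking a 5-level
+ * table (top_level == 4).
+ */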
+
+static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(vtdss),
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+};
+
static struct iommu_domain *
intel_iommu_domain_alloc_second_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_vtdss_cfg cfg = {};
struct dmar_domain *dmar_domain;
+ unsigned int sslps;
+ int ret;
if (flags &
(~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
@@ -3377,15 +2922,46 @@ intel_iommu_domain_alloc_second_stage(struct device *dev,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
- dmar_domain = paging_domain_alloc(dev, false);
+ dmar_domain = paging_domain_alloc();
if (IS_ERR(dmar_domain))
return ERR_CAST(dmar_domain);
+ cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
+
+ /*
+ * Read-only mapping is disallowed on the domain which serves as the
+ * parent in a nested configuration, due to HW errata
+ * (ERRATA_772415_SPR17)
+ */
+ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)
+ cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);
+
+ if (!iommu_paging_structure_coherency(iommu))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
- dmar_domain->domain.dirty_ops = &intel_dirty_ops;
+ dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops;
+
+ ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ /* Adjust the supported page sizes to HW capability */
+ sslps = cap_super_page_val(iommu->cap);
+ if (!(sslps & BIT(0)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M;
+ if (!(sslps & BIT(1)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
/*
* Besides the internal write buffer flush, the caching mode used for
@@ -3427,14 +3003,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain)
if (WARN_ON(!list_empty(&dmar_domain->devices)))
return;
- if (dmar_domain->pgd) {
- struct iommu_pages_list freelist =
- IOMMU_PAGES_LIST_INIT(freelist);
-
- domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw),
- &freelist);
- iommu_put_pages_list(&freelist);
- }
+ pt_iommu_deinit(&dmar_domain->iommu);
kfree(dmar_domain->qi_batch);
kfree(dmar_domain);
@@ -3451,6 +3020,16 @@ static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return -EINVAL;
+ if (!ecap_smpwc(iommu->ecap) &&
+ !(dmar_domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
+
+	/* The IOMMU must support the domain's number of table levels */
+ if (!cap_fl5lp_support(iommu->cap) &&
+ dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48)
+ return -EINVAL;
+
/* Same page size support */
if (!cap_fl1gp_support(iommu->cap) &&
(dmar_domain->domain.pgsize_bitmap & SZ_1G))
@@ -3467,7 +3046,11 @@ static int
paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
struct intel_iommu *iommu)
{
+ unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2;
unsigned int sslps = cap_super_page_val(iommu->cap);
+ struct pt_iommu_vtdss_hw_info pt_info;
+
+ pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info);
if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
return -EINVAL;
@@ -3478,6 +3061,19 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return -EINVAL;
+ if (!iommu_paging_structure_coherency(iommu) &&
+ !(dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
+
+ /* Address width falls within the capability */
+ if (cap_mgaw(iommu->cap) < vasz_lg2)
+ return -EINVAL;
+
+ /* Page table level is supported. */
+ if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
+ return -EINVAL;
+
/* Same page size support */
if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
return -EINVAL;
@@ -3489,6 +3085,14 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
!dmar_domain->iotlb_sync_map)
return -EINVAL;
+ /*
+ * FIXME this is locked wrong, it needs to be under the
+ * dmar_domain->lock
+ */
+ if ((dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) &&
+ !ecap_sc_support(iommu->ecap))
+ return -EINVAL;
return 0;
}
@@ -3498,7 +3102,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu = info->iommu;
int ret = -EINVAL;
- int addr_width;
if (intel_domain_is_fs_paging(dmar_domain))
ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
@@ -3509,26 +3112,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
if (ret)
return ret;
- /*
- * FIXME this is locked wrong, it needs to be under the
- * dmar_domain->lock
- */
- if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
- return -EINVAL;
-
- if (dmar_domain->iommu_coherency !=
- iommu_paging_structure_coherency(iommu))
- return -EINVAL;
-
-
- /* check if this iommu agaw is sufficient for max mapped address */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
-
- if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
- return -EINVAL;
-
if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
context_copied(iommu, info->bus, info->devfn))
return intel_pasid_setup_sm_context(dev);
@@ -3537,7 +3120,8 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret;
@@ -3558,110 +3142,6 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
return ret;
}
-static int intel_iommu_map(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot, gfp_t gfp)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- u64 max_addr;
- int prot = 0;
-
- if (iommu_prot & IOMMU_READ)
- prot |= DMA_PTE_READ;
- if (iommu_prot & IOMMU_WRITE)
- prot |= DMA_PTE_WRITE;
- if (dmar_domain->set_pte_snp)
- prot |= DMA_PTE_SNP;
-
- max_addr = iova + size;
- if (dmar_domain->max_addr < max_addr) {
- u64 end;
-
- /* check if minimum agaw is sufficient for mapped address */
- end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
- if (end < max_addr) {
- pr_err("%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, dmar_domain->gaw, max_addr);
- return -EFAULT;
- }
- dmar_domain->max_addr = max_addr;
- }
- /* Round up size to next multiple of PAGE_SIZE, if it and
- the low bits of hpa would take us onto the next page */
- size = aligned_nrpages(hpa, size);
- return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
- hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
-}
-
-static int intel_iommu_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
- int ret;
-
- if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
- return -EINVAL;
-
- if (!IS_ALIGNED(iova | paddr, pgsize))
- return -EINVAL;
-
- ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
- if (!ret && mapped)
- *mapped = size;
-
- return ret;
-}
-
-static size_t intel_iommu_unmap(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- struct iommu_iotlb_gather *gather)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long start_pfn, last_pfn;
- int level = 0;
-
- /* Cope with horrid API which requires us to unmap more than the
- size argument if it happens to be a large-page mapping. */
- if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
- &level, GFP_ATOMIC)))
- return 0;
-
- if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
- size = VTD_PAGE_SIZE << level_to_offset_bits(level);
-
- start_pfn = iova >> VTD_PAGE_SHIFT;
- last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
-
- domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
-
- if (dmar_domain->max_addr == iova + size)
- dmar_domain->max_addr = iova;
-
- /*
- * We do not use page-selective IOTLB invalidation in flush queue,
- * so there is no need to track page and sync iotlb.
- */
- if (!iommu_iotlb_gather_queued(gather))
- iommu_iotlb_gather_add_page(domain, gather, iova, size);
-
- return size;
-}
-
-static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
-
- return intel_iommu_unmap(domain, iova, size, gather);
-}
-
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
@@ -3671,24 +3151,6 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain,
iommu_put_pages_list(&gather->freelist);
}
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct dma_pte *pte;
- int level = 0;
- u64 phys = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
- GFP_ATOMIC);
- if (pte && dma_pte_present(pte))
- phys = dma_pte_addr(pte) +
- (iova & (BIT_MASK(level_to_offset_bits(level) +
- VTD_PAGE_SHIFT) - 1));
-
- return phys;
-}
-
static bool domain_support_force_snooping(struct dmar_domain *domain)
{
struct device_domain_info *info;
@@ -3730,15 +3192,15 @@ static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
guard(spinlock_irqsave)(&dmar_domain->lock);
- if (!domain_support_force_snooping(dmar_domain) ||
- dmar_domain->has_mappings)
+ if (!domain_support_force_snooping(dmar_domain))
return false;
/*
* Second level page table supports per-PTE snoop control. The
* iommu_map() interface will handle this by setting SNP bit.
*/
- dmar_domain->set_pte_snp = true;
+ dmar_domain->sspt.vtdss_pt.common.features |=
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);
dmar_domain->force_snooping = true;
return true;
}
@@ -4302,49 +3764,6 @@ err_unwind:
return ret;
}
-static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long end = iova + size - 1;
- unsigned long pgsize;
-
- /*
- * IOMMUFD core calls into a dirty tracking disabled domain without an
- * IOVA bitmap set in order to clean dirty bits in all PTEs that might
- * have occurred when we stopped dirty tracking. This ensures that we
- * never inherit dirtied bits from a previous cycle.
- */
- if (!dmar_domain->dirty_tracking && dirty->bitmap)
- return -EINVAL;
-
- do {
- struct dma_pte *pte;
- int lvl = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
- GFP_ATOMIC);
- pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
- if (!pte || !dma_pte_present(pte)) {
- iova += pgsize;
- continue;
- }
-
- if (dma_sl_pte_test_and_clear_dirty(pte, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-static const struct iommu_dirty_ops intel_dirty_ops = {
- .set_dirty_tracking = intel_iommu_set_dirty_tracking,
- .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
-};
-
static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
@@ -4401,7 +3820,9 @@ static int device_setup_pass_through(struct device *dev)
context_setup_pass_through_cb, dev);
}
-static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int identity_domain_attach_dev(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
@@ -4462,27 +3883,23 @@ static struct iommu_domain identity_domain = {
};
const struct iommu_domain_ops intel_fs_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
.free = intel_iommu_domain_free,
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
};
const struct iommu_domain_ops intel_ss_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(vtdss),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
.free = intel_iommu_domain_free,
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
};
@@ -4797,3 +4214,5 @@ err:
return ret;
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 3056583d7f56..25c5e22096d4 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -23,8 +23,8 @@
#include <linux/xarray.h>
#include <linux/perf_event.h>
#include <linux/pci.h>
+#include <linux/generic_pt/iommu.h>
-#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <uapi/linux/iommufd.h>
@@ -595,22 +595,20 @@ struct qi_batch {
};
struct dmar_domain {
- int nid; /* node id */
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ /* First stage page table */
+ struct pt_iommu_x86_64 fspt;
+ /* Second stage page table */
+ struct pt_iommu_vtdss sspt;
+ };
+
struct xarray iommu_array; /* Attached IOMMU array */
- u8 iommu_coherency: 1; /* indicate coherency of iommu access */
- u8 force_snooping : 1; /* Create IOPTEs with snoop control */
- u8 set_pte_snp:1;
- u8 use_first_level:1; /* DMA translation for the domain goes
- * through the first level page table,
- * otherwise, goes through the second
- * level.
- */
+ u8 force_snooping:1; /* Create PASID entry with snoop control */
u8 dirty_tracking:1; /* Dirty tracking is enabled */
u8 nested_parent:1; /* Has other domains nested on it */
- u8 has_mappings:1; /* Has mappings configured through
- * iommu_map() interface.
- */
u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write
* buffer when creating mappings.
*/
@@ -623,26 +621,9 @@ struct dmar_domain {
struct list_head cache_tags; /* Cache tag list */
struct qi_batch *qi_batch; /* Batched QI descriptors */
- int iommu_superpage;/* Level of superpages supported:
- 0 == 4KiB (no superpages), 1 == 2MiB,
- 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
union {
/* DMA remapping domain */
struct {
- /* virtual address */
- struct dma_pte *pgd;
- /* max guest address width */
- int gaw;
- /*
- * adjusted guest address width:
- * 0: level 2 30-bit
- * 1: level 3 39-bit
- * 2: level 4 48-bit
- * 3: level 5 57-bit
- */
- int agaw;
- /* maximum mapped address */
- u64 max_addr;
/* Protect the s1_domains list */
spinlock_t s1_lock;
/* Track s1_domains nested on this domain */
@@ -664,10 +645,10 @@ struct dmar_domain {
struct mmu_notifier notifier;
};
};
-
- struct iommu_domain domain; /* generic domain data structure for
- iommu core */
};
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain);
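The three PT_IOMMU_CHECK_DOMAIN() lines guard the union that now heads struct dmar_domain: every view (iommu, fspt, sspt) embeds the generic struct iommu_domain, and aliasing them through one union is only sound if that embedded domain sits at the same offset in each view. A rough sketch of the kind of compile-time check the macro is assumed to perform (the real definition comes from <linux/generic_pt/iommu.h>, which is not shown in this diff, and the .iommu/.domain member names below are assumptions):

	/* Illustration only -- assumed member layout, not the real macro body */
	static_assert(offsetof(struct dmar_domain, domain) ==
		      offsetof(struct dmar_domain, iommu.domain));
	static_assert(offsetof(struct dmar_domain, domain) ==
		      offsetof(struct dmar_domain, sspt.iommu.domain));
	static_assert(offsetof(struct dmar_domain, domain) ==
		      offsetof(struct dmar_domain, fspt.iommu.domain));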
/*
* In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
@@ -866,11 +847,6 @@ struct dma_pte {
u64 val;
};
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
- pte->val = 0;
-}
-
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
@@ -886,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
-static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
- unsigned long flags)
-{
- if (flags & IOMMU_DIRTY_NO_CLEAR)
- return (pte->val & DMA_SL_PTE_DIRTY) != 0;
-
- return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
- (unsigned long *)&pte->val);
-}
-
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
}
-static inline bool first_pte_in_page(struct dma_pte *pte)
-{
- return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
-}
-
-static inline int nr_pte_to_next_page(struct dma_pte *pte)
-{
- return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
- (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
-}
-
static inline bool context_present(struct context_entry *context)
{
return (context->lo & 1);
@@ -927,11 +882,6 @@ static inline int agaw_to_level(int agaw)
return agaw + 2;
}
-static inline int agaw_to_width(int agaw)
-{
- return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
-}
-
static inline int width_to_agaw(int width)
{
return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
@@ -947,25 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level)
return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}
-static inline u64 level_mask(int level)
-{
- return -1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 level_size(int level)
-{
- return 1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 align_to_level(u64 pfn, int level)
-{
- return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
- return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
-}
static inline void context_set_present(struct context_entry *context)
{
@@ -1097,7 +1028,7 @@ static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
struct qi_desc *desc)
{
u8 dw = 0, dr = 0;
- int ih = 0;
+ int ih = addr & 1;
if (cap_write_drain(iommu->cap))
dw = 1;
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 1b6ad9c900a5..a3fb8c193ca6 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -19,7 +19,7 @@
#include "pasid.h"
static int intel_nested_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
@@ -29,11 +29,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
device_block_translation(dev);
- if (iommu->agaw < dmar_domain->s2_domain->agaw) {
- dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
- return -ENODEV;
- }
-
/*
* Stage-1 domain cannot work alone, it is nested on a s2_domain.
* The s2_domain will be used in nested translation, hence needs
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 52f678975da7..3e2255057079 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -366,7 +366,7 @@ static void pasid_pte_config_first_level(struct intel_iommu *iommu,
pasid_set_domain_id(pte, did);
pasid_set_address_width(pte, iommu->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP);
/* Setup Present and PASID Granular Transfer Type: */
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
@@ -461,19 +461,22 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu,
*/
static void pasid_pte_config_second_level(struct intel_iommu *iommu,
struct pasid_entry *pte,
- u64 pgd_val, int agaw, u16 did,
- bool dirty_tracking)
+ struct dmar_domain *domain, u16 did)
{
+ struct pt_iommu_vtdss_hw_info pt_info;
+
lockdep_assert_held(&iommu->lock);
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
pasid_clear_entry(pte);
pasid_set_domain_id(pte, did);
- pasid_set_slptr(pte, pgd_val);
- pasid_set_address_width(pte, agaw);
+ pasid_set_slptr(pte, pt_info.ssptptr);
+ pasid_set_address_width(pte, pt_info.aw);
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
- if (dirty_tracking)
+ pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
+ if (domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_present(pte);
@@ -484,10 +487,9 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct device *dev, u32 pasid)
{
struct pasid_entry *pte;
- struct dma_pte *pgd;
- u64 pgd_val;
u16 did;
+
/*
* If hardware advertises no support for second level
* translation, return directly.
@@ -498,8 +500,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
spin_lock(&iommu->lock);
@@ -514,8 +514,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EBUSY;
}
- pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw,
- did, domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, pte, domain, did);
spin_unlock(&iommu->lock);
pasid_flush_caches(iommu, pte, pasid, did);
@@ -529,8 +528,6 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
u32 pasid)
{
struct pasid_entry *pte, new_pte;
- struct dma_pte *pgd;
- u64 pgd_val;
u16 did;
/*
@@ -543,13 +540,9 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
- pasid_pte_config_second_level(iommu, &new_pte, pgd_val,
- domain->agaw, did,
- domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, &new_pte, domain, did);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
@@ -747,10 +740,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
struct dmar_domain *s2_domain,
u16 did)
{
- struct dma_pte *pgd = s2_domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
lockdep_assert_held(&iommu->lock);
+ pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info);
+
pasid_clear_entry(pte);
if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
@@ -770,11 +765,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
if (s2_domain->force_snooping)
pasid_set_pgsnp(pte);
- pasid_set_slptr(pte, virt_to_phys(pgd));
+ pasid_set_slptr(pte, pt_info.ssptptr);
pasid_set_fault_enable(pte);
pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, s2_domain->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_address_width(pte, pt_info.aw);
+ pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
if (s2_domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index a771a77d4239..b4c85242dc79 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -24,6 +24,7 @@
#define PASID_FLAG_NESTED BIT(1)
#define PASID_FLAG_PAGE_SNOOP BIT(2)
+#define PASID_FLAG_PWSNP BIT(2)
/*
* The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first-
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index e147f71f91b7..71de7947971f 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -170,6 +170,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
/* Setup the pasid table: */
sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+ sflags |= PASID_FLAG_PWSNP;
ret = __domain_setup_first_level(iommu, dev, pasid,
FLPT_DEFAULT_DID, __pa(mm->pgd),
sflags, old);
diff --git a/drivers/iommu/io-pgtable-arm-selftests.c b/drivers/iommu/io-pgtable-arm-selftests.c
new file mode 100644
index 000000000000..334e70350924
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm-selftests.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU-agnostic ARM page table allocator.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt
+
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+
+#include "io-pgtable-arm.h"
+
+static struct io_pgtable_cfg *cfg_cookie;
+
+static void dummy_tlb_flush_all(void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_flush(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+ WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ dummy_tlb_flush(iova, granule, granule, cookie);
+}
+
+static const struct iommu_flush_ops dummy_tlb_ops = {
+ .tlb_flush_all = dummy_tlb_flush_all,
+ .tlb_flush_walk = dummy_tlb_flush,
+ .tlb_add_page = dummy_tlb_add_page,
+};
+
+#define __FAIL(test, i) ({ \
+ KUNIT_FAIL(test, "test failed for fmt idx %d\n", (i)); \
+ -EFAULT; \
+})
+
+static int arm_lpae_run_tests(struct kunit *test, struct io_pgtable_cfg *cfg)
+{
+ static const enum io_pgtable_fmt fmts[] = {
+ ARM_64_LPAE_S1,
+ ARM_64_LPAE_S2,
+ };
+
+ int i, j;
+ unsigned long iova;
+ size_t size, mapped;
+ struct io_pgtable_ops *ops;
+
+ for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
+ cfg_cookie = cfg;
+ ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
+ if (!ops) {
+ kunit_err(test, "failed to allocate io pgtable ops\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Initial sanity checks.
+ * Empty page tables shouldn't provide any translations.
+ */
+ if (ops->iova_to_phys(ops, 42))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, SZ_1G + 42))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, SZ_2G + 42))
+ return __FAIL(test, i);
+
+ /*
+ * Distinct mappings of different granule sizes.
+ */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ /* Overlapping mappings */
+ if (!ops->map_pages(ops, iova, iova + size, size, 1,
+ IOMMU_READ | IOMMU_NOEXEC,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(test, i);
+
+ iova += SZ_1G;
+ }
+
+ /* Full unmap */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42))
+ return __FAIL(test, i);
+
+ /* Remap full block */
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_WRITE, GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(test, i);
+
+ iova += SZ_1G;
+ }
+
+ /*
+	 * Map/unmap the last largest supported page of the IAS; this can
+	 * trigger corner cases in the concatenated page tables.
+ */
+ mapped = 0;
+ size = 1UL << __fls(cfg->pgsize_bitmap);
+ iova = (1UL << cfg->ias) - size;
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+ if (mapped != size)
+ return __FAIL(test, i);
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(test, i);
+
+ free_io_pgtable_ops(ops);
+ }
+
+ return 0;
+}
+
+static void arm_lpae_do_selftests(struct kunit *test)
+{
+ static const unsigned long pgsize[] = {
+ SZ_4K | SZ_2M | SZ_1G,
+ SZ_16K | SZ_32M,
+ SZ_64K | SZ_512M,
+ };
+
+ static const unsigned int address_size[] = {
+ 32, 36, 40, 42, 44, 48,
+ };
+
+ int i, j, k, pass = 0, fail = 0;
+ struct device *dev;
+ struct io_pgtable_cfg cfg = {
+ .tlb = &dummy_tlb_ops,
+ .coherent_walk = true,
+ .quirks = IO_PGTABLE_QUIRK_NO_WARN,
+ };
+
+ dev = kunit_device_register(test, "io-pgtable-test");
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev);
+ if (IS_ERR_OR_NULL(dev))
+ return;
+
+ cfg.iommu_dev = dev;
+
+ for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
+ for (j = 0; j < ARRAY_SIZE(address_size); ++j) {
+ /* Don't use ias > oas as it is not valid for stage-2. */
+ for (k = 0; k <= j; ++k) {
+ cfg.pgsize_bitmap = pgsize[i];
+ cfg.ias = address_size[k];
+ cfg.oas = address_size[j];
+ kunit_info(test, "pgsize_bitmap 0x%08lx, IAS %u OAS %u\n",
+ pgsize[i], cfg.ias, cfg.oas);
+ if (arm_lpae_run_tests(test, &cfg))
+ fail++;
+ else
+ pass++;
+ }
+ }
+ }
+
+ kunit_info(test, "completed with %d PASS %d FAIL\n", pass, fail);
+}
+
+static struct kunit_case io_pgtable_arm_test_cases[] = {
+ KUNIT_CASE(arm_lpae_do_selftests),
+ {},
+};
+
+static struct kunit_suite io_pgtable_arm_test = {
+ .name = "io-pgtable-arm-test",
+ .test_cases = io_pgtable_arm_test_cases,
+};
+
+kunit_test_suite(io_pgtable_arm_test);
+
+MODULE_DESCRIPTION("io-pgtable-arm library kunit tests");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 7e8e2216c294..e6626004b323 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -12,8 +12,6 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-#include <linux/device/faux.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -1267,204 +1265,3 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
.alloc = arm_mali_lpae_alloc_pgtable,
.free = arm_lpae_free_pgtable,
};
-
-#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
-
-static struct io_pgtable_cfg *cfg_cookie __initdata;
-
-static void __init dummy_tlb_flush_all(void *cookie)
-{
- WARN_ON(cookie != cfg_cookie);
-}
-
-static void __init dummy_tlb_flush(unsigned long iova, size_t size,
- size_t granule, void *cookie)
-{
- WARN_ON(cookie != cfg_cookie);
- WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
-}
-
-static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t granule,
- void *cookie)
-{
- dummy_tlb_flush(iova, granule, granule, cookie);
-}
-
-static const struct iommu_flush_ops dummy_tlb_ops __initconst = {
- .tlb_flush_all = dummy_tlb_flush_all,
- .tlb_flush_walk = dummy_tlb_flush,
- .tlb_add_page = dummy_tlb_add_page,
-};
-
-static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
-{
- struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &data->iop.cfg;
-
- pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n",
- cfg->pgsize_bitmap, cfg->ias);
- pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n",
- ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data),
- ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd);
-}
-
-#define __FAIL(ops, i) ({ \
- WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \
- arm_lpae_dump_ops(ops); \
- -EFAULT; \
-})
-
-static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
-{
- static const enum io_pgtable_fmt fmts[] __initconst = {
- ARM_64_LPAE_S1,
- ARM_64_LPAE_S2,
- };
-
- int i, j;
- unsigned long iova;
- size_t size, mapped;
- struct io_pgtable_ops *ops;
-
- for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
- cfg_cookie = cfg;
- ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
- if (!ops) {
- pr_err("selftest: failed to allocate io pgtable ops\n");
- return -ENOMEM;
- }
-
- /*
- * Initial sanity checks.
- * Empty page tables shouldn't provide any translations.
- */
- if (ops->iova_to_phys(ops, 42))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_1G + 42))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_2G + 42))
- return __FAIL(ops, i);
-
- /*
- * Distinct mappings of different granule sizes.
- */
- iova = 0;
- for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
- size = 1UL << j;
-
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_READ | IOMMU_WRITE |
- IOMMU_NOEXEC | IOMMU_CACHE,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- /* Overlapping mappings */
- if (!ops->map_pages(ops, iova, iova + size, size, 1,
- IOMMU_READ | IOMMU_NOEXEC,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
- return __FAIL(ops, i);
-
- iova += SZ_1G;
- }
-
- /* Full unmap */
- iova = 0;
- for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
- size = 1UL << j;
-
- if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42))
- return __FAIL(ops, i);
-
- /* Remap full block */
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_WRITE, GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
- return __FAIL(ops, i);
-
- iova += SZ_1G;
- }
-
- /*
- * Map/unmap the last largest supported page of the IAS, this can
- * trigger corner cases in the concatednated page tables.
- */
- mapped = 0;
- size = 1UL << __fls(cfg->pgsize_bitmap);
- iova = (1UL << cfg->ias) - size;
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_READ | IOMMU_WRITE |
- IOMMU_NOEXEC | IOMMU_CACHE,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
- if (mapped != size)
- return __FAIL(ops, i);
- if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- free_io_pgtable_ops(ops);
- }
-
- return 0;
-}
-
-static int __init arm_lpae_do_selftests(void)
-{
- static const unsigned long pgsize[] __initconst = {
- SZ_4K | SZ_2M | SZ_1G,
- SZ_16K | SZ_32M,
- SZ_64K | SZ_512M,
- };
-
- static const unsigned int address_size[] __initconst = {
- 32, 36, 40, 42, 44, 48,
- };
-
- int i, j, k, pass = 0, fail = 0;
- struct faux_device *dev;
- struct io_pgtable_cfg cfg = {
- .tlb = &dummy_tlb_ops,
- .coherent_walk = true,
- .quirks = IO_PGTABLE_QUIRK_NO_WARN,
- };
-
- dev = faux_device_create("io-pgtable-test", NULL, 0);
- if (!dev)
- return -ENOMEM;
-
- cfg.iommu_dev = &dev->dev;
-
- for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
- for (j = 0; j < ARRAY_SIZE(address_size); ++j) {
- /* Don't use ias > oas as it is not valid for stage-2. */
- for (k = 0; k <= j; ++k) {
- cfg.pgsize_bitmap = pgsize[i];
- cfg.ias = address_size[k];
- cfg.oas = address_size[j];
- pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u OAS %u\n",
- pgsize[i], cfg.ias, cfg.oas);
- if (arm_lpae_run_tests(&cfg))
- fail++;
- else
- pass++;
- }
- }
- }
-
- pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail);
- faux_device_destroy(dev);
-
- return fail ? -EFAULT : 0;
-}
-subsys_initcall(arm_lpae_do_selftests);
-#endif
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 8841c1487f00..843fec8e8a51 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
#endif
-#ifdef CONFIG_AMD_IOMMU
- [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
- [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
-#endif
};
static int check_custom_allocator(enum io_pgtable_fmt fmt,
diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c
index 238c09e5166b..3bab175d8557 100644
--- a/drivers/iommu/iommu-pages.c
+++ b/drivers/iommu/iommu-pages.c
@@ -4,6 +4,7 @@
* Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#include "iommu-pages.h"
+#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>
@@ -22,6 +23,11 @@ IOPTDESC_MATCH(memcg_data, memcg_data);
#undef IOPTDESC_MATCH
static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
+static inline size_t ioptdesc_mem_size(struct ioptdesc *desc)
+{
+ return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT);
+}
+
/**
* iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from
* specific NUMA node
@@ -36,6 +42,7 @@ static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
*/
void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
{
+ struct ioptdesc *iopt;
unsigned long pgcnt;
struct folio *folio;
unsigned int order;
@@ -60,6 +67,9 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
if (unlikely(!folio))
return NULL;
+ iopt = folio_ioptdesc(folio);
+ iopt->incoherent = false;
+
/*
* All page allocations that should be reported to as "iommu-pagetables"
* to userspace must use one of the functions below. This includes
@@ -80,7 +90,10 @@ EXPORT_SYMBOL_GPL(iommu_alloc_pages_node_sz);
static void __iommu_free_desc(struct ioptdesc *iopt)
{
struct folio *folio = ioptdesc_folio(iopt);
- const unsigned long pgcnt = 1UL << folio_order(folio);
+ const unsigned long pgcnt = folio_nr_pages(folio);
+
+ if (IOMMU_PAGES_USE_DMA_API)
+ WARN_ON_ONCE(iopt->incoherent);
mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt);
lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt);
@@ -117,3 +130,124 @@ void iommu_put_pages_list(struct iommu_pages_list *list)
__iommu_free_desc(iopt);
}
EXPORT_SYMBOL_GPL(iommu_put_pages_list);
+
+/**
+ * iommu_pages_start_incoherent - Setup the page for cache incoherent operation
+ * @virt: The page to setup
+ * @dma_dev: The iommu device
+ *
+ * For incoherent memory this will use the DMA API to manage the cache flushing
+ * on some arches. This is a lot of complexity compared to just calling
+ * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers
+ * have been doing. The DMA API requires keeping track of the DMA map and
+ * freeing it when required. This keeps track of the dma map inside the ioptdesc
+ * so that error paths are simple for the caller.
+ */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+ dma_addr_t dma;
+
+ if (WARN_ON(iopt->incoherent))
+ return -EINVAL;
+
+ if (!IOMMU_PAGES_USE_DMA_API) {
+ iommu_pages_flush_incoherent(dma_dev, virt, 0,
+ ioptdesc_mem_size(iopt));
+ } else {
+ dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, dma))
+ return -EINVAL;
+
+ /*
+ * The DMA API is not allowed to do anything other than DMA
+ * direct. It would be nice to also check
+ * dev_is_dma_coherent(dma_dev));
+ */
+ if (WARN_ON(dma != virt_to_phys(virt))) {
+ dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ iopt->incoherent = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent);
+
+/**
+ * iommu_pages_start_incoherent_list - Make a list of pages incoherent
+ * @list: The list of pages to setup
+ * @dma_dev: The iommu device
+ *
+ * Perform iommu_pages_start_incoherent() across all pages in the list.
+ *
+ * If this fails the caller must call iommu_pages_stop_incoherent_list().
+ */
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+ int ret;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ if (WARN_ON(cur->incoherent))
+ continue;
+
+ ret = iommu_pages_start_incoherent(
+ folio_address(ioptdesc_folio(cur)), dma_dev);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list);
+
+/**
+ * iommu_pages_stop_incoherent_list - Undo incoherence across a list
+ * @list: The list of pages to release
+ * @dma_dev: The iommu device
+ *
+ * Revert iommu_pages_start_incoherent() across all of the list. Pages for
+ * which iommu_pages_start_incoherent() was not called, or did not succeed,
+ * are ignored.
+ */
+#if IOMMU_PAGES_USE_DMA_API
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ struct folio *folio = ioptdesc_folio(cur);
+
+ if (!cur->incoherent)
+ continue;
+ dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)),
+ ioptdesc_mem_size(cur), DMA_TO_DEVICE);
+ cur->incoherent = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list);
+
+/**
+ * iommu_pages_free_incoherent - Free an incoherent page
+ * @virt: virtual address of the page to be freed.
+ * @dma_dev: The iommu device
+ *
+ * If the page is incoherent it is made coherent again and then freed.
+ */
+void iommu_pages_free_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+
+ if (iopt->incoherent) {
+ dma_unmap_single(dma_dev, virt_to_phys(virt),
+ ioptdesc_mem_size(iopt), DMA_TO_DEVICE);
+ iopt->incoherent = 0;
+ }
+ __iommu_free_desc(iopt);
+}
+EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent);
+#endif
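Taken together, the incoherent-page helpers give a driver with a non-coherent page-table walker one lifecycle: allocate the table, start incoherent operation (which DMA-maps the page on most arches and cache-flushes it on x86), flush each PTE update, and free through the incoherent variant so any DMA mapping is released. The following is a minimal, hypothetical caller sketch; the example_* helpers and the SZ_4K table size are invented for illustration and are not taken from any driver in this series:

	static int example_alloc_table(struct device *iommu_dev, u64 **tablep)
	{
		u64 *table;
		int ret;

		table = iommu_alloc_pages_node_sz(dev_to_node(iommu_dev),
						  GFP_KERNEL, SZ_4K);
		if (!table)
			return -ENOMEM;

		/* Publish the zeroed table to the non-coherent walker */
		ret = iommu_pages_start_incoherent(table, iommu_dev);
		if (ret) {
			iommu_free_pages(table);
			return ret;
		}

		*tablep = table;
		return 0;
	}

	static void example_set_pte(struct device *iommu_dev, u64 *table,
				    unsigned int idx, u64 pte)
	{
		WRITE_ONCE(table[idx], pte);
		/* Push only the modified PTE out of the CPU cache */
		iommu_pages_flush_incoherent(iommu_dev, table,
					     idx * sizeof(u64), sizeof(u64));
	}

	static void example_free_table(struct device *iommu_dev, u64 *table)
	{
		/* Drops the DMA mapping (if any) before freeing the page */
		iommu_pages_free_incoherent(table, iommu_dev);
	}

Because the x86 variants compile down to clflush-based flushing with no DMA mapping, the same calling sequence works unchanged on both kinds of arches.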
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
index b3af2813ed0c..ae9da4f571f6 100644
--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@ -21,7 +21,10 @@ struct ioptdesc {
struct list_head iopt_freelist_elm;
unsigned long __page_mapping;
- pgoff_t __index;
+ union {
+ u8 incoherent;
+ pgoff_t __index;
+ };
void *_private;
unsigned int __page_type;
@@ -98,4 +101,48 @@ static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size)
return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size);
}
-#endif /* __IOMMU_PAGES_H */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev);
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+
+#ifdef CONFIG_X86
+#define IOMMU_PAGES_USE_DMA_API 0
+#include <linux/cacheflush.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
+{
+ clflush_cache_range(virt + offset, len);
+}
+static inline void
+iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ /*
+	 * For performance, leave the incoherent flag alone, which turns this
+	 * into a NOP. On x86 the rest of the stop/free flow ignores the flag.
+ */
+}
+static inline void iommu_pages_free_incoherent(void *virt,
+ struct device *dma_dev)
+{
+ iommu_free_pages(virt);
+}
+#else
+#define IOMMU_PAGES_USE_DMA_API 1
+#include <linux/dma-mapping.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
+{
+ dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len,
+ DMA_TO_DEVICE);
+}
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+void iommu_pages_free_incoherent(void *virt, struct device *dma_dev);
+#endif
+
+#endif /* __IOMMU_PAGES_H */
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 59244c744eab..2ca990dfbb88 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -100,7 +100,7 @@ static int iommu_bus_notifier(struct notifier_block *nb,
unsigned long action, void *data);
static void iommu_release_device(struct device *dev);
static int __iommu_attach_device(struct iommu_domain *domain,
- struct device *dev);
+ struct device *dev, struct iommu_domain *old);
static int __iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group);
static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev,
@@ -114,6 +114,7 @@ enum {
static int __iommu_device_set_domain(struct iommu_group *group,
struct device *dev,
struct iommu_domain *new_domain,
+ struct iommu_domain *old_domain,
unsigned int flags);
static int __iommu_group_set_domain_internal(struct iommu_group *group,
struct iommu_domain *new_domain,
@@ -542,8 +543,21 @@ static void iommu_deinit_device(struct device *dev)
* Regardless, if a delayed attach never occurred, then the release
* should still avoid touching any hardware configuration either.
*/
- if (!dev->iommu->attach_deferred && ops->release_domain)
- ops->release_domain->ops->attach_dev(ops->release_domain, dev);
+ if (!dev->iommu->attach_deferred && ops->release_domain) {
+ struct iommu_domain *release_domain = ops->release_domain;
+
+ /*
+ * If the device requires direct mappings then it should not
+ * be parked on a BLOCKED domain during release as that would
+ * break the direct mappings.
+ */
+ if (dev->iommu->require_direct && ops->identity_domain &&
+ release_domain == ops->blocked_domain)
+ release_domain = ops->identity_domain;
+
+ release_domain->ops->attach_dev(release_domain, dev,
+ group->domain);
+ }
if (ops->release_device)
ops->release_device(dev);
@@ -628,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
if (group->default_domain)
iommu_create_device_direct_mappings(group->default_domain, dev);
if (group->domain) {
- ret = __iommu_device_set_domain(group, dev, group->domain, 0);
+ ret = __iommu_device_set_domain(group, dev, group->domain, NULL,
+ 0);
if (ret)
goto err_remove_gdev;
} else if (!group->default_domain && !group_list) {
@@ -2115,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group)
}
static int __iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
int ret;
if (unlikely(domain->ops->attach_dev == NULL))
return -ENODEV;
- ret = domain->ops->attach_dev(domain, dev);
+ ret = domain->ops->attach_dev(domain, dev, old);
if (ret)
return ret;
dev->iommu->attach_deferred = 0;
@@ -2171,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
{
if (dev->iommu && dev->iommu->attach_deferred)
- return __iommu_attach_device(domain, dev);
+ return __iommu_attach_device(domain, dev, NULL);
return 0;
}
@@ -2284,6 +2299,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_group);
static int __iommu_device_set_domain(struct iommu_group *group,
struct device *dev,
struct iommu_domain *new_domain,
+ struct iommu_domain *old_domain,
unsigned int flags)
{
int ret;
@@ -2309,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group,
dev->iommu->attach_deferred = 0;
}
- ret = __iommu_attach_device(new_domain, dev);
+ ret = __iommu_attach_device(new_domain, dev, old_domain);
if (ret) {
/*
* If we have a blocking domain then try to attach that in hopes
@@ -2319,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group,
if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) &&
group->blocking_domain &&
group->blocking_domain != new_domain)
- __iommu_attach_device(group->blocking_domain, dev);
+ __iommu_attach_device(group->blocking_domain, dev,
+ old_domain);
return ret;
}
return 0;
@@ -2366,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
result = 0;
for_each_group_device(group, gdev) {
ret = __iommu_device_set_domain(group, gdev->dev, new_domain,
- flags);
+ group->domain, flags);
if (ret) {
result = ret;
/*
@@ -2391,6 +2408,9 @@ err_revert:
*/
last_gdev = gdev;
for_each_group_device(group, gdev) {
+ /* No need to revert the last gdev that failed to set domain */
+ if (gdev == last_gdev)
+ break;
/*
* A NULL domain can happen only for first probe, in which case
* we leave group->domain as NULL and let release clean
@@ -2398,10 +2418,8 @@ err_revert:
*/
if (group->domain)
WARN_ON(__iommu_device_set_domain(
- group, gdev->dev, group->domain,
+ group, gdev->dev, group->domain, new_domain,
IOMMU_SET_DOMAIN_MUST_SUCCEED));
- if (gdev == last_gdev)
- break;
}
return ret;
}
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
index 2beeb4f60ee5..eae3f03629b0 100644
--- a/drivers/iommu/iommufd/Kconfig
+++ b/drivers/iommu/iommufd/Kconfig
@@ -41,6 +41,7 @@ config IOMMUFD_TEST
depends on DEBUG_KERNEL
depends on FAULT_INJECTION
depends on RUNTIME_TESTING_MENU
+ depends on IOMMU_PT_AMDV1
select IOMMUFD_DRIVER
default n
help
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index 6f1010da221c..21d4a35538f6 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -161,8 +161,8 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
vevent = &veventq->lost_events_header;
goto out_set_header;
}
- memcpy(vevent->event_data, event_data, data_len);
vevent->data_len = data_len;
+ memcpy(vevent->event_data, event_data, data_len);
veventq->num_events++;
out_set_header:
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index c0360c450880..54cf4d856179 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -8,8 +8,10 @@
* The datastructure uses the iopt_pages to optimize the storage of the PFNs
* between the domains and xarray.
*/
+#include <linux/dma-buf.h>
#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/file.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
@@ -284,6 +286,9 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
case IOPT_ADDRESS_FILE:
start = elm->start_byte + elm->pages->start;
break;
+ case IOPT_ADDRESS_DMABUF:
+ start = elm->start_byte + elm->pages->dmabuf.start;
+ break;
}
rc = iopt_alloc_iova(iopt, dst_iova, start, length);
if (rc)
@@ -468,25 +473,53 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
* @iopt: io_pagetable to act on
* @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
* the chosen iova on output. Otherwise is the iova to map to on input
- * @file: file to map
+ * @fd: File descriptor of the file or dma-buf to map
* @start: map file starting at this byte offset
* @length: Number of bytes to map
* @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
* @flags: IOPT_ALLOC_IOVA or zero
*/
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
- unsigned long *iova, struct file *file,
- unsigned long start, unsigned long length,
- int iommu_prot, unsigned int flags)
+ unsigned long *iova, int fd, unsigned long start,
+ unsigned long length, int iommu_prot,
+ unsigned int flags)
{
struct iopt_pages *pages;
+ struct dma_buf *dmabuf;
+ unsigned long start_byte;
+ unsigned long last;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(start, length - 1, &last))
+ return -EOVERFLOW;
+
+ start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
+ dmabuf = dma_buf_get(fd);
+ if (!IS_ERR(dmabuf)) {
+ pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
+ length,
+ iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages)) {
+ dma_buf_put(dmabuf);
+ return PTR_ERR(pages);
+ }
+ } else {
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ pages = iopt_alloc_file_pages(file, start_byte, start, length,
+ iommu_prot & IOMMU_WRITE);
+ fput(file);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ }
- pages = iopt_alloc_file_pages(file, start, length,
- iommu_prot & IOMMU_WRITE);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
return iopt_map_common(ictx, iopt, pages, iova, length,
- start - pages->start, iommu_prot, flags);
+ start_byte, iommu_prot, flags);
}
struct iova_bitmap_fn_arg {
@@ -707,7 +740,8 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
struct iopt_area *area;
unsigned long unmapped_bytes = 0;
unsigned int tries = 0;
- int rc = -ENOENT;
+ /* If there are no mapped entries then success */
+ int rc = 0;
/*
* The domains_rwsem must be held in read mode any time any area->pages
@@ -777,8 +811,6 @@ again:
down_write(&iopt->iova_rwsem);
}
- if (unmapped_bytes)
- rc = 0;
out_unlock_iova:
up_write(&iopt->iova_rwsem);
@@ -815,13 +847,8 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
- int rc;
-
- rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
/* If the IOVAs are empty then unmap all succeeds */
- if (rc == -ENOENT)
- return 0;
- return rc;
+ return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
}
/* The caller must always free all the nodes in the allowed_iova rb_root. */
@@ -967,9 +994,15 @@ static void iopt_unfill_domain(struct io_pagetable *iopt,
WARN_ON(!area->storage_domain);
if (area->storage_domain == domain)
area->storage_domain = storage_domain;
+ if (iopt_is_dmabuf(pages)) {
+ if (!iopt_dmabuf_revoked(pages))
+ iopt_area_unmap_domain(area, domain);
+ iopt_dmabuf_untrack_domain(pages, area, domain);
+ }
mutex_unlock(&pages->mutex);
- iopt_area_unmap_domain(area, domain);
+ if (!iopt_is_dmabuf(pages))
+ iopt_area_unmap_domain(area, domain);
}
return;
}
@@ -986,6 +1019,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt,
WARN_ON(area->storage_domain != domain);
area->storage_domain = NULL;
iopt_area_unfill_domain(area, pages, domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
mutex_unlock(&pages->mutex);
}
}
@@ -1015,10 +1050,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt,
if (!pages)
continue;
- mutex_lock(&pages->mutex);
+ guard(mutex)(&pages->mutex);
+ if (iopt_is_dmabuf(pages)) {
+ rc = iopt_dmabuf_track_domain(pages, area, domain);
+ if (rc)
+ goto out_unfill;
+ }
rc = iopt_area_fill_domain(area, domain);
if (rc) {
- mutex_unlock(&pages->mutex);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
goto out_unfill;
}
if (!area->storage_domain) {
@@ -1027,7 +1068,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt,
interval_tree_insert(&area->pages_node,
&pages->domains_itree);
}
- mutex_unlock(&pages->mutex);
}
return 0;
@@ -1048,6 +1088,8 @@ out_unfill:
area->storage_domain = NULL;
}
iopt_area_unfill_domain(area, pages, domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
mutex_unlock(&pages->mutex);
}
return rc;
@@ -1258,6 +1300,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
if (!pages || area->prevent_access)
return -EBUSY;
+ /* Maintaining the domains_itree below is a bit complicated */
+ if (iopt_is_dmabuf(pages))
+ return -EOPNOTSUPP;
+
if (new_start & (alignment - 1) ||
iopt_area_start_byte(area, new_start) & (alignment - 1))
return -EINVAL;
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index b6064f4ce4af..14cd052fd320 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -5,6 +5,7 @@
#ifndef __IO_PAGETABLE_H
#define __IO_PAGETABLE_H
+#include <linux/dma-buf.h>
#include <linux/interval_tree.h>
#include <linux/kref.h>
#include <linux/mutex.h>
@@ -69,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
void iopt_area_unmap_domain(struct iopt_area *area,
struct iommu_domain *domain);
+int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area,
+ struct iommu_domain *domain);
+void iopt_dmabuf_untrack_domain(struct iopt_pages *pages,
+ struct iopt_area *area,
+ struct iommu_domain *domain);
+int iopt_dmabuf_track_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages);
+void iopt_dmabuf_untrack_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages);
+
static inline unsigned long iopt_area_index(struct iopt_area *area)
{
return area->pages_node.start;
@@ -179,7 +190,22 @@ enum {
enum iopt_address_type {
IOPT_ADDRESS_USER = 0,
- IOPT_ADDRESS_FILE = 1,
+ IOPT_ADDRESS_FILE,
+ IOPT_ADDRESS_DMABUF,
+};
+
+struct iopt_pages_dmabuf_track {
+ struct iommu_domain *domain;
+ struct iopt_area *area;
+ struct list_head elm;
+};
+
+struct iopt_pages_dmabuf {
+ struct dma_buf_attachment *attach;
+ struct dma_buf_phys_vec phys;
+ /* Always PAGE_SIZE aligned */
+ unsigned long start;
+ struct list_head tracker;
};
/*
@@ -209,6 +235,8 @@ struct iopt_pages {
struct file *file;
unsigned long start;
};
+ /* IOPT_ADDRESS_DMABUF */
+ struct iopt_pages_dmabuf dmabuf;
};
bool writable:1;
u8 account_mode;
@@ -220,10 +248,32 @@ struct iopt_pages {
struct rb_root_cached domains_itree;
};
+static inline bool iopt_is_dmabuf(struct iopt_pages *pages)
+{
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return false;
+ return pages->type == IOPT_ADDRESS_DMABUF;
+}
+
+static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages)
+{
+ lockdep_assert_held(&pages->mutex);
+ if (iopt_is_dmabuf(pages))
+ return pages->dmabuf.phys.len == 0;
+ return false;
+}
+
struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
unsigned long length, bool writable);
-struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+struct iopt_pages *iopt_alloc_file_pages(struct file *file,
+ unsigned long start_byte,
+ unsigned long start,
unsigned long length, bool writable);
+struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx,
+ struct dma_buf *dmabuf,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable);
void iopt_release_pages(struct kref *kref);
static inline void iopt_put_pages(struct iopt_pages *pages)
{
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 1542c5fd10a8..f4721afedadc 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -207,7 +207,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
unsigned long iova = cmd->iova;
struct iommufd_ioas *ioas;
unsigned int flags = 0;
- struct file *file;
int rc;
if (cmd->flags &
@@ -229,11 +228,7 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
flags = IOPT_ALLOC_IOVA;
- file = fget(cmd->fd);
- if (!file)
- return -EBADF;
-
- rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file,
+ rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd,
cmd->start, cmd->length,
conv_iommu_prot(cmd->flags), flags);
if (rc)
@@ -243,7 +238,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put:
iommufd_put_object(ucmd->ictx, &ioas->obj);
- fput(file);
return rc;
}
@@ -367,6 +361,10 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
&unmapped);
if (rc)
goto out_put;
+ if (!unmapped) {
+ rc = -ENOENT;
+ goto out_put;
+ }
}
cmd->length = unmapped;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 627f9b78483a..eb6d1a70f673 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -19,6 +19,8 @@ struct iommu_domain;
struct iommu_group;
struct iommu_option;
struct iommufd_device;
+struct dma_buf_attachment;
+struct dma_buf_phys_vec;
struct iommufd_sw_msi_map {
struct list_head sw_msi_item;
@@ -108,7 +110,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long length, int iommu_prot,
unsigned int flags);
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
- unsigned long *iova, struct file *file,
+ unsigned long *iova, int fd,
unsigned long start, unsigned long length,
int iommu_prot, unsigned int flags);
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
@@ -504,6 +506,8 @@ void iommufd_device_pre_destroy(struct iommufd_object *obj);
void iommufd_device_destroy(struct iommufd_object *obj);
int iommufd_get_hw_info(struct iommufd_ucmd *ucmd);
+struct device *iommufd_global_device(void);
+
struct iommufd_access {
struct iommufd_object obj;
struct iommufd_ctx *ictx;
@@ -614,7 +618,6 @@ struct iommufd_veventq {
struct iommufd_eventq common;
struct iommufd_viommu *viommu;
struct list_head node; /* for iommufd_viommu::veventqs */
- struct iommufd_vevent lost_events_header;
enum iommu_veventq_type type;
unsigned int depth;
@@ -622,6 +625,9 @@ struct iommufd_veventq {
/* Use common.lock for protection */
u32 num_events;
u32 sequence;
+
+ /* Must be last as it ends in a flexible-array member. */
+ struct iommufd_vevent lost_events_header;
};
static inline struct iommufd_veventq *
@@ -711,6 +717,8 @@ bool iommufd_should_fail(void);
int __init iommufd_test_init(void);
void iommufd_test_exit(void);
bool iommufd_selftest_is_mock_dev(struct device *dev);
+int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys);
#else
static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
unsigned int ioas_id,
@@ -732,5 +740,11 @@ static inline bool iommufd_selftest_is_mock_dev(struct device *dev)
{
return false;
}
+static inline int
+iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ return -EOPNOTSUPP;
+}
#endif
#endif
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 8fc618b2bcf9..73e73e1ec158 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -29,11 +29,22 @@ enum {
IOMMU_TEST_OP_PASID_REPLACE,
IOMMU_TEST_OP_PASID_DETACH,
IOMMU_TEST_OP_PASID_CHECK_HWPT,
+ IOMMU_TEST_OP_DMABUF_GET,
+ IOMMU_TEST_OP_DMABUF_REVOKE,
};
enum {
+ MOCK_IOMMUPT_DEFAULT = 0,
+ MOCK_IOMMUPT_HUGE,
+ MOCK_IOMMUPT_AMDV1,
+};
+
+/* These values are true for MOCK_IOMMUPT_DEFAULT */
+enum {
MOCK_APERTURE_START = 1UL << 24,
MOCK_APERTURE_LAST = (1UL << 31) - 1,
+ MOCK_PAGE_SIZE = 2048,
+ MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE,
};
enum {
@@ -52,7 +63,6 @@ enum {
enum {
MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
- MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1,
MOCK_FLAGS_DEVICE_PASID = 1 << 2,
};
@@ -176,6 +186,14 @@ struct iommu_test_cmd {
__u32 hwpt_id;
/* @id is stdev_id */
} pasid_check;
+ struct {
+ __u32 length;
+ __u32 open_flags;
+ } dmabuf_get;
+ struct {
+ __s32 dmabuf_fd;
+ __u32 revoked;
+ } dmabuf_revoke;
};
__u32 last;
};
@@ -205,6 +223,7 @@ struct iommu_test_hw_info {
*/
struct iommu_hwpt_selftest {
__u32 iotlb;
+ __u32 pagetable_type;
};
/* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */
diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c
index 4514575818fc..b5b67a9d3fb3 100644
--- a/drivers/iommu/iommufd/iova_bitmap.c
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -130,9 +130,8 @@ struct iova_bitmap {
static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
unsigned long iova)
{
- unsigned long pgsize = 1UL << bitmap->mapped.pgshift;
-
- return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize);
+ return (iova >> bitmap->mapped.pgshift) /
+ BITS_PER_TYPE(*bitmap->bitmap);
}
/*
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index ce775fbbae94..5cc4b08c25f5 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -751,6 +751,15 @@ static struct miscdevice vfio_misc_dev = {
.mode = 0666,
};
+/*
+ * Used only by DMABUF; returns a valid struct device to use as a dummy struct
+ * device for attachment.
+ */
+struct device *iommufd_global_device(void)
+{
+ return iommu_misc_dev.this_device;
+}
+
static int __init iommufd_init(void)
{
int ret;
@@ -794,5 +803,6 @@ MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS("IOMMUFD_INTERNAL");
MODULE_IMPORT_NS("IOMMUFD");
+MODULE_IMPORT_NS("DMA_BUF");
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index c3433b845561..dbe51ecb9a20 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -45,6 +45,8 @@
* last_iova + 1 can overflow. An iopt_pages index will always be much less than
* ULONG_MAX so last_index + 1 cannot overflow.
*/
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
@@ -53,6 +55,7 @@
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
+#include <linux/vfio_pci_core.h>
#include "double_span.h"
#include "io_pagetable.h"
@@ -258,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages,
return container_of(node, struct iopt_area, pages_node);
}
+enum batch_kind {
+ BATCH_CPU_MEMORY = 0,
+ BATCH_MMIO,
+};
+
/*
* A simple datastructure to hold a vector of PFNs, optimized for contiguous
* PFNs. This is used as a temporary holding memory for shuttling pfns from one
@@ -271,7 +279,9 @@ struct pfn_batch {
unsigned int array_size;
unsigned int end;
unsigned int total_pfns;
+ enum batch_kind kind;
};
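+/* MAX_NPFNS: the largest count one npfns[] entry can hold, set by its integer type. */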
+enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) };
static void batch_clear(struct pfn_batch *batch)
{
@@ -348,11 +358,17 @@ static void batch_destroy(struct pfn_batch *batch, void *backup)
}
static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn,
- u32 nr)
+ u32 nr, enum batch_kind kind)
{
- const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));
unsigned int end = batch->end;
+ if (batch->kind != kind) {
+ /* One kind per batch */
+ if (batch->end != 0)
+ return false;
+ batch->kind = kind;
+ }
+
if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] &&
nr <= MAX_NPFNS - batch->npfns[end - 1]) {
batch->npfns[end - 1] += nr;
@@ -379,7 +395,7 @@ static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr)
/* true if the pfn was added, false otherwise */
static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
{
- return batch_add_pfn_num(batch, pfn, 1);
+ return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY);
}
/*
@@ -492,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
{
bool disable_large_pages = area->iopt->disable_large_pages;
unsigned long last_iova = iopt_area_last_iova(area);
+ int iommu_prot = area->iommu_prot;
unsigned int page_offset = 0;
unsigned long start_iova;
unsigned long next_iova;
@@ -499,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
unsigned long iova;
int rc;
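+ /* MMIO batches map device memory: drop IOMMU_CACHE and set IOMMU_MMIO. */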
+ if (batch->kind == BATCH_MMIO) {
+ iommu_prot &= ~IOMMU_CACHE;
+ iommu_prot |= IOMMU_MMIO;
+ }
+
/* The first index might be a partial page */
if (start_index == iopt_area_index(area))
page_offset = area->page_offset;
@@ -512,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
rc = batch_iommu_map_small(
domain, iova,
PFN_PHYS(batch->pfns[cur]) + page_offset,
- next_iova - iova, area->iommu_prot);
+ next_iova - iova, iommu_prot);
else
rc = iommu_map(domain, iova,
PFN_PHYS(batch->pfns[cur]) + page_offset,
- next_iova - iova, area->iommu_prot,
+ next_iova - iova, iommu_prot,
GFP_KERNEL_ACCOUNT);
if (rc)
goto err_unmap;
@@ -652,7 +674,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
nr = min(nr, npages);
npages -= nr;
- if (!batch_add_pfn_num(batch, pfn, nr))
+ if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY))
break;
if (nr > 1) {
rc = folio_add_pins(folio, nr - 1);
@@ -1054,6 +1076,41 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
return iopt_pages_update_pinned(pages, npages, inc, user);
}
+struct pfn_reader_dmabuf {
+ struct dma_buf_phys_vec phys;
+ unsigned long start_offset;
+};
+
+static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf,
+ struct iopt_pages *pages)
+{
+ /* Callers must not get here if the dmabuf was already revoked */
+ if (WARN_ON(iopt_dmabuf_revoked(pages)))
+ return -EINVAL;
+
+ dmabuf->phys = pages->dmabuf.phys;
+ dmabuf->start_offset = pages->dmabuf.start;
+ return 0;
+}
+
+static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf,
+ struct pfn_batch *batch,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE;
+
+ /*
+ * start_index/last_index and start are all PAGE_SIZE aligned; the batch is
+ * always filled using page-size-aligned PFNs, just like the other types.
+ * If the dmabuf has been sliced at a sub-page offset then the common
+ * batch-to-domain code will adjust it before mapping to the domain.
+ */
+ batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start),
+ last_index - start_index + 1, BATCH_MMIO);
+ return 0;
+}
+
/*
* PFNs are stored in three places, in order of preference:
* - The iopt_pages xarray. This is only populated if there is a
@@ -1072,7 +1129,10 @@ struct pfn_reader {
unsigned long batch_end_index;
unsigned long last_index;
- struct pfn_reader_user user;
+ union {
+ struct pfn_reader_user user;
+ struct pfn_reader_dmabuf dmabuf;
+ };
};
static int pfn_reader_update_pinned(struct pfn_reader *pfns)
@@ -1108,7 +1168,7 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
{
struct interval_tree_double_span_iter *span = &pfns->span;
unsigned long start_index = pfns->batch_end_index;
- struct pfn_reader_user *user = &pfns->user;
+ struct pfn_reader_user *user;
unsigned long npages;
struct iopt_area *area;
int rc;
@@ -1140,8 +1200,13 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
return 0;
}
- if (start_index >= pfns->user.upages_end) {
- rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index,
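+ /* dmabuf-backed iopt_pages use one contiguous phys vec and need no pinning. */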
+ if (iopt_is_dmabuf(pfns->pages))
+ return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch,
+ start_index, span->last_hole);
+
+ user = &pfns->user;
+ if (start_index >= user->upages_end) {
+ rc = pfn_reader_user_pin(user, pfns->pages, start_index,
span->last_hole);
if (rc)
return rc;
@@ -1209,7 +1274,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
pfns->batch_start_index = start_index;
pfns->batch_end_index = start_index;
pfns->last_index = last_index;
- pfn_reader_user_init(&pfns->user, pages);
+ if (iopt_is_dmabuf(pages))
+ pfn_reader_dmabuf_init(&pfns->dmabuf, pages);
+ else
+ pfn_reader_user_init(&pfns->user, pages);
rc = batch_init(&pfns->batch, last_index - start_index + 1);
if (rc)
return rc;
@@ -1230,8 +1298,12 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
static void pfn_reader_release_pins(struct pfn_reader *pfns)
{
struct iopt_pages *pages = pfns->pages;
- struct pfn_reader_user *user = &pfns->user;
+ struct pfn_reader_user *user;
+
+ if (iopt_is_dmabuf(pages))
+ return;
+ user = &pfns->user;
if (user->upages_end > pfns->batch_end_index) {
/* Any pages not transferred to the batch are just unpinned */
@@ -1261,7 +1333,8 @@ static void pfn_reader_destroy(struct pfn_reader *pfns)
struct iopt_pages *pages = pfns->pages;
pfn_reader_release_pins(pfns);
- pfn_reader_user_destroy(&pfns->user, pfns->pages);
+ if (!iopt_is_dmabuf(pfns->pages))
+ pfn_reader_user_destroy(&pfns->user, pfns->pages);
batch_destroy(&pfns->batch, NULL);
WARN_ON(pages->last_npinned != pages->npinned);
}
@@ -1340,26 +1413,234 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
return pages;
}
-struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+struct iopt_pages *iopt_alloc_file_pages(struct file *file,
+ unsigned long start_byte,
+ unsigned long start,
unsigned long length, bool writable)
{
struct iopt_pages *pages;
- unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE);
- unsigned long end;
- if (length && check_add_overflow(start, length - 1, &end))
- return ERR_PTR(-EOVERFLOW);
-
- pages = iopt_alloc_pages(start - start_down, length, writable);
+ pages = iopt_alloc_pages(start_byte, length, writable);
if (IS_ERR(pages))
return pages;
pages->file = get_file(file);
- pages->start = start_down;
+ pages->start = start - start_byte;
pages->type = IOPT_ADDRESS_FILE;
return pages;
}
+static void iopt_revoke_notify(struct dma_buf_attachment *attach)
+{
+ struct iopt_pages *pages = attach->importer_priv;
+ struct iopt_pages_dmabuf_track *track;
+
+ guard(mutex)(&pages->mutex);
+ if (iopt_dmabuf_revoked(pages))
+ return;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm) {
+ struct iopt_area *area = track->area;
+
+ iopt_area_unmap_domain_range(area, track->domain,
+ iopt_area_index(area),
+ iopt_area_last_index(area));
+ }
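+ /* With everything unmapped, a zero length phys vec marks the dmabuf as revoked. */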
+ pages->dmabuf.phys.len = 0;
+}
+
+static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = {
+ .allow_peer2peer = true,
+ .move_notify = iopt_revoke_notify,
+};
+
+/*
+ * iommufd and vfio have a circular dependency. Future work for a phys
+ * based private interconnect will remove this.
+ */
+static int
+sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ typeof(&vfio_pci_dma_buf_iommufd_map) fn;
+ int rc;
+
+ rc = iommufd_test_dma_buf_iommufd_map(attachment, phys);
+ if (rc != -EOPNOTSUPP)
+ return rc;
+
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF))
+ return -EOPNOTSUPP;
+
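+ /* symbol_get() avoids a hard module dependency on vfio-pci here. */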
+ fn = symbol_get(vfio_pci_dma_buf_iommufd_map);
+ if (!fn)
+ return -EOPNOTSUPP;
+ rc = fn(attachment, phys);
+ symbol_put(vfio_pci_dma_buf_iommufd_map);
+ return rc;
+}
+
+static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages,
+ struct dma_buf *dmabuf)
+{
+ struct dma_buf_attachment *attach;
+ int rc;
+
+ attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(),
+ &iopt_dmabuf_attach_revoke_ops, pages);
+ if (IS_ERR(attach))
+ return PTR_ERR(attach);
+
+ dma_resv_lock(dmabuf->resv, NULL);
+ /*
+ * Lock ordering requires the mutex to be taken inside the reservation
+ * lock; make sure lockdep sees this.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ mutex_lock(&pages->mutex);
+ mutex_unlock(&pages->mutex);
+ }
+
+ rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys);
+ if (rc)
+ goto err_detach;
+
+ dma_resv_unlock(dmabuf->resv);
+
+ /* On success iopt_release_pages() will detach and put the dmabuf. */
+ pages->dmabuf.attach = attach;
+ return 0;
+
+err_detach:
+ dma_resv_unlock(dmabuf->resv);
+ dma_buf_detach(dmabuf, attach);
+ return rc;
+}
+
+struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx,
+ struct dma_buf *dmabuf,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable)
+{
+ static struct lock_class_key pages_dmabuf_mutex_key;
+ struct iopt_pages *pages;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (dmabuf->size <= (start + length - 1) ||
+ length / PAGE_SIZE >= MAX_NPFNS)
+ return ERR_PTR(-EINVAL);
+
+ pages = iopt_alloc_pages(start_byte, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+
+ /*
+ * The mmap_lock can be held when obtaining the dmabuf reservation lock,
+ * which creates a locking cycle with the pages mutex, which in turn is
+ * held while obtaining the mmap_lock. This locking path is not present
+ * for IOPT_ADDRESS_DMABUF, so split the lock class.
+ */
+ lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key);
+
+ /* dmabuf does not use pinned page accounting. */
+ pages->account_mode = IOPT_PAGES_ACCOUNT_NONE;
+ pages->type = IOPT_ADDRESS_DMABUF;
+ pages->dmabuf.start = start - start_byte;
+ INIT_LIST_HEAD(&pages->dmabuf.tracker);
+
+ rc = iopt_map_dmabuf(ictx, pages, dmabuf);
+ if (rc) {
+ iopt_put_pages(pages);
+ return ERR_PTR(rc);
+ }
+
+ return pages;
+}
+
+int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area,
+ struct iommu_domain *domain)
+{
+ struct iopt_pages_dmabuf_track *track;
+
+ lockdep_assert_held(&pages->mutex);
+ if (WARN_ON(!iopt_is_dmabuf(pages)))
+ return -EINVAL;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm)
+ if (WARN_ON(track->domain == domain && track->area == area))
+ return -EINVAL;
+
+ track = kzalloc(sizeof(*track), GFP_KERNEL);
+ if (!track)
+ return -ENOMEM;
+ track->domain = domain;
+ track->area = area;
+ list_add_tail(&track->elm, &pages->dmabuf.tracker);
+
+ return 0;
+}
+
+void iopt_dmabuf_untrack_domain(struct iopt_pages *pages,
+ struct iopt_area *area,
+ struct iommu_domain *domain)
+{
+ struct iopt_pages_dmabuf_track *track;
+
+ lockdep_assert_held(&pages->mutex);
+ WARN_ON(!iopt_is_dmabuf(pages));
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm) {
+ if (track->domain == domain && track->area == area) {
+ list_del(&track->elm);
+ kfree(track);
+ return;
+ }
+ }
+ WARN_ON(true);
+}
+
+int iopt_dmabuf_track_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages)
+{
+ struct iopt_pages_dmabuf_track *track;
+ struct iommu_domain *domain;
+ unsigned long index;
+ int rc;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm)
+ if (WARN_ON(track->area == area))
+ return -EINVAL;
+
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = iopt_dmabuf_track_domain(pages, area, domain);
+ if (rc)
+ goto err_untrack;
+ }
+ return 0;
+err_untrack:
+ iopt_dmabuf_untrack_all_domains(area, pages);
+ return rc;
+}
+
+void iopt_dmabuf_untrack_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages)
+{
+ struct iopt_pages_dmabuf_track *track;
+ struct iopt_pages_dmabuf_track *tmp;
+
+ list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker,
+ elm) {
+ if (track->area == area) {
+ list_del(&track->elm);
+ kfree(track);
+ }
+ }
+}
+
void iopt_release_pages(struct kref *kref)
{
struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
@@ -1372,8 +1653,15 @@ void iopt_release_pages(struct kref *kref)
mutex_destroy(&pages->mutex);
put_task_struct(pages->source_task);
free_uid(pages->source_user);
- if (pages->type == IOPT_ADDRESS_FILE)
+ if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) {
+ struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf;
+
+ dma_buf_detach(dmabuf, pages->dmabuf.attach);
+ dma_buf_put(dmabuf);
+ WARN_ON(!list_empty(&pages->dmabuf.tracker));
+ } else if (pages->type == IOPT_ADDRESS_FILE) {
fput(pages->file);
+ }
kfree(pages);
}
@@ -1451,6 +1739,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area,
lockdep_assert_held(&pages->mutex);
+ if (iopt_is_dmabuf(pages)) {
+ if (WARN_ON(iopt_dmabuf_revoked(pages)))
+ return;
+ iopt_area_unmap_domain_range(area, domain, start_index,
+ last_index);
+ return;
+ }
+
/*
* For security we must not unpin something that is still DMA mapped,
* so this must unmap any IOVA before we go ahead and unpin the pages.
@@ -1526,6 +1822,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain)
void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
struct iommu_domain *domain)
{
+ if (iopt_dmabuf_revoked(pages))
+ return;
+
__iopt_area_unfill_domain(area, pages, domain,
iopt_area_last_index(area));
}
@@ -1546,6 +1845,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
lockdep_assert_held(&area->pages->mutex);
+ if (iopt_dmabuf_revoked(area->pages))
+ return 0;
+
rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
iopt_area_last_index(area));
if (rc)
@@ -1605,33 +1907,44 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
return 0;
mutex_lock(&pages->mutex);
- rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
- iopt_area_last_index(area));
- if (rc)
- goto out_unlock;
+ if (iopt_is_dmabuf(pages)) {
+ rc = iopt_dmabuf_track_all_domains(area, pages);
+ if (rc)
+ goto out_unlock;
+ }
- while (!pfn_reader_done(&pfns)) {
- done_first_end_index = pfns.batch_end_index;
- done_all_end_index = pfns.batch_start_index;
- xa_for_each(&area->iopt->domains, index, domain) {
- rc = batch_to_domain(&pfns.batch, domain, area,
- pfns.batch_start_index);
+ if (!iopt_dmabuf_revoked(pages)) {
+ rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
+ iopt_area_last_index(area));
+ if (rc)
+ goto out_untrack;
+
+ while (!pfn_reader_done(&pfns)) {
+ done_first_end_index = pfns.batch_end_index;
+ done_all_end_index = pfns.batch_start_index;
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = batch_to_domain(&pfns.batch, domain, area,
+ pfns.batch_start_index);
+ if (rc)
+ goto out_unmap;
+ }
+ done_all_end_index = done_first_end_index;
+
+ rc = pfn_reader_next(&pfns);
if (rc)
goto out_unmap;
}
- done_all_end_index = done_first_end_index;
-
- rc = pfn_reader_next(&pfns);
+ rc = pfn_reader_update_pinned(&pfns);
if (rc)
goto out_unmap;
+
+ pfn_reader_destroy(&pfns);
}
- rc = pfn_reader_update_pinned(&pfns);
- if (rc)
- goto out_unmap;
area->storage_domain = xa_load(&area->iopt->domains, 0);
interval_tree_insert(&area->pages_node, &pages->domains_itree);
- goto out_destroy;
+ mutex_unlock(&pages->mutex);
+ return 0;
out_unmap:
pfn_reader_release_pins(&pfns);
@@ -1658,8 +1971,10 @@ out_unmap:
end_index);
}
}
-out_destroy:
pfn_reader_destroy(&pfns);
+out_untrack:
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_all_domains(area, pages);
out_unlock:
mutex_unlock(&pages->mutex);
return rc;
@@ -1685,16 +2000,22 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
if (!area->storage_domain)
goto out_unlock;
- xa_for_each(&iopt->domains, index, domain)
- if (domain != area->storage_domain)
+ xa_for_each(&iopt->domains, index, domain) {
+ if (domain == area->storage_domain)
+ continue;
+
+ if (!iopt_dmabuf_revoked(pages))
iopt_area_unmap_domain_range(
area, domain, iopt_area_index(area),
iopt_area_last_index(area));
+ }
if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
interval_tree_remove(&area->pages_node, &pages->domains_itree);
iopt_area_unfill_domain(area, pages, area->storage_domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_all_domains(area, pages);
area->storage_domain = NULL;
out_unlock:
mutex_unlock(&pages->mutex);
@@ -2031,15 +2352,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
return -EPERM;
- if (pages->type == IOPT_ADDRESS_FILE)
+ if (iopt_is_dmabuf(pages))
+ return -EINVAL;
+
+ if (pages->type != IOPT_ADDRESS_USER)
return iopt_pages_rw_slow(pages, start_index, last_index,
start_byte % PAGE_SIZE, data, length,
flags);
- if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
- WARN_ON(pages->type != IOPT_ADDRESS_USER))
- return -EINVAL;
-
if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
if (start_index == last_index)
return iopt_pages_rw_page(pages, start_index,
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index de178827a078..c4322fd26f93 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -5,6 +5,8 @@
*/
#include <linux/anon_inodes.h>
#include <linux/debugfs.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/fault-inject.h>
#include <linux/file.h>
#include <linux/iommu.h>
@@ -12,6 +14,8 @@
#include <linux/slab.h>
#include <linux/xarray.h>
#include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
+#include "../iommu-pages.h"
#include "../iommu-priv.h"
#include "io_pagetable.h"
@@ -41,21 +45,6 @@ static DEFINE_IDA(mock_dev_ida);
enum {
MOCK_DIRTY_TRACK = 1,
- MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
- MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE,
-
- /*
- * Like a real page table alignment requires the low bits of the address
- * to be zero. xarray also requires the high bit to be zero, so we store
- * the pfns shifted. The upper bits are used for metadata.
- */
- MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE,
-
- _MOCK_PFN_START = MOCK_PFN_MASK + 1,
- MOCK_PFN_START_IOVA = _MOCK_PFN_START,
- MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
- MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
- MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2,
};
static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain);
@@ -124,10 +113,15 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
}
struct mock_iommu_domain {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ struct pt_iommu_amdv1 amdv1;
+ };
unsigned long flags;
- struct iommu_domain domain;
- struct xarray pfns;
};
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain);
static inline struct mock_iommu_domain *
to_mock_domain(struct iommu_domain *domain)
@@ -216,7 +210,7 @@ static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj)
}
static int mock_domain_nop_attach(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct mock_dev *mdev = to_mock_dev(dev);
struct mock_viommu *new_viommu = NULL;
@@ -344,74 +338,6 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
return 0;
}
-static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock,
- unsigned long iova, size_t page_size,
- unsigned long flags)
-{
- unsigned long cur, end = iova + page_size - 1;
- bool dirty = false;
- void *ent, *old;
-
- for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) {
- ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
- if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA))
- continue;
-
- dirty = true;
- /* Clear dirty */
- if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
- unsigned long val;
-
- val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
- old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE,
- xa_mk_value(val), GFP_KERNEL);
- WARN_ON_ONCE(ent != old);
- }
- }
-
- return dirty;
-}
-
-static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- unsigned long end = iova + size;
- void *ent;
-
- if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
- return -EINVAL;
-
- do {
- unsigned long pgsize = MOCK_IO_PAGE_SIZE;
- unsigned long head;
-
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- if (!ent) {
- iova += pgsize;
- continue;
- }
-
- if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA)
- pgsize = MOCK_HUGE_PAGE_SIZE;
- head = iova & ~(pgsize - 1);
-
- /* Clear dirty */
- if (mock_test_and_clear_dirty(mock, head, pgsize, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-static const struct iommu_dirty_ops dirty_ops = {
- .set_dirty_tracking = mock_domain_set_dirty_tracking,
- .read_and_clear_dirty = mock_domain_read_and_clear_dirty,
-};
-
static struct mock_iommu_domain_nested *
__mock_domain_alloc_nested(const struct iommu_user_data *user_data)
{
@@ -446,7 +372,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
if (flags & ~IOMMU_HWPT_ALLOC_PASID)
return ERR_PTR(-EOPNOTSUPP);
- if (!parent || parent->ops != mock_ops.default_domain_ops)
+ if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING))
return ERR_PTR(-EINVAL);
mock_parent = to_mock_domain(parent);
@@ -459,159 +385,170 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
return &mock_nested->domain;
}
-static struct iommu_domain *
-mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
- const struct iommu_user_data *user_data)
-{
- bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
- IOMMU_HWPT_ALLOC_NEST_PARENT |
- IOMMU_HWPT_ALLOC_PASID;
- struct mock_dev *mdev = to_mock_dev(dev);
- bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
- struct mock_iommu_domain *mock;
-
- if (user_data)
- return ERR_PTR(-EOPNOTSUPP);
- if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
- return ERR_PTR(-EOPNOTSUPP);
-
- mock = kzalloc(sizeof(*mock), GFP_KERNEL);
- if (!mock)
- return ERR_PTR(-ENOMEM);
- mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
- mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
- mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
- if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA)
- mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE;
- mock->domain.ops = mock_ops.default_domain_ops;
- mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
- xa_init(&mock->pfns);
-
- if (has_dirty_flag)
- mock->domain.dirty_ops = &dirty_ops;
- return &mock->domain;
-}
-
static void mock_domain_free(struct iommu_domain *domain)
{
struct mock_iommu_domain *mock = to_mock_domain(domain);
- WARN_ON(!xa_empty(&mock->pfns));
+ pt_iommu_deinit(&mock->iommu);
kfree(mock);
}
-static int mock_domain_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount, int prot,
- gfp_t gfp, size_t *mapped)
+static void mock_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- unsigned long flags = MOCK_PFN_START_IOVA;
- unsigned long start_iova = iova;
+ iommu_put_pages_list(&gather->freelist);
+}
- /*
- * xarray does not reliably work with fault injection because it does a
- * retry allocation, so put our own failure point.
- */
- if (iommufd_should_fail())
- return -ENOENT;
+static const struct iommu_domain_ops amdv1_mock_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
- for (; pgcount; pgcount--) {
- size_t cur;
+static const struct iommu_domain_ops amdv1_mock_huge_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
+#undef pt_iommu_amdv1_mock_map_pages
- for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
- void *old;
+static const struct iommu_dirty_ops amdv1_mock_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1_mock),
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
- if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
- flags = MOCK_PFN_LAST_IOVA;
- if (pgsize != MOCK_IO_PAGE_SIZE) {
- flags |= MOCK_PFN_HUGE_IOVA;
- }
- old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE,
- xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) |
- flags),
- gfp);
- if (xa_is_err(old)) {
- for (; start_iova != iova;
- start_iova += MOCK_IO_PAGE_SIZE)
- xa_erase(&mock->pfns,
- start_iova /
- MOCK_IO_PAGE_SIZE);
- return xa_err(old);
- }
- WARN_ON(old);
- iova += MOCK_IO_PAGE_SIZE;
- paddr += MOCK_IO_PAGE_SIZE;
- *mapped += MOCK_IO_PAGE_SIZE;
- flags = 0;
- }
- }
- return 0;
-}
+static const struct iommu_domain_ops amdv1_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
-static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
- unsigned long iova, size_t pgsize,
- size_t pgcount,
- struct iommu_iotlb_gather *iotlb_gather)
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1),
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
+
+static struct mock_iommu_domain *
+mock_domain_alloc_pgtable(struct device *dev,
+ const struct iommu_hwpt_selftest *user_cfg, u32 flags)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- bool first = true;
- size_t ret = 0;
- void *ent;
+ struct mock_iommu_domain *mock;
+ int rc;
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+ mock = kzalloc(sizeof(*mock), GFP_KERNEL);
+ if (!mock)
+ return ERR_PTR(-ENOMEM);
+ mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
- for (; pgcount; pgcount--) {
- size_t cur;
+ mock->amdv1.iommu.nid = NUMA_NO_NODE;
+
+ switch (user_cfg->pagetable_type) {
+ case MOCK_IOMMUPT_DEFAULT:
+ case MOCK_IOMMUPT_HUGE: {
+ struct pt_iommu_amdv1_cfg cfg = {};
+
+ /* The mock version has a 2k page size */
+ cfg.common.hw_max_vasz_lg2 = 56;
+ cfg.common.hw_max_oasz_lg2 = 51;
+ cfg.starting_level = 2;
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+ mock->domain.ops = &amdv1_mock_huge_ops;
+ else
+ mock->domain.ops = &amdv1_mock_ops;
+ rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL);
+ if (rc)
+ goto err_free;
+
+ /*
+ * In huge mode userspace should only provide huge pages, but we
+ * have to include PAGE_SIZE for the domain to be accepted by
+ * iommufd.
+ */
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+ mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE |
+ PAGE_SIZE;
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ mock->domain.dirty_ops = &amdv1_mock_dirty_ops;
+ break;
+ }
- for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
- ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ case MOCK_IOMMUPT_AMDV1: {
+ struct pt_iommu_amdv1_cfg cfg = {};
+
+ cfg.common.hw_max_vasz_lg2 = 64;
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+ cfg.starting_level = 2;
+ mock->domain.ops = &amdv1_ops;
+ rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL);
+ if (rc)
+ goto err_free;
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ mock->domain.dirty_ops = &amdv1_dirty_ops;
+ break;
+ }
+ default:
+ rc = -EOPNOTSUPP;
+ goto err_free;
+ }
- /*
- * iommufd generates unmaps that must be a strict
- * superset of the map's performend So every
- * starting/ending IOVA should have been an iova passed
- * to map.
- *
- * This simple logic doesn't work when the HUGE_PAGE is
- * turned on since the core code will automatically
- * switch between the two page sizes creating a break in
- * the unmap calls. The break can land in the middle of
- * contiguous IOVA.
- */
- if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) {
- if (first) {
- WARN_ON(ent && !(xa_to_value(ent) &
- MOCK_PFN_START_IOVA));
- first = false;
- }
- if (pgcount == 1 &&
- cur + MOCK_IO_PAGE_SIZE == pgsize)
- WARN_ON(ent && !(xa_to_value(ent) &
- MOCK_PFN_LAST_IOVA));
- }
+ /*
+ * Override the real aperture to the MOCK aperture for test purposes.
+ */
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) {
+ WARN_ON(mock->domain.geometry.aperture_start != 0);
+ WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST);
- iova += MOCK_IO_PAGE_SIZE;
- ret += MOCK_IO_PAGE_SIZE;
- }
+ mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
+ mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
}
- return ret;
+
+ return mock;
+err_free:
+ kfree(mock);
+ return ERR_PTR(rc);
}
-static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
+static struct iommu_domain *
+mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
+ const struct iommu_user_data *user_data)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- void *ent;
+ bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_PASID;
+ struct mock_dev *mdev = to_mock_dev(dev);
+ bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+ struct iommu_hwpt_selftest user_cfg = {};
+ struct mock_iommu_domain *mock;
+ int rc;
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- WARN_ON(!ent);
- return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE;
+ if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST &&
+ user_data->type != IOMMU_HWPT_DATA_NONE))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (user_data) {
+ rc = iommu_copy_struct_from_user(
+ &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
+ }
+
+ mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags);
+ if (IS_ERR(mock))
+ return ERR_CAST(mock);
+ return &mock->domain;
}
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
@@ -955,15 +892,6 @@ static const struct iommu_ops mock_ops = {
.user_pasid_table = true,
.get_viommu_size = mock_get_viommu_size,
.viommu_init = mock_viommu_init,
- .default_domain_ops =
- &(struct iommu_domain_ops){
- .free = mock_domain_free,
- .attach_dev = mock_domain_nop_attach,
- .map_pages = mock_domain_map_pages,
- .unmap_pages = mock_domain_unmap_pages,
- .iova_to_phys = mock_domain_iova_to_phys,
- .set_dev_pasid = mock_domain_set_dev_pasid_nop,
- },
};
static void mock_domain_free_nested(struct iommu_domain *domain)
@@ -1047,7 +975,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
if (IS_ERR(hwpt))
return hwpt;
if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
- hwpt->domain->ops != mock_ops.default_domain_ops) {
+ hwpt->domain->owner != &mock_ops) {
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
@@ -1088,7 +1016,6 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{},
};
const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY |
- MOCK_FLAGS_DEVICE_HUGE_IOVA |
MOCK_FLAGS_DEVICE_PASID;
struct mock_dev *mdev;
int rc, i;
@@ -1277,23 +1204,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
{
struct iommufd_hw_pagetable *hwpt;
struct mock_iommu_domain *mock;
+ unsigned int page_size;
uintptr_t end;
int rc;
- if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE ||
- (uintptr_t)uptr % MOCK_IO_PAGE_SIZE ||
- check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
- return -EINVAL;
-
hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- for (; length; length -= MOCK_IO_PAGE_SIZE) {
+ page_size = 1 << __ffs(mock->domain.pgsize_bitmap);
+ if (iova % page_size || length % page_size ||
+ (uintptr_t)uptr % page_size ||
+ check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
+ return -EINVAL;
+
+ for (; length; length -= page_size) {
struct page *pages[1];
+ phys_addr_t io_phys;
unsigned long pfn;
long npages;
- void *ent;
npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0,
pages);
@@ -1308,15 +1237,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
pfn = page_to_pfn(pages[0]);
put_page(pages[0]);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- if (!ent ||
- (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE !=
- pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
+ io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova);
+ if (io_phys !=
+ pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
rc = -EINVAL;
goto out_put;
}
- iova += MOCK_IO_PAGE_SIZE;
- uptr += MOCK_IO_PAGE_SIZE;
+ iova += page_size;
+ uptr += page_size;
}
rc = 0;
@@ -1795,7 +1723,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+ if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) {
rc = -EINVAL;
goto out_put;
}
@@ -1814,22 +1742,10 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
}
for (i = 0; i < max; i++) {
- unsigned long cur = iova + i * page_size;
- void *ent, *old;
-
if (!test_bit(i, (unsigned long *)tmp))
continue;
-
- ent = xa_load(&mock->pfns, cur / page_size);
- if (ent) {
- unsigned long val;
-
- val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
- old = xa_store(&mock->pfns, cur / page_size,
- xa_mk_value(val), GFP_KERNEL);
- WARN_ON_ONCE(ent != old);
- count++;
- }
+ mock->iommu.ops->set_dirty(&mock->iommu, iova + i * page_size);
+ count++;
}
cmd->dirty.out_nr_dirty = count;
@@ -2031,6 +1947,140 @@ void iommufd_selftest_destroy(struct iommufd_object *obj)
}
}
+struct iommufd_test_dma_buf {
+ void *memory;
+ size_t length;
+ bool revoked;
+};
+
+static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ return 0;
+}
+
+static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+}
+
+static struct sg_table *
+iommufd_test_dma_buf_map(struct dma_buf_attachment *attachment,
+ enum dma_data_direction dir)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment,
+ struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+}
+
+static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf)
+{
+ struct iommufd_test_dma_buf *priv = dmabuf->priv;
+
+ kfree(priv->memory);
+ kfree(priv);
+}
+
+static const struct dma_buf_ops iommufd_test_dmabuf_ops = {
+ .attach = iommufd_test_dma_buf_attach,
+ .detach = iommufd_test_dma_buf_detach,
+ .map_dma_buf = iommufd_test_dma_buf_map,
+ .release = iommufd_test_dma_buf_release,
+ .unmap_dma_buf = iommufd_test_dma_buf_unmap,
+};
+
+int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv;
+
+ dma_resv_assert_held(attachment->dmabuf->resv);
+
+ if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops)
+ return -EOPNOTSUPP;
+
+ if (priv->revoked)
+ return -ENODEV;
+
+ phys->paddr = virt_to_phys(priv->memory);
+ phys->len = priv->length;
+ return 0;
+}
+
+static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd,
+ unsigned int open_flags,
+ size_t len)
+{
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct iommufd_test_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int rc;
+
+ len = ALIGN(len, PAGE_SIZE);
+ if (len == 0 || len > PAGE_SIZE * 512)
+ return -EINVAL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->length = len;
+ priv->memory = kzalloc(len, GFP_KERNEL);
+ if (!priv->memory) {
+ rc = -ENOMEM;
+ goto err_free;
+ }
+
+ exp_info.ops = &iommufd_test_dmabuf_ops;
+ exp_info.size = len;
+ exp_info.flags = open_flags;
+ exp_info.priv = priv;
+
+ dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(dmabuf)) {
+ rc = PTR_ERR(dmabuf);
+ goto err_free;
+ }
+
+ return dma_buf_fd(dmabuf, open_flags);
+
+err_free:
+ kfree(priv->memory);
+ kfree(priv);
+ return rc;
+}
+
+static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd,
+ bool revoked)
+{
+ struct iommufd_test_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int rc = 0;
+
+ dmabuf = dma_buf_get(fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (dmabuf->ops != &iommufd_test_dmabuf_ops) {
+ rc = -EOPNOTSUPP;
+ goto err_put;
+ }
+
+ priv = dmabuf->priv;
+ dma_resv_lock(dmabuf->resv, NULL);
+ priv->revoked = revoked;
+ dma_buf_move_notify(dmabuf);
+ dma_resv_unlock(dmabuf->resv);
+
+err_put:
+ dma_buf_put(dmabuf);
+ return rc;
+}
+
int iommufd_test(struct iommufd_ucmd *ucmd)
{
struct iommu_test_cmd *cmd = ucmd->cmd;
@@ -2109,6 +2159,13 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return iommufd_test_pasid_detach(ucmd, cmd);
case IOMMU_TEST_OP_PASID_CHECK_HWPT:
return iommufd_test_pasid_check_hwpt(ucmd, cmd);
+ case IOMMU_TEST_OP_DMABUF_GET:
+ return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags,
+ cmd->dmabuf_get.length);
+ case IOMMU_TEST_OP_DMABUF_REVOKE:
+ return iommufd_test_dmabuf_revoke(ucmd,
+ cmd->dmabuf_revoke.dmabuf_fd,
+ cmd->dmabuf_revoke.revoked);
default:
return -EOPNOTSUPP;
}
@@ -2202,3 +2259,5 @@ void iommufd_test_exit(void)
platform_device_unregister(selftest_iommu_dev);
debugfs_remove_recursive(dbgfs_root);
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index ffa892f65714..ca848288dbf2 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -590,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain)
}
static int ipmmu_attach_device(struct iommu_domain *io_domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
@@ -637,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain,
}
static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_domain *domain;
unsigned int i;
- if (io_domain == identity_domain || !io_domain)
+ if (old == identity_domain || !old)
return 0;
- domain = to_vmsa_domain(io_domain);
+ domain = to_vmsa_domain(old);
for (i = 0; i < fwspec->num_ids; ++i)
ipmmu_utlb_disable(domain, fwspec->ids[i]);
@@ -720,6 +720,8 @@ static int ipmmu_init_platform_device(struct device *dev,
dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev));
+ put_device(&ipmmu_pdev->dev);
+
return 0;
}
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 43a61ba021a5..819add75a665 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev)
return &iommu->iommu;
}
-static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
unsigned long flags;
@@ -441,19 +442,19 @@ fail:
}
static int msm_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct msm_priv *priv;
unsigned long flags;
struct msm_iommu_dev *iommu;
struct msm_iommu_ctx_dev *master;
int ret = 0;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- priv = to_msm_priv(domain);
+ priv = to_msm_priv(old);
free_io_pgtable_ops(priv->iop);
spin_lock_irqsave(&msm_iommu_lock, flags);
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 0e0285348d2b..60fcd3d3b5eb 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -139,6 +139,7 @@
/* 2 bits: iommu type */
#define MTK_IOMMU_TYPE_MM (0x0 << 13)
#define MTK_IOMMU_TYPE_INFRA (0x1 << 13)
+#define MTK_IOMMU_TYPE_APU (0x2 << 13)
#define MTK_IOMMU_TYPE_MASK (0x3 << 13)
/* PM and clock always on. e.g. infra iommu */
#define PM_CLK_AO BIT(15)
@@ -147,6 +148,7 @@
#define TF_PORT_TO_ADDR_MT8173 BIT(18)
#define INT_ID_PORT_WIDTH_6 BIT(19)
#define CFG_IFA_MASTER_IN_ATF BIT(20)
+#define DL_WITH_MULTI_LARB BIT(21)
#define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \
((((pdata)->flags) & (mask)) == (_x))
@@ -172,6 +174,7 @@ enum mtk_iommu_plat {
M4U_MT8183,
M4U_MT8186,
M4U_MT8188,
+ M4U_MT8189,
M4U_MT8192,
M4U_MT8195,
M4U_MT8365,
@@ -335,6 +338,8 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data, unsigned int ban
*/
#define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL
+static LIST_HEAD(apulist); /* List the apu iommu HWs */
+static LIST_HEAD(infralist); /* List the iommu_infra HW */
static LIST_HEAD(m4ulist); /* List all the M4U HWs */
#define for_each_m4u(data, head) list_for_each_entry(data, head, list)
@@ -350,6 +355,15 @@ static const struct mtk_iommu_iova_region single_domain[] = {
#define MT8192_MULTI_REGION_NR (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) ? \
MT8192_MULTI_REGION_NR_MAX : 1)
+static const struct mtk_iommu_iova_region mt8189_multi_dom_apu[] = {
+ { .iova_base = 0x200000ULL, .size = SZ_512M}, /* APU SECURE */
+#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)
+ { .iova_base = SZ_1G, .size = 0xc0000000}, /* APU CODE */
+ { .iova_base = 0x70000000ULL, .size = 0x12600000}, /* APU VLM */
+ { .iova_base = SZ_4G, .size = SZ_4G * 3}, /* APU VPU */
+#endif
+};
+
static const struct mtk_iommu_iova_region mt8192_multi_dom[MT8192_MULTI_REGION_NR] = {
{ .iova_base = 0x0, .size = MTK_IOMMU_IOVA_SZ_4G}, /* 0 ~ 4G, */
#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)
@@ -705,7 +719,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
}
static int mtk_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata;
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
@@ -773,12 +787,12 @@ err_unlock:
}
static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
mtk_iommu_config(data, dev, false, 0);
@@ -865,6 +879,7 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
struct device_link *link;
struct device *larbdev;
+ unsigned long larbid_msk = 0;
unsigned int larbid, larbidx, i;
if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
@@ -872,30 +887,50 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
/*
* Link the consumer device with the smi-larb device(supplier).
- * The device that connects with each a larb is a independent HW.
- * All the ports in each a device should be in the same larbs.
+ * With DL_WITH_MULTI_LARB: the master may connect with multiple larbs,
+ * so we create a device link with each larb.
+ * Without DL_WITH_MULTI_LARB: the master must connect with exactly one
+ * larb, otherwise fail.
*/
larbid = MTK_M4U_TO_LARB(fwspec->ids[0]);
if (larbid >= MTK_LARB_NR_MAX)
return ERR_PTR(-EINVAL);
+ larbid_msk |= BIT(larbid);
+
for (i = 1; i < fwspec->num_ids; i++) {
larbidx = MTK_M4U_TO_LARB(fwspec->ids[i]);
- if (larbid != larbidx) {
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, DL_WITH_MULTI_LARB)) {
+ larbid_msk |= BIT(larbidx);
+ } else if (larbid != larbidx) {
dev_err(dev, "Can only use one larb. Fail@larb%d-%d.\n",
larbid, larbidx);
return ERR_PTR(-EINVAL);
}
}
- larbdev = data->larb_imu[larbid].dev;
- if (!larbdev)
- return ERR_PTR(-EINVAL);
- link = device_link_add(dev, larbdev,
- DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS);
- if (!link)
- dev_err(dev, "Unable to link %s\n", dev_name(larbdev));
+ for_each_set_bit(larbid, &larbid_msk, 32) {
+ larbdev = data->larb_imu[larbid].dev;
+ if (!larbdev)
+ return ERR_PTR(-EINVAL);
+
+ link = device_link_add(dev, larbdev,
+ DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS);
+ if (!link) {
+ dev_err(dev, "Unable to link %s\n", dev_name(larbdev));
+ goto link_remove;
+ }
+ }
+
return &data->iommu;
+
+link_remove:
+ for_each_set_bit(i, &larbid_msk, larbid) {
+ larbdev = data->larb_imu[i].dev;
+ device_link_remove(dev, larbdev);
+ }
+
+ return ERR_PTR(-ENODEV);
}
static void mtk_iommu_release_device(struct device *dev)
@@ -903,11 +938,19 @@ static void mtk_iommu_release_device(struct device *dev)
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct mtk_iommu_data *data;
struct device *larbdev;
- unsigned int larbid;
+ unsigned int larbid, i;
+ unsigned long larbid_msk = 0;
data = dev_iommu_priv_get(dev);
- if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
- larbid = MTK_M4U_TO_LARB(fwspec->ids[0]);
+ if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+ return;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ larbid = MTK_M4U_TO_LARB(fwspec->ids[i]);
+ larbid_msk |= BIT(larbid);
+ }
+
+ for_each_set_bit(larbid, &larbid_msk, 32) {
larbdev = data->larb_imu[larbid].dev;
device_link_remove(dev, larbdev);
}
@@ -974,6 +1017,8 @@ static int mtk_iommu_of_xlate(struct device *dev,
return -EINVAL;
dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+
+ put_device(&m4updev->dev);
}
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -1211,16 +1256,19 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m
}
component_match_add(dev, match, component_compare_dev, &plarbdev->dev);
- platform_device_put(plarbdev);
}
- if (!frst_avail_smicomm_node)
- return -EINVAL;
+ if (!frst_avail_smicomm_node) {
+ ret = -EINVAL;
+ goto err_larbdev_put;
+ }
pcommdev = of_find_device_by_node(frst_avail_smicomm_node);
of_node_put(frst_avail_smicomm_node);
- if (!pcommdev)
- return -ENODEV;
+ if (!pcommdev) {
+ ret = -ENODEV;
+ goto err_larbdev_put;
+ }
data->smicomm_dev = &pcommdev->dev;
link = device_link_add(data->smicomm_dev, dev,
@@ -1228,16 +1276,16 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m
platform_device_put(pcommdev);
if (!link) {
dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev));
- return -EINVAL;
+ ret = -EINVAL;
+ goto err_larbdev_put;
}
return 0;
err_larbdev_put:
- for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) {
- if (!data->larb_imu[i].dev)
- continue;
+ /* The ID mapping may not be linear, so loop over the whole array */
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
put_device(data->larb_imu[i].dev);
- }
+
return ret;
}
@@ -1400,8 +1448,12 @@ out_sysfs_remove:
iommu_device_sysfs_remove(&data->iommu);
out_list_del:
list_del(&data->list);
- if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+ if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
device_link_remove(data->smicomm_dev, dev);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
+ }
out_runtime_disable:
pm_runtime_disable(dev);
return ret;
@@ -1421,6 +1473,9 @@ static void mtk_iommu_remove(struct platform_device *pdev)
if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
device_link_remove(data->smicomm_dev, &pdev->dev);
component_master_del(&pdev->dev, &mtk_iommu_com_ops);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
}
pm_runtime_disable(&pdev->dev);
for (i = 0; i < data->plat_data->banks_num; i++) {
@@ -1695,6 +1750,66 @@ static const struct mtk_iommu_plat_data mt8188_data_vpp = {
27, 28 /* ccu0 */, MTK_INVALID_LARBID}, {4, 6}},
};
+static const unsigned int mt8189_apu_region_msk[][MTK_LARB_NR_MAX] = {
+ [0] = {[0] = BIT(2)}, /* Region0: fake larb 0 APU_SECURE */
+ [1] = {[0] = BIT(1)}, /* Region1: fake larb 0 APU_CODE */
+ [2] = {[0] = BIT(3)}, /* Region2: fake larb 0 APU_VLM */
+ [3] = {[0] = BIT(0)}, /* Region3: fake larb 0 APU_DATA */
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_apu = {
+ .m4u_plat = M4U_MT8189,
+ .flags = IOVA_34_EN | DCM_DISABLE |
+ MTK_IOMMU_TYPE_APU | PGTABLE_PA_35_EN,
+ .hw_list = &apulist,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .iova_region = mt8189_multi_dom_apu,
+ .iova_region_nr = ARRAY_SIZE(mt8189_multi_dom_apu),
+ .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}},
+ .iova_region_larb_msk = mt8189_apu_region_msk,
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_infra = {
+ .m4u_plat = M4U_MT8189,
+ .flags = WR_THROT_EN | DCM_DISABLE | MTK_IOMMU_TYPE_INFRA |
+ CFG_IFA_MASTER_IN_ATF | SHARE_PGTABLE | PGTABLE_PA_35_EN,
+ .hw_list = &infralist,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .iova_region = single_domain,
+ .iova_region_nr = ARRAY_SIZE(single_domain),
+};
+
+static const u32 mt8189_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = {
+ [0] = {~0, ~0, ~0, [22] = BIT(0)}, /* Region0: all ports for larb0/1/2 */
+ [1] = {[3] = ~0, [4] = ~0}, /* Region1: all ports for larb4(3)/7(4) */
+ [2] = {[5] = ~0, [6] = ~0, /* Region2: all ports for larb9(5)/11(6) */
+ [7] = ~0, [8] = ~0, /* Region2: all ports for larb13(7)/14(8) */
+ [9] = ~0, [10] = ~0, /* Region2: all ports for larb16(9)/17(10) */
+ [11] = ~0, [12] = ~0, /* Region2: all ports for larb19(11)/20(12) */
+ [21] = ~0}, /* Region2: larb21 fake GCE larb */
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_mm = {
+ .m4u_plat = M4U_MT8189,
+ .flags = HAS_BCLK | HAS_SUB_COMM_3BITS | OUT_ORDER_WR_EN |
+ WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM |
+ PGTABLE_PA_35_EN | DL_WITH_MULTI_LARB,
+ .hw_list = &m4ulist,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 5,
+ .banks_enable = {true, false, false, false, false},
+ .iova_region = mt8192_multi_dom,
+ .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom),
+ .iova_region_larb_msk = mt8189_larb_region_msk,
+ .larbid_remap = {{0}, {1}, {21/* GCE_D */, 21/* GCE_M */, 2},
+ {19, 20, 9, 11}, {7}, {4},
+ {13, 17}, {14, 16}},
+};
+
static const struct mtk_iommu_plat_data mt8192_data = {
.m4u_plat = M4U_MT8192,
.flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN |
@@ -1796,6 +1911,9 @@ static const struct of_device_id mtk_iommu_of_ids[] = {
{ .compatible = "mediatek,mt8188-iommu-infra", .data = &mt8188_data_infra},
{ .compatible = "mediatek,mt8188-iommu-vdo", .data = &mt8188_data_vdo},
{ .compatible = "mediatek,mt8188-iommu-vpp", .data = &mt8188_data_vpp},
+ { .compatible = "mediatek,mt8189-iommu-apu", .data = &mt8189_data_apu},
+ { .compatible = "mediatek,mt8189-iommu-infra", .data = &mt8189_data_infra},
+ { .compatible = "mediatek,mt8189-iommu-mm", .data = &mt8189_data_mm},
{ .compatible = "mediatek,mt8192-m4u", .data = &mt8192_data},
{ .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra},
{ .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo},
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index 10cc0b1197e8..c8d8eff5373d 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -303,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain)
kfree(to_mtk_domain(domain));
}
-static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev)
+static int mtk_iommu_v1_attach_device(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain);
@@ -329,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device
}
static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
@@ -435,6 +438,8 @@ static int mtk_iommu_v1_create_mapping(struct device *dev,
return -EINVAL;
dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+
+ put_device(&m4updev->dev);
}
ret = iommu_fwspec_add_ids(dev, args->args, 1);
@@ -641,13 +646,18 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
if (larb_nr < 0)
return larb_nr;
+ if (larb_nr > MTK_LARB_NR_MAX)
+ return -EINVAL;
+
for (i = 0; i < larb_nr; i++) {
struct device_node *larbnode;
struct platform_device *plarbdev;
larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i);
- if (!larbnode)
- return -EINVAL;
+ if (!larbnode) {
+ ret = -EINVAL;
+ goto out_put_larbs;
+ }
if (!of_device_is_available(larbnode)) {
of_node_put(larbnode);
@@ -657,11 +667,14 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
plarbdev = of_find_device_by_node(larbnode);
if (!plarbdev) {
of_node_put(larbnode);
- return -ENODEV;
+ ret = -ENODEV;
+ goto out_put_larbs;
}
if (!plarbdev->dev.driver) {
of_node_put(larbnode);
- return -EPROBE_DEFER;
+ put_device(&plarbdev->dev);
+ ret = -EPROBE_DEFER;
+ goto out_put_larbs;
}
data->larb_imu[i].dev = &plarbdev->dev;
@@ -673,7 +686,7 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
ret = mtk_iommu_v1_hw_init(data);
if (ret)
- return ret;
+ goto out_put_larbs;
ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL,
dev_name(&pdev->dev));
@@ -695,12 +708,17 @@ out_sysfs_remove:
iommu_device_sysfs_remove(&data->iommu);
out_clk_unprepare:
clk_disable_unprepare(data->bclk);
+out_put_larbs:
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
+
return ret;
}
static void mtk_iommu_v1_remove(struct platform_device *pdev)
{
struct mtk_iommu_v1_data *data = platform_get_drvdata(pdev);
+ int i;
iommu_device_sysfs_remove(&data->iommu);
iommu_device_unregister(&data->iommu);
@@ -708,6 +726,9 @@ static void mtk_iommu_v1_remove(struct platform_device *pdev)
clk_disable_unprepare(data->bclk);
devm_free_irq(&pdev->dev, data->irq, data);
component_master_del(&pdev->dev, &mtk_iommu_v1_com_ops);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
}
static int __maybe_unused mtk_iommu_v1_suspend(struct device *dev)
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 5c6f5943f44b..768973b7e511 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1431,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain)
odomain->iommus = NULL;
}
-static int
-omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int omap_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev, struct iommu_domain *old)
{
struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
@@ -1536,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
}
static int omap_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct omap_iommu_domain *omap_domain;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- omap_domain = to_omap_domain(domain);
+ omap_domain = to_omap_domain(old);
spin_lock(&omap_domain->lock);
_omap_iommu_detach_dev(omap_domain, dev);
spin_unlock(&omap_domain->lock);
@@ -1668,23 +1668,20 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev)
}
pdev = of_find_device_by_node(np);
+ of_node_put(np);
if (!pdev) {
- of_node_put(np);
kfree(arch_data);
return ERR_PTR(-ENODEV);
}
oiommu = platform_get_drvdata(pdev);
+ put_device(&pdev->dev);
if (!oiommu) {
- of_node_put(np);
kfree(arch_data);
return ERR_PTR(-EINVAL);
}
tmp->iommu_dev = oiommu;
- tmp->dev = &pdev->dev;
-
- of_node_put(np);
}
dev_iommu_priv_set(dev, arch_data);
diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h
index 27697109ec79..50b39be61abc 100644
--- a/drivers/iommu/omap-iommu.h
+++ b/drivers/iommu/omap-iommu.h
@@ -88,7 +88,6 @@ struct omap_iommu {
/**
* struct omap_iommu_arch_data - omap iommu private data
* @iommu_dev: handle of the OMAP iommu device
- * @dev: handle of the iommu device
*
* This is an omap iommu private data object, which binds an iommu user
* to its iommu device. This object should be placed at the iommu user's
@@ -97,7 +96,6 @@ struct omap_iommu {
*/
struct omap_iommu_arch_data {
struct omap_iommu *iommu_dev;
- struct device *dev;
};
struct cr_regs {
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index ebb22979075d..d9429097a2b5 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -1321,7 +1321,8 @@ static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_m
}
static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
@@ -1426,7 +1427,8 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
}
static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
@@ -1447,7 +1449,8 @@ static struct iommu_domain riscv_iommu_blocking_domain = {
};
static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 0861dd469bd8..85f3667e797c 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -960,7 +960,8 @@ out_disable_clocks:
}
static int rk_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct rk_iommu *iommu;
struct rk_iommu_domain *rk_domain;
@@ -1005,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = {
};
static int rk_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct rk_iommu *iommu;
struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
@@ -1026,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
if (iommu->domain == domain)
return 0;
- ret = rk_iommu_identity_attach(&rk_identity_domain, dev);
+ ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old);
if (ret)
return ret;
@@ -1041,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
return 0;
ret = rk_iommu_enable(iommu);
- if (ret)
- WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev));
+ if (ret) {
+ /*
+ * Note rk_iommu_enable() might fail before physically attaching
+ * the dev to iommu->domain, in which case the actual old domain
+ * for this revert should be rk_identity_domain rather than
+ * iommu->domain. Since rk_iommu_identity_attach() does not care
+ * about the old domain argument for now, this is not a problem.
+ */
+ WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev,
+ iommu->domain));
+ }
pm_runtime_put(iommu->dev);
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index aa576736d60b..fe679850af28 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -670,7 +670,8 @@ int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status)
}
static int blocking_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
struct s390_domain *s390_domain;
@@ -694,7 +695,8 @@ static int blocking_domain_attach_device(struct iommu_domain *domain,
}
static int s390_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
@@ -709,7 +711,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
domain->geometry.aperture_end < zdev->start_dma))
return -EINVAL;
- blocking_domain_attach_device(&blocking_domain, dev);
+ blocking_domain_attach_device(&blocking_domain, dev, old);
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
@@ -1131,13 +1133,14 @@ static int __init s390_iommu_init(void)
subsys_initcall(s390_iommu_init);
static int s390_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
u8 status;
int cc;
- blocking_domain_attach_device(&blocking_domain, dev);
+ blocking_domain_attach_device(&blocking_domain, dev, old);
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index c7ca1d8a0b15..555d4505c747 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -247,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain)
}
static int sprd_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev);
struct sprd_iommu_domain *dom = to_sprd_domain(domain);
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index de10b569d9a9..90b26fe21817 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -771,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu,
}
static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sun50i_iommu *iommu = dev_iommu_priv_get(dev);
struct sun50i_iommu_domain *sun50i_domain;
@@ -797,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = {
};
static int sun50i_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
struct sun50i_iommu *iommu;
@@ -813,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain,
if (iommu->domain == domain)
return 0;
- sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev);
+ sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old);
sun50i_iommu_attach_domain(iommu, sun50i_domain);
@@ -839,6 +841,8 @@ static int sun50i_iommu_of_xlate(struct device *dev,
dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev));
+ put_device(&iommu_pdev->dev);
+
return iommu_fwspec_add_ids(dev, &id, 1);
}
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 36cdd5fbab07..c391e7f2cde6 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -490,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
}
static int tegra_smmu_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu *smmu = dev_iommu_priv_get(dev);
@@ -524,9 +524,9 @@ disable:
}
static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu_as *as;
struct tegra_smmu *smmu;
@@ -535,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
if (!fwspec)
return -ENODEV;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- as = to_smmu_as(domain);
+ as = to_smmu_as(old);
smmu = as->smmu;
for (index = 0; index < fwspec->num_ids; index++) {
tegra_smmu_disable(smmu, fwspec->ids[index], as->id);
@@ -830,10 +830,9 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np)
return NULL;
mc = platform_get_drvdata(pdev);
- if (!mc) {
- put_device(&pdev->dev);
+ put_device(&pdev->dev);
+ if (!mc)
return NULL;
- }
return mc->smmu;
}
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index b39d6f134ab2..d314fa5cd847 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -730,7 +730,8 @@ static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev)
return domain;
}
-static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
struct virtio_iommu_req_attach req;
@@ -781,7 +782,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
}
static int viommu_attach_identity_domain(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
struct virtio_iommu_req_attach req;