Diffstat (limited to 'drivers/iommu')
80 files changed, 7990 insertions, 2922 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 70d29b14d851..99095645134f 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -40,12 +40,13 @@ config IOMMU_IO_PGTABLE_LPAE sizes at both stage-1 and stage-2, as well as address spaces up to 48-bits in size. -config IOMMU_IO_PGTABLE_LPAE_SELFTEST - bool "LPAE selftests" - depends on IOMMU_IO_PGTABLE_LPAE +config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST + tristate "KUnit tests for LPAE" + depends on IOMMU_IO_PGTABLE_LPAE && KUNIT + default KUNIT_ALL_TESTS help - Enable self-tests for LPAE page table allocator. This performs - a series of page-table consistency checks during boot. + Enable kunit tests for LPAE page table allocator. This performs + a series of page-table consistency checks. If unsure, say N here. @@ -247,7 +248,7 @@ config SUN50I_IOMMU config TEGRA_IOMMU_SMMU bool "NVIDIA Tegra SMMU Support" - depends on ARCH_TEGRA + depends on ARCH_TEGRA || COMPILE_TEST depends on TEGRA_AHB depends on TEGRA_MC select IOMMU_API @@ -384,3 +385,5 @@ config SPRD_IOMMU Say Y here if you want to use the multimedia devices listed above. endif # IOMMU_SUPPORT + +source "drivers/iommu/generic_pt/Kconfig" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 355294fa9033..8e8843316c4b 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -3,6 +3,7 @@ obj-y += arm/ iommufd/ obj-$(CONFIG_AMD_IOMMU) += amd/ obj-$(CONFIG_INTEL_IOMMU) += intel/ obj-$(CONFIG_RISCV_IOMMU) += riscv/ +obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o @@ -12,6 +13,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o +obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o obj-$(CONFIG_IOMMU_IOVA) += iova.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index ecef69c11144..f2acf471cb5d 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -11,10 +11,13 @@ config AMD_IOMMU select MMU_NOTIFIER select IOMMU_API select IOMMU_IOVA - select IOMMU_IO_PGTABLE select IOMMU_SVA select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_AMDV1 + select IOMMU_PT_X86_64 depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 59c04a67f398..5412a563c697 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o +obj-y += iommu.o init.o quirks.o ppr.o pasid.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 9b4b589a54b5..25044d28f28a 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag); * the IOMMU used by this driver. 
*/ void amd_iommu_flush_all_caches(struct amd_iommu *iommu); -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size); void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index a698a2e7ce2a..78b1c44bd6b5 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -18,7 +18,7 @@ #include <linux/spinlock.h> #include <linux/pci.h> #include <linux/irqreturn.h> -#include <linux/io-pgtable.h> +#include <linux/generic_pt/iommu.h> /* * Maximum number of IOMMUs supported @@ -247,6 +247,10 @@ #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) +#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */ +#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x)) +#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */ +#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x)) /* constants for event buffer handling */ #define EVT_BUFFER_SIZE 8192 /* 512 entries */ @@ -337,76 +341,7 @@ #define GUEST_PGTABLE_4_LEVEL 0x00 #define GUEST_PGTABLE_5_LEVEL 0x01 -#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) -#define PM_LEVEL_SIZE(x) (((x) < 6) ? \ - ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ - (0xffffffffffffffffULL)) -#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) -#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) -#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ - IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW) -#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) - -#define PM_MAP_4k 0 #define PM_ADDR_MASK 0x000ffffffffff000ULL -#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ - (~((1ULL << (12 + ((lvl) * 9))) - 1))) -#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) - -/* - * Returns the page table level to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_LEVEL(pagesize) \ - ((__ffs(pagesize) - 12) / 9) -/* - * Returns the number of ptes to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_PTE_COUNT(pagesize) \ - (1ULL << ((__ffs(pagesize) - 12) % 9)) - -/* - * Aligns a given io-virtual address to a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_ALIGN(address, pagesize) \ - ((address) & ~((pagesize) - 1)) -/* - * Creates an IOMMU PTE for an address and a given pagesize - * The PTE has no permission bits set - * Pagesize is expected to be a power-of-two larger than 4096 - */ -#define PAGE_SIZE_PTE(address, pagesize) \ - (((address) | ((pagesize) - 1)) & \ - (~(pagesize >> 1)) & PM_ADDR_MASK) - -/* - * Takes a PTE value with mode=0x07 and returns the page size it maps - */ -#define PTE_PAGE_SIZE(pte) \ - (1ULL << (1 + ffz(((pte) | 0xfffULL)))) - -/* - * Takes a page-table level and returns the default page-size for this level - */ -#define PTE_LEVEL_PAGE_SIZE(level) \ - (1ULL << (12 + (9 * (level)))) - -/* - * The IOPTE dirty bit - */ -#define IOMMU_PTE_HD_BIT (6) - -/* - * Bit value definition for I/O PTE fields - */ -#define IOMMU_PTE_PR BIT_ULL(0) -#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) -#define IOMMU_PTE_U BIT_ULL(59) -#define IOMMU_PTE_FC BIT_ULL(60) -#define IOMMU_PTE_IR BIT_ULL(61) -#define IOMMU_PTE_IW BIT_ULL(62) /* * Bit value 
definition for DTE fields @@ -436,12 +371,6 @@ /* DTE[128:179] | DTE[184:191] */ #define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52) -#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) -#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) -#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) -#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) -#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) - #define IOMMU_PROT_MASK 0x03 #define IOMMU_PROT_IR 0x01 #define IOMMU_PROT_IW 0x02 @@ -534,19 +463,6 @@ struct amd_irte_ops; #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0) -#define io_pgtable_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl) - -#define io_pgtable_ops_to_data(x) \ - io_pgtable_to_data(io_pgtable_ops_to_pgtable(x)) - -#define io_pgtable_ops_to_domain(x) \ - container_of(io_pgtable_ops_to_data(x), \ - struct protection_domain, iop) - -#define io_pgtable_cfg_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl.cfg) - struct gcr3_tbl_info { u64 *gcr3_tbl; /* Guest CR3 table */ int glx; /* Number of levels for GCR3 table */ @@ -554,14 +470,6 @@ struct gcr3_tbl_info { u16 domid; /* Per device domain ID */ }; -struct amd_io_pgtable { - seqcount_t seqcount; /* Protects root/mode update */ - struct io_pgtable pgtbl; - int mode; - u64 *root; - u64 *pgd; /* v2 pgtable pgd pointer */ -}; - enum protection_domain_mode { PD_MODE_NONE, PD_MODE_V1, @@ -589,10 +497,13 @@ struct pdom_iommu_info { * independent of their use. */ struct protection_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + struct pt_iommu_x86_64 amdv2; + }; struct list_head dev_list; /* List of all devices in this domain */ - struct iommu_domain domain; /* generic domain handle used by - iommu core code */ - struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ @@ -602,6 +513,9 @@ struct protection_domain { struct mmu_notifier mn; /* mmu notifier for the SVA domain */ struct list_head dev_data_list; /* List of pdom_dev_data */ }; +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain); /* * This structure contains information about one PCI segment in the system. 
diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c index 10fa217a7119..20b04996441d 100644 --- a/drivers/iommu/amd/debugfs.c +++ b/drivers/iommu/amd/debugfs.c @@ -37,7 +37,7 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf, if (ret) return ret; - if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - 4) { + if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) { iommu->dbg_mmio_offset = -1; return -EINVAL; } diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index f2991c11867c..4f4d4955269e 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1710,13 +1710,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id, list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list); if (alloc_dev_table(pci_seg)) - return NULL; + goto err_free_pci_seg; if (alloc_alias_table(pci_seg)) - return NULL; + goto err_free_dev_table; if (alloc_rlookup_table(pci_seg)) - return NULL; + goto err_free_alias_table; return pci_seg; + +err_free_alias_table: + free_alias_table(pci_seg); +err_free_dev_table: + free_dev_table(pci_seg); +err_free_pci_seg: + list_del(&pci_seg->list); + kfree(pci_seg); + return NULL; } static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id, diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c deleted file mode 100644 index 70c2f5b1631b..000000000000 --- a/drivers/iommu/amd/io_pgtable.c +++ /dev/null @@ -1,577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table allocator. - * - * Copyright (C) 2020 Advanced Micro Devices, Inc. - * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/io-pgtable.h> -#include <linux/kernel.h> -#include <linux/sizes.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/dma-mapping.h> -#include <linux/seqlock.h> - -#include <asm/barrier.h> - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -/* - * Helper function to get the first pte of a large mapping - */ -static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, - unsigned long *count) -{ - unsigned long pte_mask, pg_size, cnt; - u64 *fpte; - - pg_size = PTE_PAGE_SIZE(*pte); - cnt = PAGE_SIZE_PTE_COUNT(pg_size); - pte_mask = ~((cnt << 3) - 1); - fpte = (u64 *)(((unsigned long)pte) & pte_mask); - - if (page_size) - *page_size = pg_size; - - if (count) - *count = cnt; - - return fpte; -} - -static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl) -{ - u64 *p; - int i; - - for (i = 0; i < 512; ++i) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - /* Large PTE? */ - if (PM_PTE_LEVEL(pt[i]) == 0 || - PM_PTE_LEVEL(pt[i]) == 7) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. 
- */ - p = IOMMU_PTE_PAGE(pt[i]); - if (lvl > 2) - free_pt_lvl(p, freelist, lvl - 1); - else - iommu_pages_list_add(freelist, p); - } - - iommu_pages_list_add(freelist, pt); -} - -static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist) -{ - switch (mode) { - case PAGE_MODE_NONE: - case PAGE_MODE_7_LEVEL: - break; - case PAGE_MODE_1_LEVEL: - iommu_pages_list_add(freelist, root); - break; - case PAGE_MODE_2_LEVEL: - case PAGE_MODE_3_LEVEL: - case PAGE_MODE_4_LEVEL: - case PAGE_MODE_5_LEVEL: - case PAGE_MODE_6_LEVEL: - free_pt_lvl(root, freelist, mode); - break; - default: - BUG(); - } -} - -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned int page_size_level, - gfp_t gfp) -{ - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - struct protection_domain *domain = - container_of(pgtable, struct protection_domain, iop); - unsigned long flags; - bool ret = true; - u64 *pte; - - pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K); - if (!pte) - return false; - - spin_lock_irqsave(&domain->lock, flags); - - if (address <= PM_LEVEL_SIZE(pgtable->mode) && - pgtable->mode - 1 >= page_size_level) - goto out; - - ret = false; - if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level)) - goto out; - - *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); - - write_seqcount_begin(&pgtable->seqcount); - pgtable->root = pte; - pgtable->mode += 1; - write_seqcount_end(&pgtable->seqcount); - - amd_iommu_update_and_flush_device_table(domain); - - pte = NULL; - ret = true; - -out: - spin_unlock_irqrestore(&domain->lock, flags); - iommu_free_pages(pte); - - return ret; -} - -static u64 *alloc_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long page_size, - u64 **pte_page, - gfp_t gfp, - bool *updated) -{ - unsigned long last_addr = address + (page_size - 1); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned int seqcount; - int level, end_lvl; - u64 *pte, *page; - - BUG_ON(!is_power_of_2(page_size)); - - while (last_addr > PM_LEVEL_SIZE(pgtable->mode) || - pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) { - /* - * Return an error if there is no memory to update the - * page-table. - */ - if (!increase_address_space(pgtable, last_addr, - PAGE_SIZE_LEVEL(page_size), gfp)) - return NULL; - } - - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - - address = PAGE_SIZE_ALIGN(address, page_size); - end_lvl = PAGE_SIZE_LEVEL(page_size); - - while (level > end_lvl) { - u64 __pte, __npte; - int pte_level; - - __pte = *pte; - pte_level = PM_PTE_LEVEL(__pte); - - /* - * If we replace a series of large PTEs, we need - * to tear down all of them. 
- */ - if (IOMMU_PTE_PRESENT(__pte) && - pte_level == PAGE_MODE_7_LEVEL) { - unsigned long count, i; - u64 *lpte; - - lpte = first_pte_l7(pte, NULL, &count); - - /* - * Unmap the replicated PTEs that still match the - * original large mapping - */ - for (i = 0; i < count; ++i) - cmpxchg64(&lpte[i], __pte, 0ULL); - - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE) { - page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, - SZ_4K); - - if (!page) - return NULL; - - __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); - - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - /* No level skipping support yet */ - if (pte_level != level) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(__pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long *page_size) -{ - int level; - unsigned int seqcount; - u64 *pte; - - *page_size = 0; - - if (address > PM_LEVEL_SIZE(pgtable->mode)) - return NULL; - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - *page_size = PTE_LEVEL_PAGE_SIZE(level); - - while (level > 0) { - - /* Not Present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Large PTE */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL || - PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE) - break; - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - /* Walk to the next level */ - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - } - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) - pte = first_pte_l7(pte, page_size, NULL); - - return pte; -} - -static void free_clear_pte(u64 *pte, u64 pteval, - struct iommu_pages_list *freelist) -{ - u64 *pt; - int mode; - - while (!try_cmpxchg64(pte, &pteval, 0)) - pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); - - if (!IOMMU_PTE_PRESENT(pteval)) - return; - - pt = IOMMU_PTE_PAGE(pteval); - mode = IOMMU_PTE_MODE(pteval); - - free_sub_pt(pt, mode, freelist); -} - -/* - * Generic mapping functions. It maps a physical address into a DMA - * address space. It allocates the page table pages if necessary. - * In the future it can be extended to a generic mapping function - * supporting all features of AMD IOMMU page tables like level skipping - * and full 64 bit address spaces. 
- */ -static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - bool updated = false; - u64 __pte, *pte; - int ret, i, count; - size_t size = pgcount << __ffs(pgsize); - unsigned long o_iova = iova; - - BUG_ON(!IS_ALIGNED(iova, pgsize)); - BUG_ON(!IS_ALIGNED(paddr, pgsize)); - - ret = -EINVAL; - if (!(prot & IOMMU_PROT_MASK)) - goto out; - - while (pgcount > 0) { - count = PAGE_SIZE_PTE_COUNT(pgsize); - pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated); - - ret = -ENOMEM; - if (!pte) - goto out; - - for (i = 0; i < count; ++i) - free_clear_pte(&pte[i], pte[i], &freelist); - - if (!iommu_pages_list_empty(&freelist)) - updated = true; - - if (count > 1) { - __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize); - __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; - } else - __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; - - if (prot & IOMMU_PROT_IR) - __pte |= IOMMU_PTE_IR; - if (prot & IOMMU_PROT_IW) - __pte |= IOMMU_PTE_IW; - - for (i = 0; i < count; ++i) - pte[i] = __pte; - - iova += pgsize; - paddr += pgsize; - pgcount--; - if (mapped) - *mapped += pgsize; - } - - ret = 0; - -out: - if (updated) { - struct protection_domain *dom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - /* - * Flush domain TLB(s) and wait for completion. Any Device-Table - * Updates and flushing already happened in - * increase_address_space(). - */ - amd_iommu_domain_flush_pages(dom, o_iova, size); - spin_unlock_irqrestore(&dom->lock, flags); - } - - /* Everything flushed out, free pages now */ - iommu_put_pages_list(&freelist); - - return ret; -} - -static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long long unmapped; - unsigned long unmap_size; - u64 *pte; - size_t size = pgcount << __ffs(pgsize); - - BUG_ON(!is_power_of_2(pgsize)); - - unmapped = 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (pte) { - int i, count; - - count = PAGE_SIZE_PTE_COUNT(unmap_size); - for (i = 0; i < count; i++) - pte[i] = 0ULL; - } else { - return unmapped; - } - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, - unsigned long flags) -{ - bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; - bool dirty = false; - int i, count; - - /* - * 2.2.3.2 Host Dirty Support - * When a non-default page size is used , software must OR the - * Dirty bits in all of the replicated host PTEs used to map - * the page. The IOMMU does not guarantee the Dirty bits are - * set in all of the replicated PTEs. 
Any portion of the page - * may have been written even if the Dirty bit is set in only - * one of the replicated PTEs. - */ - count = PAGE_SIZE_PTE_COUNT(size); - for (i = 0; i < count && test_only; i++) { - if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { - dirty = true; - break; - } - } - - for (i = 0; i < count && !test_only; i++) { - if (test_and_clear_bit(IOMMU_PTE_HD_BIT, - (unsigned long *)&ptep[i])) { - dirty = true; - } - } - - return dirty; -} - -static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long end = iova + size - 1; - - do { - unsigned long pgsize = 0; - u64 *ptep, pte; - - ptep = fetch_pte(pgtable, iova, &pgsize); - if (ptep) - pte = READ_ONCE(*ptep); - if (!ptep || !IOMMU_PTE_PRESENT(pte)) { - pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); - iova += pgsize; - continue; - } - - /* - * Mark the whole IOVA range as dirty even if only one of - * the replicated PTEs were marked dirty. - */ - if (pte_test_and_clear_dirty(ptep, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -/* - * ---------------------------------------------------- - */ -static void v1_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - - if (pgtable->mode == PAGE_MODE_NONE) - return; - - /* Page-table is not visible to IOMMU anymore, so free it */ - BUG_ON(pgtable->mode < PAGE_MODE_NONE || - pgtable->mode > amd_iommu_hpt_level); - - free_sub_pt(pgtable->root, pgtable->mode, &freelist); - iommu_put_pages_list(&freelist); -} - -static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - - pgtable->root = - iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->root) - return NULL; - pgtable->mode = PAGE_MODE_3_LEVEL; - seqcount_init(&pgtable->seqcount); - - cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; - cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys; - pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { - .alloc = v1_alloc_pgtable, - .free = v1_free_pgtable, -}; diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c deleted file mode 100644 index b47941353ccb..000000000000 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ /dev/null @@ -1,370 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table v2 allocator. - * - * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc. 
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - * Author: Vasant Hegde <vasant.hegde@amd.com> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include <linux/bitops.h> -#include <linux/io-pgtable.h> -#include <linux/kernel.h> - -#include <asm/barrier.h> - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */ -#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */ -#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */ -#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */ -#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */ -#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */ -#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */ -#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */ -#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */ - -#define MAX_PTRS_PER_PAGE 512 - -#define IOMMU_PAGE_SIZE_2M BIT_ULL(21) -#define IOMMU_PAGE_SIZE_1G BIT_ULL(30) - - -static inline int get_pgtable_level(void) -{ - return amd_iommu_gpt_level; -} - -static inline bool is_large_pte(u64 pte) -{ - return (pte & IOMMU_PAGE_PSE); -} - -static inline u64 set_pgtable_attr(u64 *page) -{ - u64 prot; - - prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER; - prot |= IOMMU_PAGE_ACCESS; - - return (iommu_virt_to_phys(page) | prot); -} - -static inline void *get_pgtable_pte(u64 pte) -{ - return iommu_phys_to_virt(pte & PM_ADDR_MASK); -} - -static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot) -{ - u64 pte; - - pte = __sme_set(paddr & PM_ADDR_MASK); - pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER; - pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; - - if (prot & IOMMU_PROT_IW) - pte |= IOMMU_PAGE_RW; - - /* Large page */ - if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M) - pte |= IOMMU_PAGE_PSE; - - return pte; -} - -static inline u64 get_alloc_page_size(u64 size) -{ - if (size >= IOMMU_PAGE_SIZE_1G) - return IOMMU_PAGE_SIZE_1G; - - if (size >= IOMMU_PAGE_SIZE_2M) - return IOMMU_PAGE_SIZE_2M; - - return PAGE_SIZE; -} - -static inline int page_size_to_level(u64 pg_size) -{ - if (pg_size == IOMMU_PAGE_SIZE_1G) - return PAGE_MODE_3_LEVEL; - if (pg_size == IOMMU_PAGE_SIZE_2M) - return PAGE_MODE_2_LEVEL; - - return PAGE_MODE_1_LEVEL; -} - -static void free_pgtable(u64 *pt, int level) -{ - u64 *p; - int i; - - for (i = 0; i < MAX_PTRS_PER_PAGE; i++) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - if (is_large_pte(pt[i])) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. 
- */ - p = get_pgtable_pte(pt[i]); - if (level > 2) - free_pgtable(p, level - 1); - else - iommu_free_pages(p); - } - - iommu_free_pages(pt); -} - -/* Allocate page table */ -static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, - unsigned long pg_size, gfp_t gfp, bool *updated) -{ - u64 *pte, *page; - int level, end_level; - - level = get_pgtable_level() - 1; - end_level = page_size_to_level(pg_size); - pte = &pgd[PM_LEVEL_INDEX(level, iova)]; - iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE); - - while (level >= end_level) { - u64 __pte, __npte; - - __pte = *pte; - - if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) { - /* Unmap large pte */ - cmpxchg64(pte, *pte, 0ULL); - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte)) { - page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K); - if (!page) - return NULL; - - __npte = set_pgtable_attr(page); - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - level -= 1; - pte = get_pgtable_pte(__pte); - pte = &pte[PM_LEVEL_INDEX(level, iova)]; - } - - /* Tear down existing pte entries */ - if (IOMMU_PTE_PRESENT(*pte)) { - u64 *__pte; - - *updated = true; - __pte = get_pgtable_pte(*pte); - cmpxchg64(pte, *pte, 0ULL); - if (pg_size == IOMMU_PAGE_SIZE_1G) - free_pgtable(__pte, end_level - 1); - else if (pg_size == IOMMU_PAGE_SIZE_2M) - iommu_free_pages(__pte); - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. - * If there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long iova, unsigned long *page_size) -{ - u64 *pte; - int level; - - level = get_pgtable_level() - 1; - pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)]; - /* Default page size is 4K */ - *page_size = PAGE_SIZE; - - while (level) { - /* Not present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Walk to the next level */ - pte = get_pgtable_pte(*pte); - pte = &pte[PM_LEVEL_INDEX(level - 1, iova)]; - - /* Large page */ - if (is_large_pte(*pte)) { - if (level == PAGE_MODE_3_LEVEL) - *page_size = IOMMU_PAGE_SIZE_1G; - else if (level == PAGE_MODE_2_LEVEL) - *page_size = IOMMU_PAGE_SIZE_2M; - else - return NULL; /* Wrongly set PSE bit in PTE */ - - break; - } - - level -= 1; - } - - return pte; -} - -static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - u64 *pte; - unsigned long map_size; - unsigned long mapped_size = 0; - unsigned long o_iova = iova; - size_t size = pgcount << __ffs(pgsize); - int ret = 0; - bool updated = false; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount) - return -EINVAL; - - if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; - - while (mapped_size < size) { - map_size = get_alloc_page_size(pgsize); - pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd, - iova, map_size, gfp, &updated); - if (!pte) { - ret = -ENOMEM; - goto out; - } - - *pte = set_pte_attr(paddr, map_size, prot); - - iova += map_size; - paddr += map_size; - mapped_size += map_size; - } - -out: - if (updated) { - struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&pdom->lock, flags); - amd_iommu_domain_flush_pages(pdom, 
o_iova, size); - spin_unlock_irqrestore(&pdom->lock, flags); - } - - if (mapped) - *mapped += mapped_size; - - return ret; -} - -static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned long unmap_size; - unsigned long unmapped = 0; - size_t size = pgcount << __ffs(pgsize); - u64 *pte; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount)) - return 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (!pte) - return unmapped; - - *pte = 0ULL; - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -/* - * ---------------------------------------------------- - */ -static void v2_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - - if (!pgtable || !pgtable->pgd) - return; - - /* Free page table */ - free_pgtable(pgtable->pgd, get_pgtable_level()); - pgtable->pgd = NULL; -} - -static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - int ias = IOMMU_IN_ADDR_BIT_SIZE; - - pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->pgd) - return NULL; - - if (get_pgtable_level() == PAGE_MODE_5_LEVEL) - ias = 57; - - pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys; - - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - cfg->ias = ias; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { - .alloc = v2_alloc_pgtable, - .free = v2_free_pgtable, -}; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2e1865daa1ce..9f1d56a5e145 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -30,7 +30,6 @@ #include <linux/msi.h> #include <linux/irqdomain.h> #include <linux/percpu.h> -#include <linux/io-pgtable.h> #include <linux/cc_platform.h> #include <asm/irq_remapping.h> #include <asm/io_apic.h> @@ -41,9 +40,9 @@ #include <asm/gart.h> #include <asm/dma.h> #include <uapi/linux/iommufd.h> +#include <linux/generic_pt/iommu.h> #include "amd_iommu.h" -#include "../dma-iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" @@ -60,7 +59,6 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; -static const struct iommu_dirty_ops amd_dirty_ops; int amd_iommu_max_glx_val = -1; @@ -70,15 +68,22 @@ int amd_iommu_max_glx_val = -1; */ DEFINE_IDA(pdom_ids); -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev); +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old); static void 
set_dte_entry(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data); + struct iommu_dev_data *dev_data, + phys_addr_t top_paddr, unsigned int top_level); + +static void amd_iommu_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level); static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); +static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); /**************************************************************************** * @@ -1157,6 +1162,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static void dump_command_buffer(struct amd_iommu *iommu) +{ + struct iommu_cmd *cmd; + u32 head, tail; + int i; + + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head), + MMIO_CMD_BUFFER_TAIL(tail)); + + for (i = 0; i < CMD_BUFFER_ENTRIES; i++) { + cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd)); + pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2], + cmd->data[3]); + } +} + static int wait_on_sem(struct amd_iommu *iommu, u64 data) { int i = 0; @@ -1167,7 +1191,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data) } if (i == LOOP_TIMEOUT) { - pr_alert("Completion-Wait loop timed out\n"); + + pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n", + iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid), + PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid)); + + if (amd_iommu_dump) + DO_ONCE_LITE(dump_command_buffer, iommu); + return -EIO; } @@ -1756,42 +1787,6 @@ static void dev_flush_pasid_all(struct iommu_dev_data *dev_data, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } -/* Flush the not present cache if it exists */ -static void domain_flush_np_cache(struct protection_domain *domain, - dma_addr_t iova, size_t size) -{ - if (unlikely(amd_iommu_np_cache)) { - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - amd_iommu_domain_flush_pages(domain, iova, size); - spin_unlock_irqrestore(&domain->lock, flags); - } -} - - -/* - * This function flushes the DTEs for all devices in domain - */ -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - - lockdep_assert_held(&domain->lock); - - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); - - set_dte_entry(iommu, dev_data); - clone_aliases(iommu, dev_data->dev); - } - - list_for_each_entry(dev_data, &domain->dev_list, list) - device_flush_dte(dev_data); - - domain_flush_complete(domain); -} - int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) { struct iommu_dev_data *dev_data; @@ -2051,7 +2046,8 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu, } static void set_dte_entry(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data) + struct iommu_dev_data *dev_data, + phys_addr_t top_paddr, unsigned int top_level) { u16 domid; u32 old_domid; @@ -2060,19 +2056,36 @@ static void set_dte_entry(struct amd_iommu *iommu, struct protection_domain *domain = dev_data->domain; struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; struct dev_table_entry *dte = 
&get_dev_table(iommu)[dev_data->devid]; + struct pt_iommu_amdv1_hw_info pt_info; + + make_clear_dte(dev_data, dte, &new); if (gcr3_info && gcr3_info->gcr3_tbl) domid = dev_data->gcr3_info.domid; - else + else { domid = domain->id; - make_clear_dte(dev_data, dte, &new); - - if (domain->iop.mode != PAGE_MODE_NONE) - new.data[0] |= iommu_virt_to_phys(domain->iop.root); + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) { + /* + * When updating the IO pagetable, the new top and level + * are provided as parameters. For other operations i.e. + * device attach, retrieve the current pagetable info + * via the IOMMU PT API. + */ + if (top_paddr) { + pt_info.host_pt_root = top_paddr; + pt_info.mode = top_level + 1; + } else { + WARN_ON(top_paddr || top_level); + pt_iommu_amdv1_hw_info(&domain->amdv1, + &pt_info); + } - new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; + new.data[0] |= __sme_set(pt_info.host_pt_root) | + (pt_info.mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + } + } new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; @@ -2138,7 +2151,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); if (set) - set_dte_entry(iommu, dev_data); + set_dte_entry(iommu, dev_data, 0, 0); else clear_dte_entry(iommu, dev_data); @@ -2156,6 +2169,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data, { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); int max_pasids = dev_data->max_pasids; + struct pt_iommu_x86_64_hw_info pt_info; int ret = 0; /* @@ -2178,7 +2192,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data, if (!pdom_is_v2_pgtbl_mode(pdom)) return ret; - ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true); + pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info); + ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true); if (ret) free_gcr3_table(&dev_data->gcr3_info); @@ -2500,94 +2515,240 @@ struct protection_domain *protection_domain_alloc(void) return domain; } -static int pdom_setup_pgtable(struct protection_domain *domain, - struct device *dev) +static bool amd_iommu_hd_support(struct amd_iommu *iommu) +{ + if (amd_iommu_hatdis) + return false; + + return iommu && (iommu->features & FEATURE_HDSUP); +} + +static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) { - struct io_pgtable_ops *pgtbl_ops; - enum io_pgtable_fmt fmt; + struct protection_domain *pdom = + container_of(iommupt, struct protection_domain, iommu); - switch (domain->pd_mode) { - case PD_MODE_V1: - fmt = AMD_IOMMU_V1; - break; - case PD_MODE_V2: - fmt = AMD_IOMMU_V2; - break; - case PD_MODE_NONE: - WARN_ON_ONCE(1); - return -EPERM; + return &pdom->lock; +} + +/* + * Update all HW references to the domain with a new pgtable configuration. 
+ */ +static void amd_iommu_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level) +{ + struct protection_domain *pdom = + container_of(iommu_table, struct protection_domain, iommu); + struct iommu_dev_data *dev_data; + + lockdep_assert_held(&pdom->lock); + + /* Update the DTE for all devices attached to this domain */ + list_for_each_entry(dev_data, &pdom->dev_list, list) { + struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); + + /* Update the HW references with the new level and top ptr */ + set_dte_entry(iommu, dev_data, top_paddr, top_level); + clone_aliases(iommu, dev_data->dev); } - domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev); - pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain); - if (!pgtbl_ops) - return -ENOMEM; + list_for_each_entry(dev_data, &pdom->dev_list, list) + device_flush_dte(dev_data); + + domain_flush_complete(pdom); +} + +/* + * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to + * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non + * present caching (like hypervisor shadowing). + */ +static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, + unsigned long iova, size_t size) +{ + struct protection_domain *domain = to_pdomain(dom); + unsigned long flags; + if (likely(!amd_iommu_np_cache)) + return 0; + + spin_lock_irqsave(&domain->lock, flags); + amd_iommu_domain_flush_pages(domain, iova, size); + spin_unlock_irqrestore(&domain->lock, flags); return 0; } -static inline u64 dma_max_address(enum protection_domain_mode pgtable) +static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { - if (pgtable == PD_MODE_V1) - return PM_LEVEL_SIZE(amd_iommu_hpt_level); + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; - /* - * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page - * Translation" shows that the V2 table sign extends the top of the - * address space creating a reserved region in the middle of the - * translation, just like the CPU does. Further Vasant says the docs are - * incomplete and this only applies to non-zero PASIDs. If the AMDv2 - * page table is assigned to the 0 PASID then there is no sign extension - * check. - * - * Since the IOMMU must have a fixed geometry, and the core code does - * not understand sign extended addressing, we have to chop off the high - * bit to get consistent behavior with attachments of the domain to any - * PASID. 
- */ - return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1); + spin_lock_irqsave(&dom->lock, flags); + amd_iommu_domain_flush_all(dom); + spin_unlock_irqrestore(&dom->lock, flags); } -static bool amd_iommu_hd_support(struct amd_iommu *iommu) +static void amd_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - if (amd_iommu_hatdis) - return false; + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; - return iommu && (iommu->features & FEATURE_HDSUP); + spin_lock_irqsave(&dom->lock, flags); + amd_iommu_domain_flush_pages(dom, gather->start, + gather->end - gather->start + 1); + spin_unlock_irqrestore(&dom->lock, flags); + iommu_put_pages_list(&gather->freelist); } -static struct iommu_domain * -do_iommu_domain_alloc(struct device *dev, u32 flags, - enum protection_domain_mode pgtable) +static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { + .get_top_lock = amd_iommu_get_top_lock, + .change_top = amd_iommu_change_top, +}; + +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .iotlb_sync_map = amd_iommu_iotlb_sync_map, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, + .attach_dev = amd_iommu_attach_device, + .free = amd_iommu_domain_free, + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, +}; + +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = amd_iommu_set_dirty_tracking, +}; + +static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev, + u32 flags) { - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); + struct pt_iommu_amdv1_cfg cfg = {}; struct protection_domain *domain; int ret; + if (amd_iommu_hatdis) + return ERR_PTR(-EOPNOTSUPP); + domain = protection_domain_alloc(); if (!domain) return ERR_PTR(-ENOMEM); - domain->pd_mode = pgtable; - ret = pdom_setup_pgtable(domain, dev); + domain->pd_mode = PD_MODE_V1; + domain->iommu.driver_ops = &amd_hw_driver_ops_v1; + domain->iommu.nid = dev_to_node(dev); + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + domain->domain.dirty_ops = &amdv1_dirty_ops; + + /* + * Someday FORCE_COHERENCE should be set by + * amd_iommu_enforce_cache_coherency() like VT-d does. + */ + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); + + /* + * AMD's IOMMU can flush as many pages as necessary in a single flush. + * Unless we run in a virtual machine, which can be inferred according + * to whether "non-present cache" is on, it is probably best to prefer + * (potentially) too extensive TLB flushing (i.e., more misses) over + * multiple TLB flushes (i.e., more flushes). For virtual machines the + * hypervisor needs to synchronize the host IOMMU PTEs with those of + * the guest, and the trade-off is different: unnecessary TLB flushes + * should be avoided. 
+ */ + if (amd_iommu_np_cache) + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); + else + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); + + cfg.common.hw_max_vasz_lg2 = + min(64, (amd_iommu_hpt_level - 1) * 9 + 21); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.starting_level = 2; + domain->domain.ops = &amdv1_ops; + + ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); if (ret) { - pdom_id_free(domain->id); - kfree(domain); + amd_iommu_domain_free(&domain->domain); return ERR_PTR(ret); } - domain->domain.geometry.aperture_start = 0; - domain->domain.geometry.aperture_end = dma_max_address(pgtable); - domain->domain.geometry.force_aperture = true; - domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; + /* + * Narrow the supported page sizes to those selected by the kernel + * command line. + */ + domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; + return &domain->domain; +} - domain->domain.type = IOMMU_DOMAIN_UNMANAGED; - domain->domain.ops = iommu->iommu.ops->default_domain_ops; +static const struct iommu_domain_ops amdv2_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), + .iotlb_sync_map = amd_iommu_iotlb_sync_map, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, + .attach_dev = amd_iommu_attach_device, + .free = amd_iommu_domain_free, + /* + * Note the AMDv2 page table format does not support a Force Coherency + * bit, so enforce_cache_coherency should not be set. However VFIO is + * not prepared to handle a case where some domains will support + * enforcement and others do not. VFIO and iommufd will have to be fixed + * before it can fully use the V2 page table. See the comment in + * iommufd_hwpt_paging_alloc(). For now leave things as they have + * historically been and lie about enforce_cache_coherencey. + */ + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, +}; - if (dirty_tracking) - domain->domain.dirty_ops = &amd_dirty_ops; +static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, + u32 flags) +{ + struct pt_iommu_x86_64_cfg cfg = {}; + struct protection_domain *domain; + int ret; + if (!amd_iommu_v2_pgtbl_supported()) + return ERR_PTR(-EOPNOTSUPP); + + domain = protection_domain_alloc(); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->pd_mode = PD_MODE_V2; + domain->iommu.nid = dev_to_node(dev); + + cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); + if (amd_iommu_np_cache) + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); + else + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); + + /* + * The v2 table behaves differently if it is attached to PASID 0 vs a + * non-zero PASID. On PASID 0 it has no sign extension and the full + * 57/48 bits decode the lower addresses. Otherwise it behaves like a + * normal sign extended x86 page table. Since we want the domain to work + * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not + * set which creates a table that is compatible in both modes. 
+ */ + if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { + cfg.common.hw_max_vasz_lg2 = 56; + cfg.top_level = 4; + } else { + cfg.common.hw_max_vasz_lg2 = 47; + cfg.top_level = 3; + } + cfg.common.hw_max_oasz_lg2 = 52; + domain->domain.ops = &amdv2_ops; + + ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); + if (ret) { + amd_iommu_domain_free(&domain->domain); + return ERR_PTR(ret); + } return &domain->domain; } @@ -2608,15 +2769,27 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, /* Allocate domain with v1 page table for dirty tracking */ if (!amd_iommu_hd_support(iommu)) break; - return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); + return amd_iommu_domain_alloc_paging_v1(dev, flags); case IOMMU_HWPT_ALLOC_PASID: /* Allocate domain with v2 page table if IOMMU supports PASID. */ if (!amd_iommu_pasid_supported()) break; - return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); - case 0: + return amd_iommu_domain_alloc_paging_v2(dev, flags); + case 0: { + struct iommu_domain *ret; + /* If nothing specific is required use the kernel commandline default */ - return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); + if (amd_iommu_pgtable == PD_MODE_V1) { + ret = amd_iommu_domain_alloc_paging_v1(dev, flags); + if (ret != ERR_PTR(-EOPNOTSUPP)) + return ret; + return amd_iommu_domain_alloc_paging_v2(dev, flags); + } + ret = amd_iommu_domain_alloc_paging_v2(dev, flags); + if (ret != ERR_PTR(-EOPNOTSUPP)) + return ret; + return amd_iommu_domain_alloc_paging_v1(dev, flags); + } default: break; } @@ -2628,14 +2801,14 @@ void amd_iommu_domain_free(struct iommu_domain *dom) struct protection_domain *domain = to_pdomain(dom); WARN_ON(!list_empty(&domain->dev_list)); - if (domain->domain.type & __IOMMU_DOMAIN_PAGING) - free_io_pgtable_ops(&domain->iop.pgtbl.ops); + pt_iommu_deinit(&domain->iommu); pdom_id_free(domain->id); kfree(domain); } static int blocked_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); @@ -2685,16 +2858,8 @@ void amd_iommu_init_identity_domain(void) protection_domain_init(&identity_domain); } -/* Same as blocked domain except it supports only ops->attach_dev() */ -static struct iommu_domain release_domain = { - .type = IOMMU_DOMAIN_BLOCKED, - .ops = &(const struct iommu_domain_ops) { - .attach_dev = blocked_domain_attach_device, - } -}; - -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev) +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); @@ -2734,93 +2899,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return ret; } -static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, - unsigned long iova, size_t size) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - - if (ops->map_pages) - domain_flush_np_cache(domain, iova, size); - return 0; -} - -static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int iommu_prot, gfp_t gfp, size_t *mapped) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - int prot = 0; - int ret = -EINVAL; - - if ((domain->pd_mode == PD_MODE_V1) && - (domain->iop.mode == 
PAGE_MODE_NONE)) - return -EINVAL; - - if (iommu_prot & IOMMU_READ) - prot |= IOMMU_PROT_IR; - if (iommu_prot & IOMMU_WRITE) - prot |= IOMMU_PROT_IW; - - if (ops->map_pages) { - ret = ops->map_pages(ops, iova, paddr, pgsize, - pgcount, prot, gfp, mapped); - } - - return ret; -} - -static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size) -{ - /* - * AMD's IOMMU can flush as many pages as necessary in a single flush. - * Unless we run in a virtual machine, which can be inferred according - * to whether "non-present cache" is on, it is probably best to prefer - * (potentially) too extensive TLB flushing (i.e., more misses) over - * mutliple TLB flushes (i.e., more flushes). For virtual machines the - * hypervisor needs to synchronize the host IOMMU PTEs with those of - * the guest, and the trade-off is different: unnecessary TLB flushes - * should be avoided. - */ - if (amd_iommu_np_cache && - iommu_iotlb_gather_is_disjoint(gather, iova, size)) - iommu_iotlb_sync(domain, gather); - - iommu_iotlb_gather_add_range(gather, iova, size); -} - -static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - size_t r; - - if ((domain->pd_mode == PD_MODE_V1) && - (domain->iop.mode == PAGE_MODE_NONE)) - return 0; - - r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0; - - if (r) - amd_iommu_iotlb_gather_add_page(dom, gather, iova, r); - - return r; -} - -static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, - dma_addr_t iova) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - - return ops->iova_to_phys(ops, iova); -} - static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) { switch (cap) { @@ -2887,28 +2965,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct protection_domain *pdomain = to_pdomain(domain); - struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops; - unsigned long lflags; - - if (!ops || !ops->read_and_clear_dirty) - return -EOPNOTSUPP; - - spin_lock_irqsave(&pdomain->lock, lflags); - if (!pdomain->dirty_tracking && dirty->bitmap) { - spin_unlock_irqrestore(&pdomain->lock, lflags); - return -EINVAL; - } - spin_unlock_irqrestore(&pdomain->lock, lflags); - - return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); -} - static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2978,28 +3034,6 @@ static bool amd_iommu_is_attach_deferred(struct device *dev) return dev_data->defer_attach; } -static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) -{ - struct protection_domain *dom = to_pdomain(domain); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_all(dom); - spin_unlock_irqrestore(&dom->lock, flags); -} - -static void amd_iommu_iotlb_sync(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather) -{ - struct protection_domain *dom = to_pdomain(domain); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_pages(dom, gather->start, - gather->end - 
gather->start + 1); - spin_unlock_irqrestore(&dom->lock, flags); -} - static int amd_iommu_def_domain_type(struct device *dev) { struct iommu_dev_data *dev_data; @@ -3034,15 +3068,10 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } -static const struct iommu_dirty_ops amd_dirty_ops = { - .set_dirty_tracking = amd_iommu_set_dirty_tracking, - .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, -}; - const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .blocked_domain = &blocked_domain, - .release_domain = &release_domain, + .release_domain = &blocked_domain, .identity_domain = &identity_domain.domain, .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, .domain_alloc_sva = amd_iommu_domain_alloc_sva, @@ -3053,17 +3082,6 @@ const struct iommu_ops amd_iommu_ops = { .is_attach_deferred = amd_iommu_is_attach_deferred, .def_domain_type = amd_iommu_def_domain_type, .page_response = amd_iommu_page_response, - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = amd_iommu_attach_device, - .map_pages = amd_iommu_map_pages, - .unmap_pages = amd_iommu_unmap_pages, - .iotlb_sync_map = amd_iommu_iotlb_sync_map, - .iova_to_phys = amd_iommu_iova_to_phys, - .flush_iotlb_all = amd_iommu_flush_iotlb_all, - .iotlb_sync = amd_iommu_iotlb_sync, - .free = amd_iommu_domain_free, - .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, - } }; #ifdef CONFIG_IRQ_REMAP @@ -3354,7 +3372,7 @@ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, struct irte_ga *irte) { - bool ret; + int ret; ret = __modify_irte_ga(iommu, devid, index, irte); if (ret) @@ -4072,3 +4090,5 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) return 0; } #endif + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 95a4e62b8f63..83a5aabcd15d 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -672,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain, } static int apple_dart_attach_dev_paging(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret, i; struct apple_dart_stream_map *stream_map; @@ -693,7 +694,8 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain, } static int apple_dart_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; @@ -717,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = { }; static int apple_dart_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; @@ -802,6 +805,8 @@ static int apple_dart_of_xlate(struct device *dev, struct apple_dart *cfg_dart; int i, sid; + put_device(&iommu_pdev->dev); + if (args->args_count != 1) return -EINVAL; sid = args->args[0]; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 8cd8929bbfdf..93fdadd07431 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -99,6 +99,8 @@ static void 
arm_smmu_make_nested_domain_ste( int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, struct arm_smmu_nested_domain *nested_domain) { + unsigned int cfg = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])); struct arm_smmu_vmaster *vmaster; unsigned long vsid; int ret; @@ -107,8 +109,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core, state->master->dev, &vsid); - if (ret) + /* + * Attaching to a translate nested domain must allocate a vDEVICE prior, + * as CD/ATS invalidations and vevents require a vSID to work properly. + * A abort/bypass domain is allowed to attach w/o vmaster for GBPA case. + */ + if (ret) { + if (cfg == STRTAB_STE_0_CFG_ABORT || + cfg == STRTAB_STE_0_CFG_BYPASS) + return 0; return ret; + } vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL); if (!vmaster) @@ -138,14 +149,15 @@ void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) } static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_nested_domain *nested_domain = to_smmu_nested_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { .master = master, - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; struct arm_smmu_ste ste; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2a8b46b948f0..d16d35c78c06 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1464,7 +1464,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma, GFP_KERNEL); - if (!cd_table->l2.l2ptrs) { + if (!cd_table->l2.l1tab) { ret = -ENOMEM; goto err_free_l2ptrs; } @@ -3002,7 +3002,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) master->ats_enabled = state->ats_enabled; } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old_domain) { int ret = 0; struct arm_smmu_ste target; @@ -3010,7 +3011,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_attach_state state = { - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; struct arm_smmu_master *master; @@ -3186,7 +3187,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, /* * When the last user of the CD table goes away downgrade the STE back - * to a non-cd_table one. + * to a non-cd_table one, by re-attaching its sid_domain. 
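+ * sid_domain is the identity or blocked domain the core still considers
+ * attached at the RID, which is why it is passed as its own 'old' domain
+ * in the attach_dev() call below.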
*/ if (!arm_smmu_ssids_in_use(&master->cd_table)) { struct iommu_domain *sid_domain = @@ -3194,12 +3195,14 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || sid_domain->type == IOMMU_DOMAIN_BLOCKED) - sid_domain->ops->attach_dev(sid_domain, dev); + sid_domain->ops->attach_dev(sid_domain, dev, + sid_domain); } return 0; } static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, + struct iommu_domain *old_domain, struct device *dev, struct arm_smmu_ste *ste, unsigned int s1dss) @@ -3207,7 +3210,7 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { .master = master, - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; @@ -3248,14 +3251,16 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_master_clear_vmaster(master); arm_smmu_make_bypass_ste(master->smmu, &ste); - arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, + STRTAB_STE_1_S1DSS_BYPASS); return 0; } @@ -3269,14 +3274,15 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_master_clear_vmaster(master); arm_smmu_make_abort_ste(&ste); - arm_smmu_attach_dev_ste(domain, dev, &ste, + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, STRTAB_STE_1_S1DSS_TERMINATE); return 0; } @@ -3582,12 +3588,6 @@ static void arm_smmu_release_device(struct device *dev) WARN_ON(master->iopf_refcount); - /* Put the STE back to what arm_smmu_init_strtab() sets */ - if (dev->iommu->require_direct) - arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); - else - arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); - arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); if (arm_smmu_cdtab_allocated(&master->cd_table)) @@ -3678,6 +3678,7 @@ static int arm_smmu_def_domain_type(struct device *dev) static const struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, + .release_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .hw_info = arm_smmu_hw_info, .domain_alloc_sva = arm_smmu_sva_domain_alloc, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 57c097e87613..573085349df3 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -367,6 +367,7 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,adreno" }, { .compatible = "qcom,adreno-gmu" }, + { .compatible = "qcom,glymur-mdss" }, { .compatible = "qcom,mdp4" }, { .compatible = "qcom,mdss" }, { .compatible = "qcom,qcm2290-mdss" }, @@ -431,17 +432,19 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) /* * Some platforms support more than the 
Arm SMMU architected maximum of - * 128 stream matching groups. For unknown reasons, the additional - * groups don't exhibit the same behavior as the architected registers, - * so limit the groups to 128 until the behavior is fixed for the other - * groups. + * 128 stream matching groups. The additional registers appear to have + * the same behavior as the architected registers in the hardware. + * However, on some firmware versions, the hypervisor does not + * correctly trap and emulate accesses to the additional registers, + * resulting in unexpected behavior. + * + * If there are more than 128 groups, use the last reliable group to + * detect if we need to apply the bypass quirk. */ - if (smmu->num_mapping_groups > 128) { - dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); - smmu->num_mapping_groups = 128; - } - - last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); + if (smmu->num_mapping_groups > 128) + last_s2cr = ARM_SMMU_GR0_S2CR(127); + else + last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); /* * With some firmware versions writes to S2CR of type FAULT are @@ -464,6 +467,11 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS); arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg); + + if (smmu->num_mapping_groups > 128) { + dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); + smmu->num_mapping_groups = 128; + } } for (i = 0; i < smmu->num_mapping_groups; i++) { diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 4ced4b5bee4d..5e690cf85ec9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1165,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg, } } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -1234,7 +1235,8 @@ static int arm_smmu_attach_dev_type(struct device *dev, } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS); } @@ -1249,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT); } diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index c5be95e56031..f69d9276dc55 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -359,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) kfree(qcom_domain); } -static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int qcom_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); @@ -388,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev } static int qcom_iommu_identity_attach(struct 
iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct qcom_iommu_domain *qcom_domain; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); unsigned int i; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - qcom_domain = to_qcom_iommu_domain(domain); + qcom_domain = to_qcom_iommu_domain(old); if (WARN_ON(!qcom_domain->iommu)) return -EINVAL; @@ -565,14 +566,14 @@ static int qcom_iommu_of_xlate(struct device *dev, qcom_iommu = platform_get_drvdata(iommu_pdev); + put_device(&iommu_pdev->dev); + /* make sure the asid specified in dt is valid, so we don't have * to sanity check this elsewhere: */ if (WARN_ON(asid > qcom_iommu->max_asid) || - WARN_ON(qcom_iommu->ctxs[asid] == NULL)) { - put_device(&iommu_pdev->dev); + WARN_ON(qcom_iommu->ctxs[asid] == NULL)) return -EINVAL; - } if (!dev_iommu_priv_get(dev)) { dev_iommu_priv_set(dev, qcom_iommu); @@ -581,10 +582,8 @@ static int qcom_iommu_of_xlate(struct device *dev, * multiple different iommu devices. Multiple context * banks are ok, but multiple devices are not: */ - if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) { - put_device(&iommu_pdev->dev); + if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) return -EINVAL; - } } return iommu_fwspec_add_ids(dev, &asid, 1); diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 7944a3af4545..c92088855450 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, * as a bus address, __finalise_sg() will copy the dma * address into the output segment. 
*/ - s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(s)); + s->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(s)); sg_dma_len(s) = sg->length; sg_dma_mark_bus_address(s); continue; @@ -2008,7 +2008,7 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev, end - addr, iovad->granule - iova_start_pad); if (!dev_is_dma_coherent(dev) && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_cpu(phys, len, dir); swiotlb_tbl_unmap_single(dev, phys, len, dir, attrs); @@ -2032,7 +2032,8 @@ static void __iommu_dma_iova_unlink(struct device *dev, size_t unmapped; if ((state->__size & DMA_IOVA_USE_SWIOTLB) || - (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))) + (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))) iommu_dma_iova_unlink_range_slow(dev, addr, size, dir, attrs); iommu_iotlb_gather_init(&iotlb_gather); diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index b6edd178fe25..b512c6b939ac 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -984,7 +984,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) } static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct exynos_iommu_domain *domain; @@ -1035,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = { }; static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); @@ -1044,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, unsigned long flags; int err; - err = exynos_iommu_identity_attach(&exynos_identity_domain, dev); + err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old); if (err) return err; @@ -1429,8 +1431,6 @@ static void exynos_iommu_release_device(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct sysmmu_drvdata *data; - WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev)); - list_for_each_entry(data, &owner->controllers, owner_node) device_link_del(data->link); } @@ -1446,17 +1446,14 @@ static int exynos_iommu_of_xlate(struct device *dev, return -ENODEV; data = platform_get_drvdata(sysmmu); - if (!data) { - put_device(&sysmmu->dev); + put_device(&sysmmu->dev); + if (!data) return -ENODEV; - } if (!owner) { owner = kzalloc(sizeof(*owner), GFP_KERNEL); - if (!owner) { - put_device(&sysmmu->dev); + if (!owner) return -ENOMEM; - } INIT_LIST_HEAD(&owner->controllers); mutex_init(&owner->rpm_lock); @@ -1476,6 +1473,7 @@ static int exynos_iommu_of_xlate(struct device *dev, static const struct iommu_ops exynos_iommu_ops = { .identity_domain = &exynos_identity_domain, + .release_domain = &exynos_identity_domain, .domain_alloc_paging = exynos_iommu_domain_alloc_paging, .device_group = generic_device_group, .probe_device = exynos_iommu_probe_device, diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 5f08523f97cb..9664ef9840d2 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val) } 
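The recurring change in these driver hunks is the attach_dev() signature: the core now hands the callback the previously attached domain instead of the driver looking it up with iommu_get_domain_for_dev(). A minimal sketch of a converted callback; the mydrv_* names, types, and helpers are hypothetical stand-ins, not anything defined by this patch:

#include <linux/iommu.h>

static int mydrv_attach_dev(struct iommu_domain *domain, struct device *dev,
			    struct iommu_domain *old)
{
	/* mydrv_master, mydrv_domain and the helpers below are illustrative only */
	struct mydrv_master *master = dev_iommu_priv_get(dev);

	/*
	 * 'old' is whatever the core had attached before this call: it may be
	 * NULL on the first attach, or the blocked/identity domain, so check
	 * its type before tearing anything down.
	 */
	if (old && old->type == IOMMU_DOMAIN_UNMANAGED)
		mydrv_detach_domain(master, to_mydrv_domain(old));

	return mydrv_install_domain(master, to_mydrv_domain(domain));
}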
static int fsl_pamu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); unsigned long flags; @@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, * switches to what looks like BLOCKING. */ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct fsl_dma_domain *dma_domain; const u32 *prop; int len; @@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, * Hack to keep things working as they always have, only leaving an * UNMANAGED domain makes it BLOCKING. */ - if (domain == platform_domain || !domain || - domain->type != IOMMU_DOMAIN_UNMANAGED) + if (old == platform_domain || !old || + old->type != IOMMU_DOMAIN_UNMANAGED) return 0; - dma_domain = to_fsl_dma_domain(domain); + dma_domain = to_fsl_dma_domain(old); /* * Use LIODN of the PCI controller while detaching a diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig new file mode 100644 index 000000000000..52ac9e661ffd --- /dev/null +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -0,0 +1,14 @@ +CONFIG_KUNIT=y +CONFIG_GENERIC_PT=y +CONFIG_DEBUG_GENERIC_PT=y +CONFIG_IOMMU_PT=y +CONFIG_IOMMU_PT_AMDV1=y +CONFIG_IOMMU_PT_VTDSS=y +CONFIG_IOMMU_PT_X86_64=y +CONFIG_IOMMU_PT_KUNIT_TEST=y + +CONFIG_IOMMUFD=y +CONFIG_DEBUG_KERNEL=y +CONFIG_FAULT_INJECTION=y +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_IOMMUFD_TEST=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig new file mode 100644 index 000000000000..ce4fb4786914 --- /dev/null +++ b/drivers/iommu/generic_pt/Kconfig @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig GENERIC_PT + bool "Generic Radix Page Table" if COMPILE_TEST + help + Generic library for building radix tree page tables. + + Generic PT provides a set of HW page table formats and a common + set of APIs to work with them. + +if GENERIC_PT +config DEBUG_GENERIC_PT + bool "Extra debugging checks for GENERIC_PT" + help + Enable extra run time debugging checks for GENERIC_PT code. This + incurs a runtime cost and should not be enabled for production + kernels. + + The kunit tests require this to be enabled to get full coverage. + +config IOMMU_PT + tristate "IOMMU Page Tables" + select IOMMU_API + depends on IOMMU_SUPPORT + depends on GENERIC_PT + help + Generic library for building IOMMU page tables + + IOMMU_PT provides an implementation of the page table operations + related to struct iommu_domain using GENERIC_PT. It provides a single + implementation of the page table operations that can be shared by + multiple drivers. + +if IOMMU_PT +config IOMMU_PT_AMDV1 + tristate "IOMMU page table for 64-bit AMD IOMMU v1" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the AMD v1 page table. AMDv1 is the + "host" page table. It supports granular page sizes of almost every + power of 2 and decodes the full 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_VTDSS + tristate "IOMMU page table for Intel VT-d Second Stage" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5 + level Second Stage page table. 
It is similar to the X86_64 format with + 4K/2M/1G page sizes. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_X86_64 + tristate "IOMMU page table for x86 64-bit, 4/5 levels" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the x86 64-bit 4/5 level page table. + It supports 4K/2M/1G page sizes and can decode a sign-extended + portion of the 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_KUNIT_TEST + tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 + depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 + depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS + default KUNIT_ALL_TESTS + help + Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the + enabled page table formats. The test covers most of the GENERIC_PT + functions provided by the page table format, as well as covering the + iommu_domain related functions. + +endif +endif diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile new file mode 100644 index 000000000000..976b49ec97dc --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0 + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 +iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 + +IOMMU_PT_KUNIT_TEST := +define create_format +obj-$(2) += iommu_$(1).o +iommu_pt_kunit_test-y += kunit_iommu_$(1).o +CFLAGS_kunit_iommu_$(1).o += -DGENERIC_PT_KUNIT=1 +IOMMU_PT_KUNIT_TEST := iommu_pt_kunit_test.o + +endef + +$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y))) +$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m))) + +# The kunit objects are constructed by compiling the main source +# with -DGENERIC_PT_KUNIT +$(obj)/kunit_iommu_%.o: $(src)/iommu_%.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) + +obj-$(CONFIG_IOMMU_PT_KUNIT_TEST) += $(IOMMU_PT_KUNIT_TEST) diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h new file mode 100644 index 000000000000..aa8e1a8ec95f --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -0,0 +1,411 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * AMD IOMMU v1 page table + * + * This is described in Section "2.2.3 I/O Page Tables for Host Translations" + * of the "AMD I/O Virtualization Technology (IOMMU) Specification" + * + * Note the level numbering here matches the core code, so level 0 is the same + * as mode 1. + * + */ +#ifndef __GENERIC_PT_FMT_AMDV1_H +#define __GENERIC_PT_FMT_AMDV1_H + +#include "defs_amdv1.h" +#include "../pt_defs.h" + +#include <asm/page.h> +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/mem_encrypt.h> +#include <linux/minmax.h> +#include <linux/sizes.h> +#include <linux/string.h> + +enum { + PT_ITEM_WORD_SIZE = sizeof(u64), + /* + * The IOMMUFD selftest uses the AMDv1 format with some alterations It + * uses a 2k page size to test cases where the CPU page size is not the + * same. 
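+ * With the 2k granule this works out to 11 + 5 * 9 = 56 decoded VA bits
+ * across levels 0..4, compared with 12 + 5 * 9 + 7 = 64 bits for the
+ * native 4k layout below.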
+ */ +#ifdef AMDV1_IOMMUFD_SELFTEST + PT_MAX_VA_ADDRESS_LG2 = 56, + PT_MAX_OUTPUT_ADDRESS_LG2 = 51, + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 11, +#else + PT_MAX_VA_ADDRESS_LG2 = 64, + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_TOP_LEVEL = 5, + PT_GRANULE_LG2SZ = 12, +#endif + PT_TABLEMEM_LG2SZ = 12, + + /* The DTE only has these bits for the top phyiscal address */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* PTE bits */ +enum { + AMDV1PT_FMT_PR = BIT(0), + AMDV1PT_FMT_D = BIT(6), + AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9), + AMDV1PT_FMT_OA = GENMASK_ULL(51, 12), + AMDV1PT_FMT_FC = BIT_ULL(60), + AMDV1PT_FMT_IR = BIT_ULL(61), + AMDV1PT_FMT_IW = BIT_ULL(62), +}; + +/* + * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make + * these defines to avoid it. + */ +#define AMDV1PT_FMT_NL_DEFAULT 0 +#define AMDV1PT_FMT_NL_SIZE 7 + +static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ); +} +#define pt_table_pa amdv1pt_table_pa + +/* Returns the oa for the start of the contiguous entry */ +static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + pt_oaddr_t oa; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + oa = FIELD_GET(AMDV1PT_FMT_OA, entry); + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) { + unsigned int sz_bits = oaffz(oa); + + oa = oalog2_set_mod(oa, 0, sz_bits); + } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) != + AMDV1PT_FMT_NL_DEFAULT)) + return 0; + return oalog2_mul(oa, PT_GRANULE_LG2SZ); +} +#define pt_entry_oa amdv1pt_entry_oa + +static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts) +{ + /* + * Table 15: Page Table Level Parameters + * The top most level cannot have translation entries + */ + return pts->level < PT_MAX_TOP_LEVEL; +} +#define pt_can_have_leaf amdv1pt_can_have_leaf + +/* Body in pt_fmt_defaults.h */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +static inline unsigned int +amdv1pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + u32 code; + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) == + AMDV1PT_FMT_NL_DEFAULT) + return ilog2(1); + + PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) != + AMDV1PT_FMT_NL_SIZE); + + /* + * The contiguous size is encoded in the length of a string of 1's in + * the low bits of the OA. Reverse the equation: + * code = log2_to_int(num_contig_lg2 + item_lg2sz - + * PT_GRANULE_LG2SZ - 1) - 1 + * Which can be expressed as: + * num_contig_lg2 = oalog2_ffz(code) + 1 - + * item_lg2sz - PT_GRANULE_LG2SZ + * + * Assume the bit layout is correct and remove the masking. Reorganize + * the equation to move all the arithmetic before the ffz. + */ + code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 + + pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ); + return ffz_t(u32, code); +} +#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2 + +static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts) +{ + /* + * Top entry covers bits [63:57] only, this is handled through + * max_vasz_lg2. 
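+ * i.e. a level 5 table holds only 2^7 = 128 items, while every lower
+ * level holds the full 2^(12 - 3) = 512.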
+ */ + if (PT_WARN_ON(pts->level == 5)) + return 7; + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 amdv1pt_num_items_lg2 + +static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!amdv1pt_can_have_leaf(pts)) + return 0; + + /* + * Table 14: Example Page Size Encodings + * Address bits 51:32 can be used to encode page sizes greater than 4 + * Gbytes. Address bits 63:52 are zero-extended. + * + * 512GB Pages are not supported due to a hardware bug. + * Otherwise every power of two size is supported. + */ + return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1), + isz_lg2) & ~SZ_512G; +} +#define pt_possible_sizes amdv1pt_possible_sizes + +static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64) + pts->index; + unsigned int next_level; + u64 entry; + + pts->entry = entry = READ_ONCE(*tablep); + if (!(entry & AMDV1PT_FMT_PR)) + return PT_ENTRY_EMPTY; + + next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry); + if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT || + next_level == AMDV1PT_FMT_NL_SIZE) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw amdv1pt_load_entry_raw + +static inline void +amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + + if (oasz_lg2 == isz_lg2) { + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_DEFAULT); + WRITE_ONCE(*tablep, entry); + } else { + unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_SIZE) | + FIELD_PREP(AMDV1PT_FMT_OA, + oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ - + 1) - + 1); + + /* See amdv1pt_clear_entries() */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, entry); + } else { + memset64(tablep, entry, log2_to_int(num_contig_lg2)); + } + } + pts->entry = entry; +} +#define pt_install_leaf_entry amdv1pt_install_leaf_entry + +static inline bool amdv1pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + /* + * IR and IW are ANDed from the table levels along with the PTE. We + * always control permissions from the PTE, so always set IR and IW for + * tables. 
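+ * A cleared IR or IW in any table entry would revoke that permission for
+ * everything mapped beneath it, regardless of the leaf PTEs.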
+ */ + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) | + FIELD_PREP(AMDV1PT_FMT_OA, + log2_div(table_pa, PT_GRANULE_LG2SZ)) | + AMDV1PT_FMT_IR | AMDV1PT_FMT_IW; + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table amdv1pt_install_table + +static inline void amdv1pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = + pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW); +} +#define pt_attr_from_entry amdv1pt_attr_from_entry + +static inline void amdv1pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + /* + * gcc generates rep stos for the io-pgtable code, and this difference + * can show in microbenchmarks with larger contiguous page sizes. + * rep is slower for small cases. + */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); + } else { + memset64(tablep, 0, log2_to_int(num_contig_lg2)); + } +} +#define pt_clear_entries amdv1pt_clear_entries + +static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + if (READ_ONCE(*tablep) & AMDV1PT_FMT_D) + return true; + return false; +} +#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty + +static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D); +} +#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean + +static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | AMDV1PT_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_amdv1 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_amdv1, iommu) + ->amdpt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu; +} + +static inline int amdv1pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE)) + pte |= AMDV1PT_FMT_FC; + if (iommu_prot & IOMMU_READ) + pte |= AMDV1PT_FMT_IR; + if (iommu_prot & IOMMU_WRITE) + pte |= AMDV1PT_FMT_IW; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. 
+ */ + if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot amdv1pt_iommu_set_prot + +static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table, + const struct pt_iommu_amdv1_cfg *cfg) +{ + struct pt_amdv1 *table = &iommu_table->amdpt; + unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2; + + if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL) + return -EINVAL; + + if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) && + cfg->starting_level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) * + (cfg->starting_level + 1); + + table->common.max_vasz_lg2 = + min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2); + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + pt_top_set_level(&table->common, cfg->starting_level); + return 0; +} +#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init + +#ifndef PT_FMT_VARIANT +static inline void +amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, + const struct pt_range *top_range, + struct pt_iommu_amdv1_hw_info *info) +{ + info->host_pt_root = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK); + info->mode = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info +#endif + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = { + /* Matches what io_pgtable does */ + [0] = { .starting_level = 2 }, +}; +#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = 0 }; +#endif + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h new file mode 100644 index 000000000000..0b9614ca6d10 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H +#define __GENERIC_PT_FMT_DEFS_AMDV1_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct amdv1pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs amdv1pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h new file mode 100644 index 000000000000..4a239bcaae2a --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H +#define __GENERIC_PT_FMT_DEFS_VTDSS_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct vtdss_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs vtdss_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h new file mode 100644 index 000000000000..6f589e1f55d3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H +#define __GENERIC_PT_FMT_DEFS_X86_64_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 
pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct x86_64_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs x86_64_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c new file mode 100644 index 000000000000..72a2337d0c55 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT amdv1 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \ + BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) +#define PT_FORCE_ENABLED_FEATURES \ + (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c new file mode 100644 index 000000000000..74e597cba9d9 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define AMDV1_IOMMUFD_SELFTEST 1 +#define PT_FMT amdv1 +#define PT_FMT_VARIANT mock +#define PT_SUPPORTED_FEATURES 0 + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h new file mode 100644 index 000000000000..d28e86abdf2e --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Template to build the iommu module and kunit from the format and + * implementation headers. + * + * The format should have: + * #define PT_FMT <name> + * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy)) + * And optionally: + * #define PT_FORCE_ENABLED_FEATURES .. + * #define PT_FMT_VARIANT <suffix> + */ +#include <linux/args.h> +#include <linux/stringify.h> + +#ifdef PT_FMT_VARIANT +#define PTPFX_RAW \ + CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT) +#else +#define PTPFX_RAW PT_FMT +#endif + +#define PTPFX CONCATENATE(PTPFX_RAW, _) + +#define _PT_FMT_H PT_FMT.h +#define PT_FMT_H __stringify(_PT_FMT_H) + +#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H) +#define PT_DEFS_H __stringify(_PT_DEFS_H) + +#include <linux/generic_pt/common.h> +#include PT_DEFS_H +#include "../pt_defs.h" +#include PT_FMT_H +#include "../pt_common.h" + +#ifndef GENERIC_PT_KUNIT +#include "../iommu_pt.h" +#else +/* + * The makefile will compile the .c file twice, once with GENERIC_PT_KUNIT set + * which means we are building the kunit module. 
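+ * e.g. iommu_amdv1.c is compiled a second time into kunit_iommu_amdv1.o by
+ * the rule in fmt/Makefile with this define set; with PT_FMT=amdv1 and
+ * PT_FMT_VARIANT=mock the PTPFX prefix above expands to amdv1_mock_.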
+ */ +#include "../kunit_generic_pt.h" +#include "../kunit_iommu_pt.h" +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c new file mode 100644 index 000000000000..f551711e2a33 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT vtdss +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \ + BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c new file mode 100644 index 000000000000..5472660c2d71 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT x86_64 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ + BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h new file mode 100644 index 000000000000..f5f8981edde7 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/vtdss.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + * Intel VT-d Second Stage 5/4 level page table + * + * This is described in + * Section "3.7 Second-Stage Translation" + * Section "9.8 Second-Stage Paging Entries" + * + * Of the "Intel Virtualization Technology for Directed I/O Architecture + * Specification". 
+ * + * The named levels in the spec map to the pts->level as: + * Table/SS-PTE - 0 + * Directory/SS-PDE - 1 + * Directory Ptr/SS-PDPTE - 2 + * PML4/SS-PML4E - 3 + * PML5/SS-PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_VTDSS_H +#define __GENERIC_PT_FMT_VTDSS_H + +#include "defs_vtdss.h" +#include "../pt_defs.h" + +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/log2.h> + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* SSPTPTR is 4k aligned and limited by HAW */ + PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12), +}; + +/* Shared descriptor bits */ +enum { + VTDSS_FMT_R = BIT(0), + VTDSS_FMT_W = BIT(1), + VTDSS_FMT_A = BIT(8), + VTDSS_FMT_D = BIT(9), + VTDSS_FMT_SNP = BIT(11), + VTDSS_FMT_OA = GENMASK_ULL(51, 12), +}; + +/* PDPTE/PDE */ +enum { + VTDSS_FMT_PS = BIT(7), +}; + +#define common_to_vtdss_pt(common_ptr) \ + container_of_const(common_ptr, struct pt_vtdss, common) +#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common) + +static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa vtdss_pt_table_pa + +static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa vtdss_pt_entry_oa + +static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf vtdss_pt_can_have_leaf + +static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 vtdss_pt_num_items_lg2 + +static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!entry) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw vtdss_pt_load_entry_raw + +static inline void +vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= VTDSS_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry vtdss_pt_install_leaf_entry + +static inline bool vtdss_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = VTDSS_FMT_R | VTDSS_FMT_W | + FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + return pt_table_install64(pts, entry); +} +#define pt_install_table vtdss_pt_install_table + +static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP); +} +#define pt_attr_from_entry vtdss_pt_attr_from_entry + +static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + 
return READ_ONCE(*tablep) & VTDSS_FMT_D; +} +#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty + +static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D); +} +#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean + +static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | VTDSS_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty + +static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common) +{ + return 10; +} +#define pt_max_sw_bit vtdss_pt_max_sw_bit + +static inline u64 vtdss_pt_sw_bit(unsigned int bitnr) +{ + if (__builtin_constant_p(bitnr) && bitnr > 10) + BUILD_BUG(); + + /* Bits marked Ignored in the specification */ + switch (bitnr) { + case 0: + return BIT(10); + case 1 ... 9: + return BIT_ULL((bitnr - 1) + 52); + case 10: + return BIT_ULL(63); + /* Some bits in 9-3 are available in some entries */ + default: + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit vtdss_pt_sw_bit + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_vtdss + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->vtdss_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, vtdss_pt.common) + ->iommu; +} + +static inline int vtdss_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + /* + * VTDSS does not have a present bit, so we tell if any entry is present + * by checking for R or W. 
+ */ + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) + return -EINVAL; + + if (iommu_prot & IOMMU_READ) + pte |= VTDSS_FMT_R; + if (iommu_prot & IOMMU_WRITE) + pte |= VTDSS_FMT_W; + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE)) + pte |= VTDSS_FMT_SNP; + + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) && + !(iommu_prot & IOMMU_WRITE)) { + pr_err_ratelimited( + "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot vtdss_pt_iommu_set_prot + +static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table, + const struct pt_iommu_vtdss_cfg *cfg) +{ + struct pt_vtdss *table = &iommu_table->vtdss_pt; + + if (cfg->top_level > 4 || cfg->top_level < 2) + return -EOPNOTSUPP; + + pt_top_set_level(&table->common, cfg->top_level); + return 0; +} +#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init + +static inline void +vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table, + const struct pt_range *top_range, + struct pt_iommu_vtdss_hw_info *info) +{ + info->ssptptr = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK); + /* + * top_level = 2 = 3 level table aw=1 + * top_level = 3 = 4 level table aw=2 + * top_level = 4 = 5 level table aw=3 + */ + info->aw = top_range->top_level - 1; +} +#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = { + [0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2}, + [1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3}, + [2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4}, +}; +#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) }; +#endif +#endif diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h new file mode 100644 index 000000000000..210748d9d6e8 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/x86_64.h @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * x86 page table. Supports the 4 and 5 level variations. + * + * The 4 and 5 level version is described in: + * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software + * Developer's Manual Volume 3 + * + * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization + * Technology for Directed I/O Architecture Specification" + * + * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O + * Virtualization Technology (IOMMU) Specification" + * + * It is used by x86 CPUs, AMD and VT-d IOMMU HW. + * + * Note the 3 level format is very similar and almost implemented here. The + * reserved/ignored layout is different and there are functional bit + * differences. + * + * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower + * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign + * extended addressing with this page table format. 
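+ * With PT_FEAT_SIGN_EXTEND and a 4-level table, for example, the usable
+ * IOVA ranges are 0 .. 0x00007fffffffffff and 0xffff800000000000 ..
+ * 0xffffffffffffffff, i.e. bit 47 is replicated into bits 63:48.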
+ * + * The named levels in the spec map to the pts->level as: + * Table/PTE - 0 + * Directory/PDE - 1 + * Directory Ptr/PDPTE - 2 + * PML4/PML4E - 3 + * PML5/PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_X86_64_H +#define __GENERIC_PT_FMT_X86_64_H + +#include "defs_x86_64.h" +#include "../pt_defs.h" + +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/log2.h> +#include <linux/mem_encrypt.h> + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* + * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k + * aligned and is limited by the architected HAW + */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* Shared descriptor bits */ +enum { + X86_64_FMT_P = BIT(0), + X86_64_FMT_RW = BIT(1), + X86_64_FMT_U = BIT(2), + X86_64_FMT_A = BIT(5), + X86_64_FMT_D = BIT(6), + X86_64_FMT_OA = GENMASK_ULL(51, 12), + X86_64_FMT_XD = BIT_ULL(63), +}; + +/* PDPTE/PDE */ +enum { + X86_64_FMT_PS = BIT(7), +}; + +static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa x86_64_pt_table_pa + +static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa x86_64_pt_entry_oa + +static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf x86_64_pt_can_have_leaf + +static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 x86_64_pt_num_items_lg2 + +static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!(entry & X86_64_FMT_P)) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw x86_64_pt_load_entry_raw + +static inline void +x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = X86_64_FMT_P | + FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= X86_64_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry x86_64_pt_install_leaf_entry + +static inline bool x86_64_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table x86_64_pt_install_table + +static inline void 
x86_64_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + X86_64_FMT_D | X86_64_FMT_XD); +} +#define pt_attr_from_entry x86_64_pt_attr_from_entry + +static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common) +{ + return 12; +} +#define pt_max_sw_bit x86_64_pt_max_sw_bit + +static inline u64 x86_64_pt_sw_bit(unsigned int bitnr) +{ + if (__builtin_constant_p(bitnr) && bitnr > 12) + BUILD_BUG(); + + /* Bits marked Ignored/AVL in the specification */ + switch (bitnr) { + case 0: + return BIT(9); + case 1: + return BIT(11); + case 2 ... 12: + return BIT_ULL((bitnr - 2) + 52); + /* Some bits in 8,6,4,3 are available in some entries */ + default: + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit x86_64_pt_sw_bit + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_x86_64 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->x86_64_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, x86_64_pt.common) + ->iommu; +} + +static inline int x86_64_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte; + + pte = X86_64_FMT_U | X86_64_FMT_A; + if (iommu_prot & IOMMU_WRITE) + pte |= X86_64_FMT_RW | X86_64_FMT_D; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. 
+ */ + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot x86_64_pt_iommu_set_prot + +static inline int +x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table, + const struct pt_iommu_x86_64_cfg *cfg) +{ + struct pt_x86_64 *table = &iommu_table->x86_64_pt; + + if (cfg->top_level < 3 || cfg->top_level > 4) + return -EOPNOTSUPP; + + pt_top_set_level(&table->common, cfg->top_level); + + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + return 0; +} +#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init + +static inline void +x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table, + const struct pt_range *top_range, + struct pt_iommu_x86_64_hw_info *info) +{ + info->gcr3_pt = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK); + info->levels = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = { + [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 48, .top_level = 3 }, + [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 57, .top_level = 4 }, + /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */ + [2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 }, + [3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4}, +}; +#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)}; +#endif +#endif diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h new file mode 100644 index 000000000000..97aeda1ad01c --- /dev/null +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -0,0 +1,1289 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * "Templated C code" for implementing the iommu operations for page tables. + * This is compiled multiple times, over all the page table formats to pick up + * the per-format definitions. 
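+ *
+ * In rough terms (the exact wiring lives in the per-format sources), each
+ * format header supplies the pt_* inline helpers plus the PTPFX/PTPFX_RAW
+ * prefix and then pulls this template in, so generic entry points such as
+ * DOMAIN_NS(map_pages) compile into per-format symbols, for example
+ * pt_iommu_x86_64_map_pages() for the x86_64 format.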
+ */ +#ifndef __GENERIC_PT_IOMMU_PT_H +#define __GENERIC_PT_IOMMU_PT_H + +#include "pt_iter.h" + +#include <linux/export.h> +#include <linux/iommu.h> +#include "../iommu-pages.h" +#include <linux/cleanup.h> +#include <linux/dma-mapping.h> + +enum { + SW_BIT_CACHE_FLUSH_DONE = 0, +}; + +static void flush_writes_range(const struct pt_state *pts, + unsigned int start_index, unsigned int end_index) +{ + if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_flush_incoherent( + iommu_from_common(pts->range->common)->iommu_device, + pts->table, start_index * PT_ITEM_WORD_SIZE, + (end_index - start_index) * PT_ITEM_WORD_SIZE); +} + +static void flush_writes_item(const struct pt_state *pts) +{ + if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_flush_incoherent( + iommu_from_common(pts->range->common)->iommu_device, + pts->table, pts->index * PT_ITEM_WORD_SIZE, + PT_ITEM_WORD_SIZE); +} + +static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather, + struct pt_iommu *iommu_table, pt_vaddr_t iova, + pt_vaddr_t len, + struct iommu_pages_list *free_list) +{ + struct pt_common *common = common_from_iommu(iommu_table); + + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(free_list, + iommu_table->iommu_device); + + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && + iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { + iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); + /* + * Note that the sync frees the gather's free list, so we must + * not have any pages on that list that are covered by iova/len + */ + } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) { + iommu_iotlb_gather_add_range(iotlb_gather, iova, len); + } + + iommu_pages_list_splice(free_list, &iotlb_gather->freelist); +} + +#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op) + +static int make_range_ul(struct pt_common *common, struct pt_range *range, + unsigned long iova, unsigned long len) +{ + unsigned long last; + + if (unlikely(len == 0)) + return -EINVAL; + + if (check_add_overflow(iova, len - 1, &last)) + return -EOVERFLOW; + + *range = pt_make_range(common, iova, last); + if (sizeof(iova) > sizeof(range->va)) { + if (unlikely(range->va != iova || range->last_va != last)) + return -EOVERFLOW; + } + return 0; +} + +static __maybe_unused int make_range_u64(struct pt_common *common, + struct pt_range *range, u64 iova, + u64 len) +{ + if (unlikely(iova > ULONG_MAX || len > ULONG_MAX)) + return -EOVERFLOW; + return make_range_ul(common, range, iova, len); +} + +/* + * Some APIs use unsigned long, while othersuse dma_addr_t as the type. Dispatch + * to the correct validation based on the type. + */ +#define make_range_no_check(common, range, iova, len) \ + ({ \ + int ret; \ + if (sizeof(iova) > sizeof(unsigned long) || \ + sizeof(len) > sizeof(unsigned long)) \ + ret = make_range_u64(common, range, iova, len); \ + else \ + ret = make_range_ul(common, range, iova, len); \ + ret; \ + }) + +#define make_range(common, range, iova, len) \ + ({ \ + int ret = make_range_no_check(common, range, iova, len); \ + if (!ret) \ + ret = pt_check_range(range); \ + ret; \ + }) + +static inline unsigned int compute_best_pgsize(struct pt_state *pts, + pt_oaddr_t oa) +{ + struct pt_iommu *iommu_table = iommu_from_common(pts->range->common); + + if (!pt_can_have_leaf(pts)) + return 0; + + /* + * The page size is limited by the domain's bitmap. This allows the core + * code to reduce the supported page sizes by changing the bitmap. 
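+	 *
+	 * For example, with a bitmap advertising 4K and 2M entries, a
+	 * 2M-aligned IOVA mapped to a 2M-aligned output address for at least
+	 * 2M selects the 2M size, while any misalignment of the IOVA or OA
+	 * falls back to 4K.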
+ */ + return pt_compute_best_pgsize(pt_possible_sizes(pts) & + iommu_table->domain.pgsize_bitmap, + pts->range->va, pts->range->last_va, oa); +} + +static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + pt_oaddr_t *res = arg; + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, descend_fn); + case PT_ENTRY_OA: + *res = pt_entry_oa_exact(&pts); + return 0; + } + return -ENOENT; +} +PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys); + +/** + * iova_to_phys() - Return the output address for the given IOVA + * @domain: Table to query + * @iova: IO virtual address to query + * + * Determine the output address from the given IOVA. @iova may have any + * alignment, the returned physical will be adjusted with any sub page offset. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Return: 0 if there is no translation for the given iova. + */ +phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_range range; + pt_oaddr_t res; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + ret = pt_walk_range(&range, __iova_to_phys, &res); + /* PHYS_ADDR_MAX would be a better error code */ + if (ret) + return 0; + return res; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); + +struct pt_iommu_dirty_args { + struct iommu_dirty_bitmap *dirty; + unsigned int flags; +}; + +static void record_dirty(struct pt_state *pts, + struct pt_iommu_dirty_args *dirty, + unsigned int num_contig_lg2) +{ + pt_vaddr_t dirty_len; + + if (num_contig_lg2 != ilog2(1)) { + unsigned int index = pts->index; + unsigned int end_index = log2_set_mod_max_t( + unsigned int, pts->index, num_contig_lg2); + + /* Adjust for being contained inside a contiguous page */ + end_index = min(end_index, pts->end_index); + dirty_len = (end_index - index) * + log2_to_int(pt_table_item_lg2sz(pts)); + } else { + dirty_len = log2_to_int(pt_table_item_lg2sz(pts)); + } + + if (dirty->dirty->bitmap) + iova_bitmap_set(dirty->dirty->bitmap, pts->range->va, + dirty_len); + + if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) { + /* + * No write log required because DMA incoherence and atomic + * dirty tracking bits can't work together + */ + pt_entry_make_write_clean(pts); + iommu_iotlb_gather_add_range(dirty->dirty->gather, + pts->range->va, dirty_len); + } +} + +static inline int __read_and_clear_dirty(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_dirty_args *dirty = arg; + int ret; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + ret = pt_descend(&pts, arg, __read_and_clear_dirty); + if (ret) + return ret; + continue; + } + if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts)) + record_dirty(&pts, dirty, + pt_entry_num_contig_lg2(&pts)); + } + return 0; +} + +/** + * read_and_clear_dirty() - Manipulate the HW set write dirty state + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the IOVA + * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR + * @dirty: Place to store the dirty bits + * + * Iterate over all the 
entries in the mapped range and record their write dirty + * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is not specified then + * the entries will be left dirty, otherwise they are returned to being not + * write dirty. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Returns: -ERRNO on failure, 0 on success. + */ +int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_iommu_dirty_args dirty_args = { + .dirty = dirty, + .flags = flags, + }; + struct pt_range range; + int ret; + +#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty) + return -EOPNOTSUPP; +#endif + + ret = make_range(common_from_iommu(iommu_table), &range, iova, size); + if (ret) + return ret; + + ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args); + PT_WARN_ON(ret); + return ret; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU"); + +static inline int __set_dirty(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, __set_dirty); + case PT_ENTRY_OA: + if (!pt_entry_make_write_dirty(&pts)) + return -EAGAIN; + return 0; + } + return -ENOENT; +} + +static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table, + dma_addr_t iova) +{ + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + /* + * Note: There is no locking here yet, if the test suite races this it + * can crash. It should use RCU locking eventually. + */ + return pt_walk_range(&range, __set_dirty, NULL); +} + +struct pt_iommu_collect_args { + struct iommu_pages_list free_list; + /* Fail if any OAs are within the range */ + u8 check_mapped : 1; +}; + +static int __collect_tables(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_collect_args *collect = arg; + int ret; + + if (!collect->check_mapped && !pt_can_have_table(&pts)) + return 0; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + iommu_pages_list_add(&collect->free_list, pts.table_lower); + ret = pt_descend(&pts, arg, __collect_tables); + if (ret) + return ret; + continue; + } + if (pts.type == PT_ENTRY_OA && collect->check_mapped) + return -EADDRINUSE; + } + return 0; +} + +enum alloc_mode {ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH}; + +/* Allocate a table, the empty table will be ready to be installed. 
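+ * For PT_FEAT_DMA_INCOHERENT tables, ALLOC_NORMAL also runs
+ * iommu_pages_start_incoherent() on the new table before returning it;
+ * ALLOC_DEFER_COHERENT_FLUSH callers such as increase_top() instead do this
+ * in one batch later via iommu_pages_start_incoherent_list().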
*/ +static inline struct pt_table_p *_table_alloc(struct pt_common *common, + size_t lg2sz, gfp_t gfp, + enum alloc_mode mode) +{ + struct pt_iommu *iommu_table = iommu_from_common(common); + struct pt_table_p *table_mem; + + table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp, + log2_to_int(lg2sz)); + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + mode == ALLOC_NORMAL) { + int ret = iommu_pages_start_incoherent( + table_mem, iommu_table->iommu_device); + if (ret) { + iommu_free_pages(table_mem); + return ERR_PTR(ret); + } + } + return table_mem; +} + +static inline struct pt_table_p *table_alloc_top(struct pt_common *common, + uintptr_t top_of_table, + gfp_t gfp, + enum alloc_mode mode) +{ + /* + * Top doesn't need the free list or otherwise, so it technically + * doesn't need to use iommu pages. Use the API anyhow as the top is + * usually not smaller than PAGE_SIZE to keep things simple. + */ + return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table), + gfp, mode); +} + +/* Allocate an interior table */ +static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts, + gfp_t gfp, enum alloc_mode mode) +{ + struct pt_state child_pts = + pt_init(parent_pts->range, parent_pts->level - 1, NULL); + + return _table_alloc(parent_pts->range->common, + pt_num_items_lg2(&child_pts) + + ilog2(PT_ITEM_WORD_SIZE), + gfp, mode); +} + +static inline int pt_iommu_new_table(struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + struct pt_table_p *table_mem; + phys_addr_t phys; + + /* Given PA/VA/length can't be represented */ + if (PT_WARN_ON(!pt_can_have_table(pts))) + return -ENXIO; + + table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + + phys = virt_to_phys(table_mem); + if (!pt_install_table(pts, phys, attrs)) { + iommu_pages_free_incoherent( + table_mem, + iommu_from_common(pts->range->common)->iommu_device); + return -EAGAIN; + } + + if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) { + flush_writes_item(pts); + pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE); + } + + if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + /* + * The underlying table can't store the physical table address. + * This happens when kunit testing tables outside their normal + * environment where a CPU might be limited. + */ + pt_load_single_entry(pts); + if (PT_WARN_ON(pt_table_pa(pts) != phys)) { + pt_clear_entries(pts, ilog2(1)); + iommu_pages_free_incoherent( + table_mem, iommu_from_common(pts->range->common) + ->iommu_device); + return -EINVAL; + } + } + + pts->table_lower = table_mem; + return 0; +} + +struct pt_iommu_map_args { + struct iommu_iotlb_gather *iotlb_gather; + struct pt_write_attrs attrs; + pt_oaddr_t oa; + unsigned int leaf_pgsize_lg2; + unsigned int leaf_level; +}; + +/* + * This will recursively check any tables in the block to validate they are + * empty and then free them through the gather. 
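+ * Any still-mapped OA found underneath makes this return -EADDRINUSE, so an
+ * oversized mapping cannot silently overwrite an existing translation.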
+ */ +static int clear_contig(const struct pt_state *start_pts, + struct iommu_iotlb_gather *iotlb_gather, + unsigned int step, unsigned int pgsize_lg2) +{ + struct pt_iommu *iommu_table = + iommu_from_common(start_pts->range->common); + struct pt_range range = *start_pts->range; + struct pt_state pts = + pt_init(&range, start_pts->level, start_pts->table); + struct pt_iommu_collect_args collect = { .check_mapped = true }; + int ret; + + pts.index = start_pts->index; + pts.end_index = start_pts->index + step; + for (; _pt_iter_load(&pts); pt_next_entry(&pts)) { + if (pts.type == PT_ENTRY_TABLE) { + collect.free_list = + IOMMU_PAGES_LIST_INIT(collect.free_list); + ret = pt_walk_descend_all(&pts, __collect_tables, + &collect); + if (ret) + return ret; + + /* + * The table item must be cleared before we can update + * the gather + */ + pt_clear_entries(&pts, ilog2(1)); + flush_writes_item(&pts); + + iommu_pages_list_add(&collect.free_list, + pt_table_ptr(&pts)); + gather_range_pages( + iotlb_gather, iommu_table, range.va, + log2_to_int(pt_table_item_lg2sz(&pts)), + &collect.free_list); + } else if (pts.type != PT_ENTRY_EMPTY) { + return -EADDRINUSE; + } + } + return 0; +} + +static int __map_range_leaf(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; + unsigned int start_index; + pt_oaddr_t oa = map->oa; + unsigned int step; + bool need_contig; + int ret = 0; + + PT_WARN_ON(map->leaf_level != level); + PT_WARN_ON(!pt_can_have_leaf(&pts)); + + step = log2_to_int_t(unsigned int, + leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts)); + need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts); + + _pt_iter_first(&pts); + start_index = pts.index; + do { + pts.type = pt_load_entry_raw(&pts); + if (pts.type != PT_ENTRY_EMPTY || need_contig) { + if (pts.index != start_index) + pt_index_to_va(&pts); + ret = clear_contig(&pts, map->iotlb_gather, step, + leaf_pgsize_lg2); + if (ret) + break; + } + + if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + pt_index_to_va(&pts); + PT_WARN_ON(compute_best_pgsize(&pts, oa) != + leaf_pgsize_lg2); + } + pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs); + + oa += log2_to_int(leaf_pgsize_lg2); + pts.index += step; + } while (pts.index < pts.end_index); + + flush_writes_range(&pts, start_index, pts.index); + + map->oa = oa; + return ret; +} + +static int __map_range(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + int ret; + + PT_WARN_ON(map->leaf_level == level); + PT_WARN_ON(!pt_can_have_table(&pts)); + + _pt_iter_first(&pts); + + /* Descend to a child table */ + do { + pts.type = pt_load_entry_raw(&pts); + + if (pts.type != PT_ENTRY_TABLE) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + ret = pt_iommu_new_table(&pts, &map->attrs); + if (ret) { + /* + * Racing with another thread installing a table + */ + if (ret == -EAGAIN) + continue; + return ret; + } + } else { + pts.table_lower = pt_table_ptr(&pts); + /* + * Racing with a shared pt_iommu_new_table()? The other + * thread is still flushing the cache, so we have to + * also flush it to ensure that when our thread's map + * completes all the table items leading to our mapping + * are visible. 
+ * + * This requires the pt_set_bit_release() to be a + * release of the cache flush so that this can acquire + * visibility at the iommu. + */ + if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) && + !pt_test_sw_bit_acquire(&pts, + SW_BIT_CACHE_FLUSH_DONE)) + flush_writes_item(&pts); + } + + /* + * The already present table can possibly be shared with another + * concurrent map. + */ + if (map->leaf_level == level - 1) + ret = pt_descend(&pts, arg, __map_range_leaf); + else + ret = pt_descend(&pts, arg, __map_range); + if (ret) + return ret; + + pts.index++; + pt_index_to_va(&pts); + if (pts.index >= pts.end_index) + break; + } while (true); + return 0; +} + +/* + * Fast path for the easy case of mapping a 4k page to an already allocated + * table. This is a common workload. If it returns EAGAIN run the full algorithm + * instead. + */ +static __always_inline int __do_map_single_page(struct pt_range *range, + void *arg, unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + + pts.type = pt_load_single_entry(&pts); + if (level == 0) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT, + &map->attrs); + /* No flush, not used when incoherent */ + map->oa += PAGE_SIZE; + return 0; + } + if (pts.type == PT_ENTRY_TABLE) + return pt_descend(&pts, arg, descend_fn); + /* Something else, use the slow path */ + return -EAGAIN; +} +PT_MAKE_LEVELS(__map_single_page, __do_map_single_page); + +/* + * Add a table to the top, increasing the top level as much as necessary to + * encompass range. + */ +static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list); + struct pt_common *common = common_from_iommu(iommu_table); + uintptr_t top_of_table = READ_ONCE(common->top_of_table); + uintptr_t new_top_of_table = top_of_table; + struct pt_table_p *table_mem; + unsigned int new_level; + spinlock_t *domain_lock; + unsigned long flags; + int ret; + + while (true) { + struct pt_range top_range = + _pt_top_range(common, new_top_of_table); + struct pt_state pts = pt_init_top(&top_range); + + top_range.va = range->va; + top_range.last_va = range->last_va; + + if (!pt_check_range(&top_range) && + map->leaf_level <= pts.level) { + new_level = pts.level; + break; + } + + pts.level++; + if (pts.level > PT_MAX_TOP_LEVEL || + pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) { + ret = -ERANGE; + goto err_free; + } + + table_mem = + table_alloc_top(common, _pt_top_set(NULL, pts.level), + map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH); + if (IS_ERR(table_mem)) { + ret = PTR_ERR(table_mem); + goto err_free; + } + iommu_pages_list_add(&free_list, table_mem); + + /* The new table links to the lower table always at index 0 */ + top_range.va = 0; + top_range.top_level = pts.level; + pts.table_lower = pts.table; + pts.table = table_mem; + pt_load_single_entry(&pts); + PT_WARN_ON(pts.index != 0); + pt_install_table(&pts, virt_to_phys(pts.table_lower), + &map->attrs); + new_top_of_table = _pt_top_set(pts.table, pts.level); + } + + /* + * Avoid double flushing, flush it once after all pt_install_table() + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + ret = iommu_pages_start_incoherent_list( + &free_list, iommu_table->iommu_device); + if (ret) + goto err_free; + } + + /* + * top_of_table is write locked by the spinlock, but readers 
can use + * READ_ONCE() to get the value. Since we encode both the level and the + * pointer in one quanta the lockless reader will always see something + * valid. The HW must be updated to the new level under the spinlock + * before top_of_table is updated so that concurrent readers don't map + * into the new level until it is fully functional. If another thread + * already updated it while we were working then throw everything away + * and try again. + */ + domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table); + spin_lock_irqsave(domain_lock, flags); + if (common->top_of_table != top_of_table || + top_of_table == new_top_of_table) { + spin_unlock_irqrestore(domain_lock, flags); + ret = -EAGAIN; + goto err_free; + } + + /* + * We do not issue any flushes for change_top on the expectation that + * any walk cache will not become a problem by adding another layer to + * the tree. Misses will rewalk from the updated top pointer, hits + * continue to be correct. Negative caching is fine too since all the + * new IOVA added by the new top is non-present. + */ + iommu_table->driver_ops->change_top( + iommu_table, virt_to_phys(table_mem), new_level); + WRITE_ONCE(common->top_of_table, new_top_of_table); + spin_unlock_irqrestore(domain_lock, flags); + return 0; + +err_free: + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&free_list, + iommu_table->iommu_device); + iommu_put_pages_list(&free_list); + return ret; +} + +static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct pt_common *common = common_from_iommu(iommu_table); + int ret; + + do { + ret = pt_check_range(range); + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + return ret; + + if (!ret && map->leaf_level <= range->top_level) + break; + + ret = increase_top(iommu_table, range, map); + if (ret && ret != -EAGAIN) + return ret; + + /* Reload the new top */ + *range = pt_make_range(common, range->va, range->last_va); + } while (ret); + PT_WARN_ON(pt_check_range(range)); + return 0; +} + +static int do_map(struct pt_range *range, struct pt_common *common, + bool single_page, struct pt_iommu_map_args *map) +{ + /* + * The __map_single_page() fast path does not support DMA_INCOHERENT + * flushing to keep its .text small. + */ + if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + int ret; + + ret = pt_walk_range(range, __map_single_page, map); + if (ret != -EAGAIN) + return ret; + /* EAGAIN falls through to the full path */ + } + + if (map->leaf_level == range->top_level) + return pt_walk_range(range, __map_range_leaf, map); + return pt_walk_range(range, __map_range, map); +} + +/** + * map_pages() - Install translation for an IOVA range + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * @mapped: Total bytes successfully mapped + * + * The range starting at IOVA will have paddr installed into it. The caller + * must specify a valid pgsize and pgcount to segment the range into compatible + * blocks. + * + * On error the caller will probably want to invoke unmap on the range from iova + * up to the amount indicated by @mapped to return the table back to an + * unchanged state. 
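+ *
+ * An illustrative caller sketch (real users normally go through iommu_map()
+ * in the core; initialisation of the iotlb_gather needed by unmap_pages()
+ * is elided here):
+ *
+ *	size_t mapped = 0;
+ *	int rc;
+ *
+ *	rc = domain->ops->map_pages(domain, iova, paddr, pgsize, pgcount,
+ *				    IOMMU_READ | IOMMU_WRITE, GFP_KERNEL,
+ *				    &mapped);
+ *	if (rc)
+ *		domain->ops->unmap_pages(domain, iova, pgsize,
+ *					 mapped / pgsize, &gather);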
+ * + * Context: The caller must hold a write range lock that includes the whole + * range. + * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were + * mapped are added to @mapped, @mapped is not zerod first. + */ +int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; + struct pt_common *common = common_from_iommu(iommu_table); + struct iommu_iotlb_gather iotlb_gather; + pt_vaddr_t len = pgsize * pgcount; + struct pt_iommu_map_args map = { + .iotlb_gather = &iotlb_gather, + .oa = paddr, + .leaf_pgsize_lg2 = vaffs(pgsize), + }; + bool single_page = false; + struct pt_range range; + int ret; + + iommu_iotlb_gather_init(&iotlb_gather); + + if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE)))) + return -EINVAL; + + /* Check the paddr doesn't exceed what the table can store */ + if ((sizeof(pt_oaddr_t) < sizeof(paddr) && + (pt_vaddr_t)paddr > PT_VADDR_MAX) || + (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 && + oalog2_div(paddr, common->max_oasz_lg2))) + return -ERANGE; + + ret = pt_iommu_set_prot(common, &map.attrs, prot); + if (ret) + return ret; + map.attrs.gfp = gfp; + + ret = make_range_no_check(common, &range, iova, len); + if (ret) + return ret; + + /* Calculate target page size and level for the leaves */ + if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && + pgcount == 1) { + PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); + if (log2_mod(iova | paddr, PAGE_SHIFT)) + return -ENXIO; + map.leaf_pgsize_lg2 = PAGE_SHIFT; + map.leaf_level = 0; + single_page = true; + } else { + map.leaf_pgsize_lg2 = pt_compute_best_pgsize( + pgsize_bitmap, range.va, range.last_va, paddr); + if (!map.leaf_pgsize_lg2) + return -ENXIO; + map.leaf_level = + pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); + } + + ret = check_map_range(iommu_table, &range, &map); + if (ret) + return ret; + + PT_WARN_ON(map.leaf_level > range.top_level); + + ret = do_map(&range, common, single_page, &map); + + /* + * Table levels were freed and replaced with large items, flush any walk + * cache that may refer to the freed levels. + */ + if (!iommu_pages_list_empty(&iotlb_gather.freelist)) + iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather); + + /* Bytes successfully mapped */ + PT_WARN_ON(!ret && map.oa - paddr != len); + *mapped += map.oa - paddr; + return ret; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); + +struct pt_unmap_args { + struct iommu_pages_list free_list; + pt_vaddr_t unmapped; +}; + +static __maybe_unused int __unmap_range(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_unmap_args *unmap = arg; + unsigned int num_oas = 0; + unsigned int start_index; + int ret = 0; + + _pt_iter_first(&pts); + start_index = pts.index; + pts.type = pt_load_entry_raw(&pts); + /* + * A starting index is in the middle of a contiguous entry + * + * The IOMMU API does not require drivers to support unmapping parts of + * large pages. Long ago VFIO would try to split maps but the current + * version never does. + * + * Instead when unmap reaches a partial unmap of the start of a large + * IOPTE it should remove the entire IOPTE and return that size to the + * caller. 
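+	 * For example, a request to unmap only the first 4K of a 2M leaf
+	 * removes the whole 2M entry and accounts the full 2M in the unmapped
+	 * total.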
+ */ + if (pts.type == PT_ENTRY_OA) { + if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts))) + return -EINVAL; + /* Micro optimization */ + goto start_oa; + } + + do { + if (pts.type != PT_ENTRY_OA) { + bool fully_covered; + + if (pts.type != PT_ENTRY_TABLE) { + ret = -EINVAL; + break; + } + + if (pts.index != start_index) + pt_index_to_va(&pts); + pts.table_lower = pt_table_ptr(&pts); + + fully_covered = pt_entry_fully_covered( + &pts, pt_table_item_lg2sz(&pts)); + + ret = pt_descend(&pts, arg, __unmap_range); + if (ret) + break; + + /* + * If the unmapping range fully covers the table then we + * can free it as well. The clear is delayed until we + * succeed in clearing the lower table levels. + */ + if (fully_covered) { + iommu_pages_list_add(&unmap->free_list, + pts.table_lower); + pt_clear_entries(&pts, ilog2(1)); + } + pts.index++; + } else { + unsigned int num_contig_lg2; +start_oa: + /* + * If the caller requested an last that falls within a + * single entry then the entire entry is unmapped and + * the length returned will be larger than requested. + */ + num_contig_lg2 = pt_entry_num_contig_lg2(&pts); + pt_clear_entries(&pts, num_contig_lg2); + num_oas += log2_to_int(num_contig_lg2); + pts.index += log2_to_int(num_contig_lg2); + } + if (pts.index >= pts.end_index) + break; + pts.type = pt_load_entry_raw(&pts); + } while (true); + + unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts)); + flush_writes_range(&pts, start_index, pts.index); + + return ret; +} + +/** + * unmap_pages() - Make a range of IOVA empty/not present + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_pages() will remove a translation created by map_pages(). It cannot + * subdivide a mapping created by map_pages(), so it should be called with IOVA + * ranges that match those passed to map_pages(). The IOVA range can aggregate + * contiguous map_pages() calls so long as no individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the point + * unmapping stopped. 
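+ *
+ * Illustrative calling pattern (normally driven by iommu_unmap() in the
+ * core):
+ *
+ *	struct iommu_iotlb_gather gather;
+ *
+ *	iommu_iotlb_gather_init(&gather);
+ *	unmapped = domain->ops->unmap_pages(domain, iova, pgsize, pgcount,
+ *					    &gather);
+ *	iommu_iotlb_sync(domain, &gather);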
+ */ +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( + unmap.free_list) }; + pt_vaddr_t len = pgsize * pgcount; + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, len); + if (ret) + return 0; + + pt_walk_range(&range, __unmap_range, &unmap); + + gather_range_pages(iotlb_gather, iommu_table, iova, len, + &unmap.free_list); + + return unmap.unmapped; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); + +static void NS(get_info)(struct pt_iommu *iommu_table, + struct pt_iommu_info *info) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_top_range(common); + struct pt_state pts = pt_init_top(&range); + pt_vaddr_t pgsize_bitmap = 0; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) { + for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL; + pts.level++) { + if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) + break; + pgsize_bitmap |= pt_possible_sizes(&pts); + } + } else { + for (pts.level = 0; pts.level <= range.top_level; pts.level++) + pgsize_bitmap |= pt_possible_sizes(&pts); + } + + /* Hide page sizes larger than the maximum OA */ + info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2); +} + +static void NS(deinit)(struct pt_iommu *iommu_table) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + /* + * The driver has to already have fenced the HW access to the page table + * and invalidated any caching referring to this memory. + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&collect.free_list, + iommu_table->iommu_device); + iommu_put_pages_list(&collect.free_list); +} + +static const struct pt_iommu_ops NS(ops) = { +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ + IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) + .set_dirty = NS(set_dirty), +#endif + .get_info = NS(get_info), + .deinit = NS(deinit), +}; + +static int pt_init_common(struct pt_common *common) +{ + struct pt_range top_range = pt_top_range(common); + + if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL)) + return -EINVAL; + + if (top_range.top_level == PT_MAX_TOP_LEVEL || + common->max_vasz_lg2 == top_range.max_vasz_lg2) + common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP); + + if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2) + common->features |= BIT(PT_FEAT_FULL_VA); + + /* Requested features must match features compiled into this format */ + if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) || + (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) && + (common->features & PT_FORCE_ENABLED_FEATURES) != + PT_FORCE_ENABLED_FEATURES)) + return -EOPNOTSUPP; + + /* + * Check if the top level of the page table is too small to hold the + * specified maxvasz. 
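+	 * For instance, an x86_64-style format given a fixed level-3 top
+	 * (48 bits of VA decode) together with max_vasz_lg2 = 57 and no
+	 * PT_FEAT_DYNAMIC_TOP is rejected here with -EOPNOTSUPP instead of
+	 * silently truncating the VA space.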
+ */ + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + top_range.top_level != PT_MAX_TOP_LEVEL) { + struct pt_state pts = { .range = &top_range, + .level = top_range.top_level }; + + if (common->max_vasz_lg2 > + pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts)) + return -EOPNOTSUPP; + } + + if (common->max_oasz_lg2 == 0) + common->max_oasz_lg2 = pt_max_oa_lg2(common); + else + common->max_oasz_lg2 = min(common->max_oasz_lg2, + pt_max_oa_lg2(common)); + return 0; +} + +static int pt_iommu_init_domain(struct pt_iommu *iommu_table, + struct iommu_domain *domain) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_iommu_info info; + struct pt_range range; + + NS(get_info)(iommu_table, &info); + + domain->type = __IOMMU_DOMAIN_PAGING; + domain->pgsize_bitmap = info.pgsize_bitmap; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + range = _pt_top_range(common, + _pt_top_set(NULL, PT_MAX_TOP_LEVEL)); + else + range = pt_top_range(common); + + /* A 64-bit high address space table on a 32-bit system cannot work. */ + domain->geometry.aperture_start = (unsigned long)range.va; + if ((pt_vaddr_t)domain->geometry.aperture_start != range.va) + return -EOVERFLOW; + + /* + * The aperture is limited to what the API can do after considering all + * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used + * to store a VA. Set the aperture to something that is valid for all + * cases. Saturate instead of truncate the end if the types are smaller + * than the top range. aperture_end should be called aperture_last. + */ + domain->geometry.aperture_end = (unsigned long)range.last_va; + if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) { + domain->geometry.aperture_end = ULONG_MAX; + domain->pgsize_bitmap &= ULONG_MAX; + } + domain->geometry.force_aperture = true; + + return 0; +} + +static void pt_iommu_zero(struct pt_iommu_table *fmt_table) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_iommu cfg = *iommu_table; + + static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0); + memset_after(fmt_table, 0, iommu.domain); + + /* The caller can initialize some of these values */ + iommu_table->iommu_device = cfg.iommu_device; + iommu_table->driver_ops = cfg.driver_ops; + iommu_table->nid = cfg.nid; +} + +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg) +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init) + +int pt_iommu_init(struct pt_iommu_table *fmt_table, + const struct pt_iommu_table_cfg *cfg, gfp_t gfp) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_table_p *table_mem; + int ret; + + if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 || + !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2) + return -EINVAL; + + pt_iommu_zero(fmt_table); + common->features = cfg->common.features; + common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2; + common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2; + ret = pt_iommu_fmt_init(fmt_table, cfg); + if (ret) + return ret; + + if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common)) + return -EINVAL; + + ret = pt_init_common(common); + if (ret) + return ret; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + WARN_ON(!iommu_table->driver_ops || + !iommu_table->driver_ops->change_top || + !iommu_table->driver_ops->get_top_lock)) + return -EINVAL; + + if (pt_feature(common, PT_FEAT_SIGN_EXTEND) && + (pt_feature(common, PT_FEAT_FULL_VA) || + pt_feature(common, 
PT_FEAT_DYNAMIC_TOP))) + return -EINVAL; + + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + WARN_ON(!iommu_table->iommu_device)) + return -EINVAL; + + ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain); + if (ret) + return ret; + + table_mem = table_alloc_top(common, common->top_of_table, gfp, + ALLOC_NORMAL); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + pt_top_set(common, table_mem, pt_top_get_level(common)); + + /* Must be last, see pt_iommu_deinit() */ + iommu_table->ops = &NS(ops); + return 0; +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU"); + +#ifdef pt_iommu_fmt_hw_info +#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info) +#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info) +void pt_iommu_hw_info(struct pt_iommu_table *fmt_table, + struct pt_iommu_table_hw_info *info) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range top_range = pt_top_range(common); + + pt_iommu_fmt_hw_info(fmt_table, &top_range, info); +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU"); +#endif + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW)); +MODULE_IMPORT_NS("GENERIC_PT"); +/* For iommu_dirty_bitmap_record() */ +MODULE_IMPORT_NS("IOMMUFD"); + +#endif /* __GENERIC_PT_IOMMU_PT_H */ diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h new file mode 100644 index 000000000000..68278bf15cfe --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_generic_pt.h @@ -0,0 +1,823 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Test the format API directly. + * + */ +#include "kunit_iommu.h" +#include "pt_iter.h" + +static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + int ret; + + KUNIT_ASSERT_EQ(test, len, (size_t)len); + + ret = iommu_map(&priv->domain, va, pa, len, IOMMU_READ | IOMMU_WRITE, + GFP_KERNEL); + KUNIT_ASSERT_NO_ERRNO_FN(test, "map_pages", ret); +} + +#define KUNIT_ASSERT_PT_LOAD(test, pts, entry) \ + ({ \ + pt_load_entry(pts); \ + KUNIT_ASSERT_EQ(test, (pts)->type, entry); \ + }) + +struct check_levels_arg { + struct kunit *test; + void *fn_arg; + void (*fn)(struct kunit *test, struct pt_state *pts, void *arg); +}; + +static int __check_all_levels(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct check_levels_arg *chk = arg; + struct kunit *test = chk->test; + int ret; + + _pt_iter_first(&pts); + + + /* + * If we were able to use the full VA space this should always be the + * last index in each table. 
+ */ + if (!(IS_32BIT && range->max_vasz_lg2 > 32)) { + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND) && + pts.level == pts.range->top_level) + KUNIT_ASSERT_EQ(test, pts.index, + log2_to_int(range->max_vasz_lg2 - 1 - + pt_table_item_lg2sz(&pts)) - + 1); + else + KUNIT_ASSERT_EQ(test, pts.index, + log2_to_int(pt_table_oa_lg2sz(&pts) - + pt_table_item_lg2sz(&pts)) - + 1); + } + + if (pt_can_have_table(&pts)) { + pt_load_single_entry(&pts); + KUNIT_ASSERT_EQ(test, pts.type, PT_ENTRY_TABLE); + ret = pt_descend(&pts, arg, __check_all_levels); + KUNIT_ASSERT_EQ(test, ret, 0); + + /* Index 0 is used by the test */ + if (IS_32BIT && !pts.index) + return 0; + KUNIT_ASSERT_NE(chk->test, pts.index, 0); + } + + /* + * A format should not create a table with only one entry, at least this + * test approach won't work. + */ + KUNIT_ASSERT_GT(chk->test, pts.end_index, 1); + + /* + * For increase top we end up using index 0 for the original top's tree, + * so use index 1 for testing instead. + */ + pts.index = 0; + pt_index_to_va(&pts); + pt_load_single_entry(&pts); + if (pts.type == PT_ENTRY_TABLE && pts.end_index > 2) { + pts.index = 1; + pt_index_to_va(&pts); + } + (*chk->fn)(chk->test, &pts, chk->fn_arg); + return 0; +} + +/* + * Call fn for each level in the table with a pts setup to index 0 in a table + * for that level. This allows writing tests that run on every level. + * The test can use every index in the table except the last one. + */ +static void check_all_levels(struct kunit *test, + void (*fn)(struct kunit *test, + struct pt_state *pts, void *arg), + void *fn_arg) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct check_levels_arg chk = { + .test = test, + .fn = fn, + .fn_arg = fn_arg, + }; + int ret; + + if (pt_feature(priv->common, PT_FEAT_DYNAMIC_TOP) && + priv->common->max_vasz_lg2 > range.max_vasz_lg2) + range.last_va = fvalog2_set_mod_max(range.va, + priv->common->max_vasz_lg2); + + /* + * Map a page at the highest VA, this will populate all the levels so we + * can then iterate over them. Index 0 will be used for testing. + */ + if (IS_32BIT && range.max_vasz_lg2 > 32) + range.last_va = (u32)range.last_va; + range.va = range.last_va - (priv->smallest_pgsz - 1); + do_map(test, range.va, 0, priv->smallest_pgsz); + + range = pt_make_range(priv->common, range.va, range.last_va); + ret = pt_walk_range(&range, __check_all_levels, &chk); + KUNIT_ASSERT_EQ(test, ret, 0); +} + +static void test_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + /* Fixture does the setup */ + KUNIT_ASSERT_NE(test, priv->info.pgsize_bitmap, 0); +} + +/* + * Basic check that the log2_* functions are working, especially at the integer + * limits. 
+ */ +static void test_bitops(struct kunit *test) +{ + int i; + + KUNIT_ASSERT_EQ(test, fls_t(u32, 0), 0); + KUNIT_ASSERT_EQ(test, fls_t(u32, 1), 1); + KUNIT_ASSERT_EQ(test, fls_t(u32, BIT(2)), 3); + KUNIT_ASSERT_EQ(test, fls_t(u32, U32_MAX), 32); + + KUNIT_ASSERT_EQ(test, fls_t(u64, 0), 0); + KUNIT_ASSERT_EQ(test, fls_t(u64, 1), 1); + KUNIT_ASSERT_EQ(test, fls_t(u64, BIT(2)), 3); + KUNIT_ASSERT_EQ(test, fls_t(u64, U64_MAX), 64); + + KUNIT_ASSERT_EQ(test, ffs_t(u32, 1), 0); + KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(2)), 2); + KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(31)), 31); + + KUNIT_ASSERT_EQ(test, ffs_t(u64, 1), 0); + KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT(2)), 2); + KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT_ULL(63)), 63); + + for (i = 0; i != 31; i++) + KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i); + + for (i = 0; i != 63; i++) + KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i); + + for (i = 0; i != 32; i++) { + u64 val = get_random_u64(); + + KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffs_t(u32, val)), 0); + KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffs_t(u64, val)), 0); + + KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffz_t(u32, val)), + log2_to_max_int_t(u32, ffz_t(u32, val))); + KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffz_t(u64, val)), + log2_to_max_int_t(u64, ffz_t(u64, val))); + } +} + +static unsigned int ref_best_pgsize(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va, + pt_vaddr_t last_va, pt_oaddr_t oa) +{ + pt_vaddr_t pgsz_lg2; + + /* Brute force the constraints described in pt_compute_best_pgsize() */ + for (pgsz_lg2 = PT_VADDR_MAX_LG2 - 1; pgsz_lg2 != 0; pgsz_lg2--) { + if ((pgsz_bitmap & log2_to_int(pgsz_lg2)) && + log2_mod(va, pgsz_lg2) == 0 && + oalog2_mod(oa, pgsz_lg2) == 0 && + va + log2_to_int(pgsz_lg2) - 1 <= last_va && + log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2) && + oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)) + return pgsz_lg2; + } + return 0; +} + +/* Check that the bit logic in pt_compute_best_pgsize() works. 
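+ * It cross-checks pt_compute_best_pgsize() against the brute-force
+ * ref_best_pgsize() above for random bitmap/VA/OA combinations, for all-zero
+ * and all-ones prefixes, for an empty bitmap, and (on 64-bit) for page sizes
+ * above 32 bits.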
*/ +static void test_best_pgsize(struct kunit *test) +{ + unsigned int a_lg2; + unsigned int b_lg2; + unsigned int c_lg2; + + /* Try random prefixes with every suffix combination */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } + + /* 0 prefix, every suffix */ + for (c_lg2 = 1; c_lg2 != PT_VADDR_MAX_LG2 - 1; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = 0; + pt_oaddr_t oa = 0; + pt_vaddr_t last_va = log2_set_mod_max(0, c_lg2); + + KUNIT_ASSERT_EQ(test, + pt_compute_best_pgsize(pgsz_bitmap, va, last_va, + oa), + ref_best_pgsize(pgsz_bitmap, va, last_va, oa)); + } + + /* 1's prefix, every suffix */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = PT_VADDR_MAX << a_lg2; + pt_oaddr_t oa = PT_VADDR_MAX << b_lg2; + pt_vaddr_t last_va = PT_VADDR_MAX; + + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } + + /* pgsize_bitmap is always 0 */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = 0; + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + 0); + } + } + } + + if (sizeof(pt_vaddr_t) <= 4) + return; + + /* over 32 bit page sizes */ + for (a_lg2 = 32; a_lg2 != 42; a_lg2++) { + for (b_lg2 = 32; b_lg2 != 42; b_lg2++) { + for (c_lg2 = 32; c_lg2 != 42; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } +} + +/* + * Check that pt_install_table() and pt_table_pa() match + */ +static void test_lvl_table_ptr(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_oaddr_t paddr = + log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2); + struct pt_write_attrs attrs = {}; + + if (!pt_can_have_table(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + pt_load_single_entry(pts); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + + KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs)); + + /* A second install should pass because install updates pts->entry. 
*/ + KUNIT_ASSERT_EQ(test, pt_install_table(pts, paddr, &attrs), true); + + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_TABLE); + KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr); + + pt_clear_entries(pts, ilog2(1)); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); +} + +static void test_table_ptr(struct kunit *test) +{ + check_all_levels(test, test_lvl_table_ptr, NULL); +} + +struct lvl_radix_arg { + pt_vaddr_t vbits; +}; + +/* + * Check pt_table_oa_lg2sz() and pt_table_item_lg2sz() they need to decode a + * continuous list of VA across all the levels that covers the entire advertised + * VA space. + */ +static void test_lvl_radix(struct kunit *test, struct pt_state *pts, void *arg) +{ + unsigned int table_lg2sz = pt_table_oa_lg2sz(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct lvl_radix_arg *radix = arg; + + /* Every bit below us is decoded */ + KUNIT_ASSERT_EQ(test, log2_set_mod_max(0, isz_lg2), radix->vbits); + + /* We are not decoding bits someone else is */ + KUNIT_ASSERT_EQ(test, log2_div(radix->vbits, isz_lg2), 0); + + /* Can't decode past the pt_vaddr_t size */ + KUNIT_ASSERT_LE(test, table_lg2sz, PT_VADDR_MAX_LG2); + KUNIT_ASSERT_EQ(test, fvalog2_div(table_lg2sz, PT_MAX_VA_ADDRESS_LG2), + 0); + + radix->vbits = fvalog2_set_mod_max(0, table_lg2sz); +} + +static void test_max_va(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + + KUNIT_ASSERT_GE(test, priv->common->max_vasz_lg2, range.max_vasz_lg2); +} + +static void test_table_radix(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct lvl_radix_arg radix = { .vbits = priv->smallest_pgsz - 1 }; + struct pt_range range; + + check_all_levels(test, test_lvl_radix, &radix); + + range = pt_top_range(priv->common); + if (range.max_vasz_lg2 == PT_VADDR_MAX_LG2) { + KUNIT_ASSERT_EQ(test, radix.vbits, PT_VADDR_MAX); + } else { + if (!IS_32BIT) + KUNIT_ASSERT_EQ(test, + log2_set_mod_max(0, range.max_vasz_lg2), + radix.vbits); + KUNIT_ASSERT_EQ(test, log2_div(radix.vbits, range.max_vasz_lg2), + 0); + } +} + +static unsigned int safe_pt_num_items_lg2(const struct pt_state *pts) +{ + struct pt_range top_range = pt_top_range(pts->range->common); + struct pt_state top_pts = pt_init_top(&top_range); + + /* + * Avoid calling pt_num_items_lg2() on the top, instead we can derive + * the size of the top table from the top range. + */ + if (pts->level == top_range.top_level) + return ilog2(pt_range_to_end_index(&top_pts)); + return pt_num_items_lg2(pts); +} + +static void test_lvl_possible_sizes(struct kunit *test, struct pt_state *pts, + void *arg) +{ + unsigned int num_items_lg2 = safe_pt_num_items_lg2(pts); + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!pt_can_have_leaf(pts)) { + KUNIT_ASSERT_EQ(test, pgsize_bitmap, 0); + return; + } + + /* No bits for sizes that would be outside this table */ + KUNIT_ASSERT_EQ(test, log2_mod(pgsize_bitmap, isz_lg2), 0); + KUNIT_ASSERT_EQ( + test, fvalog2_div(pgsize_bitmap, num_items_lg2 + isz_lg2), 0); + + /* + * Non contiguous must be supported. AMDv1 has a HW bug where it does + * not support it on one of the levels. 
+ */ + if ((u64)pgsize_bitmap != 0xff0000000000ULL || + strcmp(__stringify(PTPFX_RAW), "amdv1") != 0) + KUNIT_ASSERT_TRUE(test, pgsize_bitmap & log2_to_int(isz_lg2)); + else + KUNIT_ASSERT_NE(test, pgsize_bitmap, 0); + + /* A contiguous entry should not span the whole table */ + if (num_items_lg2 + isz_lg2 != PT_VADDR_MAX_LG2) + KUNIT_ASSERT_FALSE( + test, + pgsize_bitmap & log2_to_int(num_items_lg2 + isz_lg2)); +} + +static void test_entry_possible_sizes(struct kunit *test) +{ + check_all_levels(test, test_lvl_possible_sizes, NULL); +} + +static void sweep_all_pgsizes(struct kunit *test, struct pt_state *pts, + struct pt_write_attrs *attrs, + pt_oaddr_t test_oaddr) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + unsigned int len_lg2; + + if (pts->index != 0) + return; + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) { + struct pt_state sub_pts = *pts; + pt_oaddr_t oaddr; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + oaddr = log2_set_mod(test_oaddr, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, attrs); + /* Verify that every contiguous item translates correctly */ + for (sub_pts.index = 0; + sub_pts.index != log2_to_int(len_lg2 - isz_lg2); + sub_pts.index++) { + KUNIT_ASSERT_PT_LOAD(test, &sub_pts, PT_ENTRY_OA); + KUNIT_ASSERT_EQ(test, pt_item_oa(&sub_pts), + oaddr + sub_pts.index * + oalog2_mul(1, isz_lg2)); + KUNIT_ASSERT_EQ(test, pt_entry_oa(&sub_pts), oaddr); + KUNIT_ASSERT_EQ(test, pt_entry_num_contig_lg2(&sub_pts), + len_lg2 - isz_lg2); + } + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + } +} + +/* + * Check that pt_install_leaf_entry() and pt_entry_oa() match. + * Check that pt_clear_entries() works. 
+ */ +static void test_lvl_entry_oa(struct kunit *test, struct pt_state *pts, + void *arg) +{ + unsigned int max_oa_lg2 = pts->range->common->max_oasz_lg2; + struct kunit_iommu_priv *priv = test->priv; + struct pt_write_attrs attrs = {}; + + if (!pt_can_have_leaf(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + sweep_all_pgsizes(test, pts, &attrs, priv->test_oa); + + /* Check that the table can store the boundary OAs */ + sweep_all_pgsizes(test, pts, &attrs, 0); + if (max_oa_lg2 == PT_OADDR_MAX_LG2) + sweep_all_pgsizes(test, pts, &attrs, PT_OADDR_MAX); + else + sweep_all_pgsizes(test, pts, &attrs, + oalog2_to_max_int(max_oa_lg2)); +} + +static void test_entry_oa(struct kunit *test) +{ + check_all_levels(test, test_lvl_entry_oa, NULL); +} + +/* Test pt_attr_from_entry() */ +static void test_lvl_attr_from_entry(struct kunit *test, struct pt_state *pts, + void *arg) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct kunit_iommu_priv *priv = test->priv; + unsigned int len_lg2; + unsigned int prot; + + if (!pt_can_have_leaf(pts)) + return; + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) { + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + for (prot = 0; prot <= (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE | + IOMMU_NOEXEC | IOMMU_MMIO); + prot++) { + pt_oaddr_t oaddr; + struct pt_write_attrs attrs = {}; + u64 good_entry; + + /* + * If the format doesn't support this combination of + * prot bits skip it + */ + if (pt_iommu_set_prot(pts->range->common, &attrs, + prot)) { + /* But RW has to be supported */ + KUNIT_ASSERT_NE(test, prot, + IOMMU_READ | IOMMU_WRITE); + continue; + } + + oaddr = log2_set_mod(priv->test_oa, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + good_entry = pts->entry; + + memset(&attrs, 0, sizeof(attrs)); + pt_attr_from_entry(pts, &attrs); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + /* + * The descriptor produced by pt_attr_from_entry() + * produce an identical entry value when re-written + */ + KUNIT_ASSERT_EQ(test, good_entry, pts->entry); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + } + } +} + +static void test_attr_from_entry(struct kunit *test) +{ + check_all_levels(test, test_lvl_attr_from_entry, NULL); +} + +static void test_lvl_dirty(struct kunit *test, struct pt_state *pts, void *arg) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct kunit_iommu_priv *priv = test->priv; + unsigned int start_idx = pts->index; + struct pt_write_attrs attrs = {}; + unsigned int len_lg2; + + if (!pt_can_have_leaf(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ | IOMMU_WRITE)); + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) { + pt_oaddr_t oaddr; + unsigned int i; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + oaddr = log2_set_mod(priv->test_oa, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + pt_load_entry(pts); + pt_entry_make_write_clean(pts); + pt_load_entry(pts); + KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts)); + + for (i = 
0; i != log2_to_int(len_lg2 - isz_lg2); i++) { + /* dirty every contiguous entry */ + pts->index = start_idx + i; + pt_load_entry(pts); + KUNIT_ASSERT_TRUE(test, pt_entry_make_write_dirty(pts)); + pts->index = start_idx; + pt_load_entry(pts); + KUNIT_ASSERT_TRUE(test, pt_entry_is_write_dirty(pts)); + + pt_entry_make_write_clean(pts); + pt_load_entry(pts); + KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts)); + } + + pt_clear_entries(pts, len_lg2 - isz_lg2); + } +} + +static __maybe_unused void test_dirty(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!pt_dirty_supported(priv->common)) + kunit_skip(test, + "Page table features do not support dirty tracking"); + + check_all_levels(test, test_lvl_dirty, NULL); +} + +static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct pt_write_attrs attrs = {}; + unsigned int len_lg2; + + if (!pt_can_have_leaf(pts)) + return; + if (pts->index != 0) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2); + struct pt_write_attrs new_attrs = {}; + unsigned int bitnr; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + pt_install_leaf_entry(pts, paddr, len_lg2, &attrs); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) + KUNIT_ASSERT_FALSE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) { + KUNIT_ASSERT_FALSE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + pt_set_sw_bit_release(pts, bitnr); + KUNIT_ASSERT_TRUE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + } + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) + KUNIT_ASSERT_TRUE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + + KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr); + + /* SW bits didn't leak into the attrs */ + pt_attr_from_entry(pts, &new_attrs); + KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs)); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + } +} + +static __maybe_unused void test_sw_bit_leaf(struct kunit *test) +{ + check_all_levels(test, test_lvl_sw_bit_leaf, NULL); +} + +static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_write_attrs attrs = {}; + pt_oaddr_t paddr = + log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2); + unsigned int bitnr; + + if (!pt_can_have_leaf(pts)) + return; + if (pts->index != 0) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) + KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) { + KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr)); + pt_set_sw_bit_release(pts, bitnr); + KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr)); + } + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) + KUNIT_ASSERT_TRUE(test, 
pt_test_sw_bit_acquire(pts, bitnr)); + + KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr); + + pt_clear_entries(pts, ilog2(1)); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); +} + +static __maybe_unused void test_sw_bit_table(struct kunit *test) +{ + check_all_levels(test, test_lvl_sw_bit_table, NULL); +} + +static struct kunit_case generic_pt_test_cases[] = { + KUNIT_CASE_FMT(test_init), + KUNIT_CASE_FMT(test_bitops), + KUNIT_CASE_FMT(test_best_pgsize), + KUNIT_CASE_FMT(test_table_ptr), + KUNIT_CASE_FMT(test_max_va), + KUNIT_CASE_FMT(test_table_radix), + KUNIT_CASE_FMT(test_entry_possible_sizes), + KUNIT_CASE_FMT(test_entry_oa), + KUNIT_CASE_FMT(test_attr_from_entry), +#ifdef pt_entry_is_write_dirty + KUNIT_CASE_FMT(test_dirty), +#endif +#ifdef pt_sw_bit + KUNIT_CASE_FMT(test_sw_bit_leaf), + KUNIT_CASE_FMT(test_sw_bit_table), +#endif + {}, +}; + +static int pt_kunit_generic_pt_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv; + int ret; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + ret = pt_kunit_priv_init(test, priv); + if (ret) { + kunit_kfree(test, priv); + return ret; + } + test->priv = priv; + return 0; +} + +static void pt_kunit_generic_pt_exit(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!test->priv) + return; + + pt_iommu_deinit(priv->iommu); + kunit_kfree(test, test->priv); +} + +static struct kunit_suite NS(generic_pt_suite) = { + .name = __stringify(NS(fmt_test)), + .init = pt_kunit_generic_pt_init, + .exit = pt_kunit_generic_pt_exit, + .test_cases = generic_pt_test_cases, +}; +kunit_test_suites(&NS(generic_pt_suite)); diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h new file mode 100644 index 000000000000..22c9e4c4dd97 --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_iommu.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_KUNIT_IOMMU_H +#define __GENERIC_PT_KUNIT_IOMMU_H + +#define GENERIC_PT_KUNIT 1 +#include <kunit/device.h> +#include <kunit/test.h> +#include "../iommu-pages.h" +#include "pt_iter.h" + +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg) +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init) +int pt_iommu_init(struct pt_iommu_table *fmt_table, + const struct pt_iommu_table_cfg *cfg, gfp_t gfp); + +/* The format can provide a list of configurations it would like to test */ +#ifdef kunit_fmt_cfgs +static const void *kunit_pt_gen_params_cfg(struct kunit *test, const void *prev, + char *desc) +{ + uintptr_t cfg_id = (uintptr_t)prev; + + cfg_id++; + if (cfg_id >= ARRAY_SIZE(kunit_fmt_cfgs) + 1) + return NULL; + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s_cfg_%u", + __stringify(PTPFX_RAW), (unsigned int)(cfg_id - 1)); + return (void *)cfg_id; +} +#define KUNIT_CASE_FMT(test_name) \ + KUNIT_CASE_PARAM(test_name, kunit_pt_gen_params_cfg) +#else +#define KUNIT_CASE_FMT(test_name) KUNIT_CASE(test_name) +#endif + +#define KUNIT_ASSERT_NO_ERRNO(test, ret) \ + KUNIT_ASSERT_EQ_MSG(test, ret, 0, KUNIT_SUBSUBTEST_INDENT "errno %pe", \ + ERR_PTR(ret)) + +#define KUNIT_ASSERT_NO_ERRNO_FN(test, fn, ret) \ + KUNIT_ASSERT_EQ_MSG(test, ret, 0, \ + KUNIT_SUBSUBTEST_INDENT "errno %pe from %s", \ + ERR_PTR(ret), fn) + +/* + * When the test is run on a 32 bit system unsigned long can be 32 bits. This + * cause the iommu op signatures to be restricted to 32 bits. 
Meaning the test + * has to be mindful not to create any VA's over the 32 bit limit. Reduce the + * scope of the testing as the main purpose of checking on full 32 bit is to + * look for 32bitism in the core code. Run the test on i386 with X86_PAE=y to + * get the full coverage when dma_addr_t & phys_addr_t are 8 bytes + */ +#define IS_32BIT (sizeof(unsigned long) == 4) + +struct kunit_iommu_priv { + union { + struct iommu_domain domain; + struct pt_iommu_table fmt_table; + }; + spinlock_t top_lock; + struct device *dummy_dev; + struct pt_iommu *iommu; + struct pt_common *common; + struct pt_iommu_table_cfg cfg; + struct pt_iommu_info info; + unsigned int smallest_pgsz_lg2; + pt_vaddr_t smallest_pgsz; + unsigned int largest_pgsz_lg2; + pt_oaddr_t test_oa; + pt_vaddr_t safe_pgsize_bitmap; + unsigned long orig_nr_secondary_pagetable; + +}; +PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain); + +static void pt_kunit_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + iommu_put_pages_list(&gather->freelist); +} + +#define IOMMU_PT_DOMAIN_OPS1(x) IOMMU_PT_DOMAIN_OPS(x) +static const struct iommu_domain_ops kunit_pt_ops = { + IOMMU_PT_DOMAIN_OPS1(PTPFX_RAW), + .iotlb_sync = &pt_kunit_iotlb_sync, +}; + +static void pt_kunit_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level) +{ +} + +static spinlock_t *pt_kunit_get_top_lock(struct pt_iommu *iommu_table) +{ + struct kunit_iommu_priv *priv = container_of( + iommu_table, struct kunit_iommu_priv, fmt_table.iommu); + + return &priv->top_lock; +} + +static const struct pt_iommu_driver_ops pt_kunit_driver_ops = { + .change_top = &pt_kunit_change_top, + .get_top_lock = &pt_kunit_get_top_lock, +}; + +static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv) +{ + unsigned int va_lg2sz; + int ret; + + /* Enough so the memory allocator works */ + priv->dummy_dev = kunit_device_register(test, "pt_kunit_dev"); + if (IS_ERR(priv->dummy_dev)) + return PTR_ERR(priv->dummy_dev); + set_dev_node(priv->dummy_dev, NUMA_NO_NODE); + + spin_lock_init(&priv->top_lock); + +#ifdef kunit_fmt_cfgs + priv->cfg = kunit_fmt_cfgs[((uintptr_t)test->param_value) - 1]; + /* + * The format can set a list of features that the kunit_fmt_cfgs + * controls, other features are default to on. + */ + priv->cfg.common.features |= PT_SUPPORTED_FEATURES & + (~KUNIT_FMT_FEATURES); +#else + priv->cfg.common.features = PT_SUPPORTED_FEATURES; +#endif + + /* Defaults, for the kunit */ + if (!priv->cfg.common.hw_max_vasz_lg2) + priv->cfg.common.hw_max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2; + if (!priv->cfg.common.hw_max_oasz_lg2) + priv->cfg.common.hw_max_oasz_lg2 = pt_max_oa_lg2(NULL); + + priv->fmt_table.iommu.nid = NUMA_NO_NODE; + priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops; + priv->fmt_table.iommu.iommu_device = priv->dummy_dev; + priv->domain.ops = &kunit_pt_ops; + ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL); + if (ret) { + if (ret == -EOVERFLOW) + kunit_skip( + test, + "This configuration cannot be tested on 32 bit"); + return ret; + } + + priv->iommu = &priv->fmt_table.iommu; + priv->common = common_from_iommu(&priv->fmt_table.iommu); + priv->iommu->ops->get_info(priv->iommu, &priv->info); + + /* + * size_t is used to pass the mapping length, it can be 32 bit, truncate + * the pagesizes so we don't use large sizes. 
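+ * Casting the bitmap to size_t drops any page size of 4GiB or larger when
+ * size_t is 32 bits.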
+ */ + priv->info.pgsize_bitmap = (size_t)priv->info.pgsize_bitmap; + + priv->smallest_pgsz_lg2 = vaffs(priv->info.pgsize_bitmap); + priv->smallest_pgsz = log2_to_int(priv->smallest_pgsz_lg2); + priv->largest_pgsz_lg2 = + vafls((dma_addr_t)priv->info.pgsize_bitmap) - 1; + + priv->test_oa = + oalog2_mod(0x74a71445deadbeef, priv->common->max_oasz_lg2); + + /* + * We run out of VA space if the mappings get too big, make something + * smaller that can safely pass through dma_addr_t API. + */ + va_lg2sz = priv->common->max_vasz_lg2; + if (IS_32BIT && va_lg2sz > 32) + va_lg2sz = 32; + priv->safe_pgsize_bitmap = + log2_mod(priv->info.pgsize_bitmap, va_lg2sz - 1); + + return 0; +} + +#endif diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h new file mode 100644 index 000000000000..e8a63c8ea850 --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "kunit_iommu.h" +#include "pt_iter.h" +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len); + +struct count_valids { + u64 per_size[PT_VADDR_MAX_LG2]; +}; + +static int __count_valids(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct count_valids *valids = arg; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + pt_descend(&pts, arg, __count_valids); + continue; + } + if (pts.type == PT_ENTRY_OA) { + valids->per_size[pt_entry_oa_lg2sz(&pts)]++; + continue; + } + } + return 0; +} + +/* + * Number of valid table entries. This counts contiguous entries as a single + * valid. 
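+ * __count_valids() buckets by pt_entry_oa_lg2sz(), so a contiguous entry is
+ * counted once under its full entry size rather than once per item.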
+ */ +static unsigned int count_valids(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) + total += valids.per_size[i]; + return total; +} + +/* Only a single page size is present, count the number of valid entries */ +static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) { + if ((1ULL << i) == pgsz) + total = valids.per_size[i]; + else + KUNIT_ASSERT_EQ(test, valids.per_size[i], 0); + } + return total; +} + +static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + size_t ret; + + ret = iommu_unmap(&priv->domain, va, len); + KUNIT_ASSERT_EQ(test, ret, len); +} + +static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2); + pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2); + + for (; pfn != end_pfn; pfn++) { + phys_addr_t res = iommu_iova_to_phys(&priv->domain, + pfn * priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa); + if (res != pa) + break; + pa += priv->smallest_pgsz; + } +} + +static void test_increase_level(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_common *common = priv->common; + + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format"); + + if (IS_32BIT) + kunit_skip(test, "Unable to test on 32bit"); + + KUNIT_ASSERT_GT(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + + /* Add every possible level to the max */ + while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) { + struct pt_range top_range = pt_top_range(common); + + if (top_range.va == 0) + do_map(test, top_range.last_va + 1, 0, + priv->smallest_pgsz); + else + do_map(test, top_range.va - priv->smallest_pgsz, 0, + priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level, + top_range.top_level + 1); + KUNIT_ASSERT_GE(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + } +} + +static void test_map_simple(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + pt_vaddr_t cur_va; + + /* Map every reported page size */ + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + + cur_va = ALIGN(cur_va, len); + do_map(test, cur_va, paddr, len); + if (len <= SZ_2G) + check_iova(test, cur_va, paddr, len); + cur_va += len; + } + + /* The read interface reports that every page size was created */ + range = pt_top_range(priv->common); + 
KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + if (pgsize_bitmap & (1ULL << pgsz_lg2)) + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1); + else + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0); + } + + /* Unmap works */ + range = pt_top_range(priv->common); + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + cur_va = ALIGN(cur_va, len); + do_unmap(test, cur_va, len); + cur_va += len; + } + KUNIT_ASSERT_EQ(test, count_valids(test), 0); +} + +/* + * Test to convert a table pointer into an OA by mapping something small, + * unmapping it so as to leave behind a table pointer, then mapping something + * larger that will convert the table into an OA. + */ +static void test_map_table_to_oa(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t limited_pgbitmap = + priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G); + struct pt_range range = pt_top_range(priv->common); + unsigned int pgsz_lg2; + pt_vaddr_t max_pgsize; + pt_vaddr_t cur_va; + + max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1); + KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize); + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + pt_vaddr_t offset; + + if (!(priv->info.pgsize_bitmap & len)) + continue; + if (len > max_pgsize) + break; + + cur_va = ALIGN(range.va + priv->smallest_pgsz * 256, + max_pgsize); + for (offset = 0; offset != max_pgsize; offset += len) + do_map(test, cur_va + offset, paddr + offset, len); + check_iova(test, cur_va, paddr, max_pgsize); + KUNIT_ASSERT_EQ(test, count_valids_single(test, len), + log2_div(max_pgsize, pgsz_lg2)); + + if (len == max_pgsize) { + do_unmap(test, cur_va, max_pgsize); + } else { + do_unmap(test, cur_va, max_pgsize / 2); + for (offset = max_pgsize / 2; offset != max_pgsize; + offset += len) + do_unmap(test, cur_va + offset, len); + } + + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + } +} + +/* + * Test unmapping a small page at the start of a large page. This always unmaps + * the large page. 
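+ * iommu_unmap() is expected to report the size of the large page it actually
+ * removed, not the length that was requested.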
+ */ +static void test_unmap_split(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + unsigned int count = 0; + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_vaddr_t base_len = log2_to_int(pgsz_lg2); + unsigned int next_pgsz_lg2; + + if (!(pgsize_bitmap & base_len)) + continue; + + for (next_pgsz_lg2 = pgsz_lg2 + 1; + next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) { + pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2); + pt_vaddr_t vaddr = top_range.va; + pt_oaddr_t paddr = 0; + size_t gnmapped; + + if (!(pgsize_bitmap & next_len)) + continue; + + do_map(test, vaddr, paddr, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr, base_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + + /* Make sure unmap doesn't keep going */ + do_map(test, vaddr, paddr, next_len); + do_map(test, vaddr + next_len, paddr, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr, base_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr + next_len, + next_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + + count++; + } + } + + if (count == 0) + kunit_skip(test, "Test needs two page sizes"); +} + +static void unmap_collisions(struct kunit *test, struct maple_tree *mt, + pt_vaddr_t start, pt_vaddr_t last) +{ + struct kunit_iommu_priv *priv = test->priv; + MA_STATE(mas, mt, start, last); + void *entry; + + mtree_lock(mt); + mas_for_each(&mas, entry, last) { + pt_vaddr_t mas_start = mas.index; + pt_vaddr_t len = (mas.last - mas_start) + 1; + pt_oaddr_t paddr; + + mas_erase(&mas); + mas_pause(&mas); + mtree_unlock(mt); + + paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2); + check_iova(test, mas_start, paddr, len); + do_unmap(test, mas_start, len); + mtree_lock(mt); + } + mtree_unlock(mt); +} + +static void clamp_range(struct kunit *test, struct pt_range *range) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (range->last_va - range->va > SZ_1G) + range->last_va = range->va + SZ_1G; + KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX); + if (range->va <= MAPLE_RESERVED_RANGE) + range->va = + ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz); +} + +/* + * Randomly map and unmap ranges that can large physical pages. If a random + * range overlaps with existing ranges then unmap them. This hits all the + * special cases. 
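+ * A maple tree tracks the ranges that are currently mapped so that colliding
+ * requests (-EADDRINUSE) can be unmapped and then retried.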
+ */ +static void test_random_map(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range upper_range = pt_upper_range(priv->common); + struct pt_range top_range = pt_top_range(priv->common); + struct maple_tree mt; + unsigned int iter; + + mt_init(&mt); + + /* + * Shrink the range so randomization is more likely to have + * intersections + */ + clamp_range(test, &top_range); + clamp_range(test, &upper_range); + + for (iter = 0; iter != 1000; iter++) { + struct pt_range *range = &top_range; + pt_oaddr_t paddr; + pt_vaddr_t start; + pt_vaddr_t end; + int ret; + + if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) && + ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1)) + range = &upper_range; + + start = get_random_u32_below( + min(U32_MAX, range->last_va - range->va)); + end = get_random_u32_below( + min(U32_MAX, range->last_va - start)); + + start = ALIGN_DOWN(start, priv->smallest_pgsz); + end = ALIGN(end, priv->smallest_pgsz); + start += range->va; + end += start; + if (start < range->va || end > range->last_va + 1 || + start >= end) + continue; + + /* Try overmapping to test the failure handling */ + paddr = oalog2_mod(start, priv->common->max_oasz_lg2); + ret = iommu_map(&priv->domain, start, paddr, end - start, + IOMMU_READ | IOMMU_WRITE, GFP_KERNEL); + if (ret) { + KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE); + unmap_collisions(test, &mt, start, end - 1); + do_map(test, start, paddr, end - start); + } + + KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range", + mtree_insert_range(&mt, start, end - 1, + XA_ZERO_ENTRY, + GFP_KERNEL)); + + check_iova(test, start, paddr, end - start); + if (iter % 100) + cond_resched(); + } + + unmap_collisions(test, &mt, 0, PT_VADDR_MAX); + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + + mtree_destroy(&mt); +} + +/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */ +static void test_pgsize_boundary(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + + if (top_range.va != 0 || top_range.last_va < 0xfef9ffff || + priv->smallest_pgsz != SZ_4K) + kunit_skip(test, "Format does not have the required range"); + + do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1); +} + +/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */ +static void test_mixed(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + u64 start = 0x3fe400ULL << 12; + u64 end = 0x4c0600ULL << 12; + pt_vaddr_t len = end - start; + pt_oaddr_t oa = start; + + if (top_range.last_va <= start || sizeof(unsigned long) == 4) + kunit_skip(test, "range is too small"); + if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21))) + kunit_skip(test, "incompatible psize"); + + do_map(test, start, oa, len); + /* 14 2M, 3 1G, 3 2M */ + KUNIT_ASSERT_EQ(test, count_valids(test), 20); + check_iova(test, start, oa, len); +} + +static struct kunit_case iommu_test_cases[] = { + KUNIT_CASE_FMT(test_increase_level), + KUNIT_CASE_FMT(test_map_simple), + KUNIT_CASE_FMT(test_map_table_to_oa), + KUNIT_CASE_FMT(test_unmap_split), + KUNIT_CASE_FMT(test_random_map), + KUNIT_CASE_FMT(test_pgsize_boundary), + KUNIT_CASE_FMT(test_mixed), + {}, +}; + +static int pt_kunit_iommu_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv; + int ret; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + 
priv->orig_nr_secondary_pagetable = + global_node_page_state(NR_SECONDARY_PAGETABLE); + ret = pt_kunit_priv_init(test, priv); + if (ret) { + kunit_kfree(test, priv); + return ret; + } + test->priv = priv; + return 0; +} + +static void pt_kunit_iommu_exit(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!test->priv) + return; + + pt_iommu_deinit(priv->iommu); + /* + * Look for memory leaks, assumes kunit is running isolated and nothing + * else is using secondary page tables. + */ + KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable, + global_node_page_state(NR_SECONDARY_PAGETABLE)); + kunit_kfree(test, test->priv); +} + +static struct kunit_suite NS(iommu_suite) = { + .name = __stringify(NS(iommu_test)), + .init = pt_kunit_iommu_init, + .exit = pt_kunit_iommu_exit, + .test_cases = iommu_test_cases, +}; +kunit_test_suites(&NS(iommu_suite)); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Kunit for generic page table"); +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h new file mode 100644 index 000000000000..e1123d35c907 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_common.h @@ -0,0 +1,389 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included after the format. It contains definitions + * that build on the format definitions to create the basic format API. + * + * The format API is listed here, with kdocs. The functions without bodies are + * implemented in the format using the pattern: + * static inline FMTpt_XXX(..) {..} + * #define pt_XXX FMTpt_XXX + * + * If the format doesn't implement a function then pt_fmt_defaults.h can provide + * a generic version. + * + * The routines marked "@pts: Entry to query" operate on the entire contiguous + * entry and can be called with a pts->index pointing to any sub item that makes + * up that entry. + * + * The header order is: + * pt_defs.h + * FMT.h + * pt_common.h + */ +#ifndef __GENERIC_PT_PT_COMMON_H +#define __GENERIC_PT_PT_COMMON_H + +#include "pt_defs.h" +#include "pt_fmt_defaults.h" + +/** + * pt_attr_from_entry() - Convert the permission bits back to attrs + * @pts: Entry to convert from + * @attrs: Resulting attrs + * + * Fill in the attrs with the permission bits encoded in the current leaf entry. + * The attrs should be usable with pt_install_leaf_entry() to reconstruct the + * same entry. + */ +static inline void pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs); + +/** + * pt_can_have_leaf() - True if the current level can have an OA entry + * @pts: The current level + * + * True if the current level can support pt_install_leaf_entry(). A leaf + * entry produce an OA. + */ +static inline bool pt_can_have_leaf(const struct pt_state *pts); + +/** + * pt_can_have_table() - True if the current level can have a lower table + * @pts: The current level + * + * Every level except 0 is allowed to have a lower table. + */ +static inline bool pt_can_have_table(const struct pt_state *pts) +{ + /* No further tables at level 0 */ + return pts->level > 0; +} + +/** + * pt_clear_entries() - Make entries empty (non-present) + * @pts: Starting table index + * @num_contig_lg2: Number of contiguous items to clear + * + * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY + * and does not have any effect on table walking. The starting index must be + * aligned to num_contig_lg2. 
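+ * For example, clearing a 16 item contiguous entry passes num_contig_lg2 == 4
+ * and requires pts->index to be a multiple of 16.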
+ */ +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2); + +/** + * pt_entry_make_write_dirty() - Make an entry dirty + * @pts: Table entry to change + * + * Make pt_entry_is_write_dirty() return true for this entry. This can be called + * asynchronously with any other table manipulation under a RCU lock and must + * not corrupt the table. + */ +static inline bool pt_entry_make_write_dirty(struct pt_state *pts); + +/** + * pt_entry_make_write_clean() - Make the entry write clean + * @pts: Table entry to change + * + * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will + * eventually be notified of this change via a TLB flush, which is the point + * that the HW must become synchronized. Any "write dirty" prior to the TLB + * flush can be lost, but once the TLB flush completes all writes must make + * their entries write dirty. + * + * The format should alter the entry in a way that is compatible with any + * concurrent update from HW. The entire contiguous entry is changed. + */ +static inline void pt_entry_make_write_clean(struct pt_state *pts); + +/** + * pt_entry_is_write_dirty() - True if the entry has been written to + * @pts: Entry to query + * + * "write dirty" means that the HW has written to the OA translated + * by this entry. If the entry is contiguous then the consolidated + * "write dirty" for all the items must be returned. + */ +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts); + +/** + * pt_dirty_supported() - True if the page table supports dirty tracking + * @common: Page table to query + */ +static inline bool pt_dirty_supported(struct pt_common *common); + +/** + * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry + * @pts: Entry to query + * + * Return the number of contiguous items this leaf entry spans. If the entry + * is single item it returns ilog2(1). + */ +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts); + +/** + * pt_entry_oa() - Output Address for this leaf entry + * @pts: Entry to query + * + * Return the output address for the start of the entry. If the entry + * is contiguous this returns the same value for each sub-item. I.e.:: + * + * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0 + * + * See pt_item_oa(). The format should implement one of these two functions + * depending on how it stores the OAs in the table. + */ +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts); + +/** + * pt_entry_oa_lg2sz() - Return the size of an OA entry + * @pts: Entry to query + * + * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise + * it returns the total VA/OA size of the entire contiguous entry. + */ +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts) +{ + return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts); +} + +/** + * pt_entry_oa_exact() - Return the complete OA for an entry + * @pts: Entry to query + * + * During iteration the first entry could have a VA with an offset from the + * natural start of the entry. Return the exact OA including the pts's VA + * offset. + */ +static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts) +{ + return _pt_entry_oa_fast(pts) | + log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts)); +} + +/** + * pt_full_va_prefix() - The top bits of the VA + * @common: Page table to query + * + * This is usually 0, but some formats have their VA space going downward from + * PT_VADDR_MAX, and will return that instead. 
This value must always be + * adjusted by struct pt_common max_vasz_lg2. + */ +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common); + +/** + * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry + * @common: Page table to query + * + * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT). + * This is useful to create optimized paths for common cases of PAGE_SIZE + * mappings. + */ +static inline bool pt_has_system_page_size(const struct pt_common *common); + +/** + * pt_install_leaf_entry() - Write a leaf entry to the table + * @pts: Table index to change + * @oa: Output Address for this leaf + * @oasz_lg2: Size in VA/OA for this leaf + * @attrs: Attributes to modify the entry + * + * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates + * the VA indicated by pts to the given OA. + * + * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz(). + * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2. + * + * This must not be called if pt_can_have_leaf() == false. Contiguous sizes + * not indicated by pt_possible_sizes() must not be specified. + */ +static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs); + +/** + * pt_install_table() - Write a table entry to the table + * @pts: Table index to change + * @table_pa: CPU physical address of the lower table's memory + * @attrs: Attributes to modify the table index + * + * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa + * is the table at pts->level - 1. This is done by cmpxchg so pts must have the + * current entry loaded. The pts is updated with the installed entry. + * + * This must not be called if pt_can_have_table() == false. + * + * Returns: true if the table was installed successfully. + */ +static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs); + +/** + * pt_item_oa() - Output Address for this leaf item + * @pts: Item to query + * + * Return the output address for this item. If the item is part of a contiguous + * entry it returns the value of the OA for this individual sub item. + * + * See pt_entry_oa(). The format should implement one of these two functions + * depending on how it stores the OA's in the table. + */ +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts); + +/** + * pt_load_entry_raw() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Return the type of entry that was loaded. pts->entry will be filled in with + * the entry's content. See pt_load_entry() + */ +static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts); + +/** + * pt_max_oa_lg2() - Return the maximum OA the table format can hold + * @common: Page table to query + * + * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the + * OA. This is the absolute maximum address the table can hold. struct pt_common + * max_oasz_lg2 sets a lower dynamic maximum based on HW capability. + */ +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common); + +/** + * pt_num_items_lg2() - Return the number of items in this table level + * @pts: The current level + * + * The number of items in a table level defines the number of bits this level + * decodes from the VA. This function is not called for the top level, + * so it does not need to compute a special value for the top case. 
The + * result for the top is based on pt_common max_vasz_lg2. + * + * The value is used as part of determining the table indexes via the + * equation:: + * + * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2()) + */ +static inline unsigned int pt_num_items_lg2(const struct pt_state *pts); + +/** + * pt_pgsz_lg2_to_level - Return the level that maps the page size + * @common: Page table to query + * @pgsize_lg2: Log2 page size + * + * Returns the table level that will map the given page size. The page + * size must be part of the pt_possible_sizes() for some level. + */ +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2); + +/** + * pt_possible_sizes() - Return a bitmap of possible output sizes at this level + * @pts: The current level + * + * Each level has a list of possible output sizes that can be installed as + * leaf entries. If pt_can_have_leaf() is false returns zero. + * + * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating + * that a non-contiguous single item leaf entry is supported. The following + * pt_num_items_lg2() number of bits can be set indicating contiguous entries + * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be + * set, contiguous entries cannot span the entire table. + * + * The OR of pt_possible_sizes() of all levels is the typical bitmask of all + * supported sizes in the entire table. + */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts); + +/** + * pt_table_item_lg2sz() - Size of a single item entry in this table level + * @pts: The current level + * + * The size of the item specifies how much VA and OA a single item occupies. + * + * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous + * entries. + */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +/** + * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table + * @pts: The current level + * + * Return the size of VA decoded by the entire table level. + */ +static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts) +{ + if (pts->range->top_level == pts->level) + return pts->range->max_vasz_lg2; + return min_t(unsigned int, pts->range->common->max_vasz_lg2, + pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts)); +} + +/** + * pt_table_pa() - Return the CPU physical address of the table entry + * @pts: Entry to query + * + * This is only ever called on PT_ENTRY_TABLE entries. Must return the same + * value passed to pt_install_table(). + */ +static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts); + +/** + * pt_table_ptr() - Return a CPU pointer for a table item + * @pts: Entry to query + * + * Same as pt_table_pa() but returns a CPU pointer. + */ +static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts) +{ + return __va(pt_table_pa(pts)); +} + +/** + * pt_max_sw_bit() - Return the maximum software bit usable for any level and + * entry + * @common: Page table + * + * The swbit can be passed as bitnr to the other sw_bit functions. + */ +static inline unsigned int pt_max_sw_bit(struct pt_common *common); + +/** + * pt_test_sw_bit_acquire() - Read a software bit in an item + * @pts: Entry to read + * @bitnr: Bit to read + * + * Software bits are ignored by HW and can be used for any purpose by the + * software. This does a test bit and acquire operation. 
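+ * This pairs with pt_set_sw_bit_release(); the acquire ordering makes writes
+ * done before the release visible once the bit reads as set.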
+ */ +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr); + +/** + * pt_set_sw_bit_release() - Set a software bit in an item + * @pts: Entry to set + * @bitnr: Bit to set + * + * Software bits are ignored by HW and can be used for any purpose by the + * software. This does a set bit and release operation. + */ +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr); + +/** + * pt_load_entry() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Set the type of entry that was loaded. pts->entry and pts->table_lower + * will be filled in with the entry's content. + */ +static inline void pt_load_entry(struct pt_state *pts) +{ + pts->type = pt_load_entry_raw(pts); + if (pts->type == PT_ENTRY_TABLE) + pts->table_lower = pt_table_ptr(pts); +} +#endif diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h new file mode 100644 index 000000000000..c25544d72f97 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_defs.h @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included before the format. It contains definitions + * that are required to compile the format. The header order is: + * pt_defs.h + * fmt_XX.h + * pt_common.h + */ +#ifndef __GENERIC_PT_DEFS_H +#define __GENERIC_PT_DEFS_H + +#include <linux/generic_pt/common.h> + +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/bits.h> +#include <linux/limits.h> +#include <linux/bug.h> +#include <linux/kconfig.h> +#include "pt_log2.h" + +/* Header self-compile default defines */ +#ifndef pt_write_attrs +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; +#endif + +struct pt_table_p; + +enum { + PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32, + PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32, +}; + +/* + * The format instantiation can have features wired off or on to optimize the + * code gen. Supported features are just a reflection of what the current set of + * kernel users want to use. + */ +#ifndef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES 0 +#endif + +/* + * When in debug mode we compile all formats with all features. This allows the + * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or + * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +enum { + PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES, + PT_DEBUG_SUPPORTED_FEATURES = + UINT_MAX & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ? + 0 : + BIT(PT_FEAT_DMA_INCOHERENT))) & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ? + BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) : + BIT(PT_FEAT_SIGN_EXTEND)), +}; +#undef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES +#endif + +#ifndef PT_FORCE_ENABLED_FEATURES +#define PT_FORCE_ENABLED_FEATURES 0 +#endif + +/** + * DOC: Generic Page Table Language + * + * Language used in Generic Page Table + * VA + * The input address to the page table, often the virtual address. + * OA + * The output address from the page table, often the physical address. + * leaf + * An entry that results in an output address. + * start/end + * An half-open range, e.g. [0,0) refers to no VA. 
+ * start/last + * An inclusive closed range, e.g. [0,0] refers to the VA 0 + * common + * The generic page table container struct pt_common + * level + * Level 0 is always a table of only leaves with no futher table pointers. + * Increasing levels increase the size of the table items. The least + * significant VA bits used to index page tables are used to index the Level + * 0 table. The various labels for table levels used by HW descriptions are + * not used. + * top_level + * The inclusive highest level of the table. A two-level table + * has a top level of 1. + * table + * A linear array of translation items for that level. + * index + * The position in a table of an element: item = table[index] + * item + * A single index in a table + * entry + * A single logical element in a table. If contiguous pages are not + * supported then item and entry are the same thing, otherwise entry refers + * to all the items that comprise a single contiguous translation. + * item/entry_size + * The number of bytes of VA the table index translates for. + * If the item is a table entry then the next table covers + * this size. If the entry translates to an output address then the + * full OA is: OA | (VA % entry_size) + * contig_count + * The number of consecutive items fused into a single entry. + * item_size * contig_count is the size of that entry's translation. + * lg2 + * Indicates the value is encoded as log2, i.e. 1<<x is the actual value. + * Normally the compiler is fine to optimize divide and mod with log2 values + * automatically when inlining, however if the values are not constant + * expressions it can't. So we do it by hand; we want to avoid 64-bit + * divmod. + */ + +/* Returned by pt_load_entry() and for_each_pt_level_entry() */ +enum pt_entry_type { + PT_ENTRY_EMPTY, + /* Entry is valid and points to a lower table level */ + PT_ENTRY_TABLE, + /* Entry is valid and returns an output address */ + PT_ENTRY_OA, +}; + +struct pt_range { + struct pt_common *common; + struct pt_table_p *top_table; + pt_vaddr_t va; + pt_vaddr_t last_va; + u8 top_level; + u8 max_vasz_lg2; +}; + +/* + * Similar to xa_state, this records information about an in-progress parse at a + * single level. + */ +struct pt_state { + struct pt_range *range; + struct pt_table_p *table; + struct pt_table_p *table_lower; + u64 entry; + enum pt_entry_type type; + unsigned short index; + unsigned short end_index; + u8 level; +}; + +#define pt_cur_table(pts, type) ((type *)((pts)->table)) + +/* + * Try to install a new table pointer. The locking methodology requires this to + * be atomic (multiple threads can race to install a pointer). The losing + * threads will fail the atomic and return false. They should free any memory + * and reparse the table level again. + */ +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry) +{ + u64 *entryp = pt_cur_table(pts, u64) + pts->index; + u64 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. 
+ */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg64_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} +#endif + +static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry) +{ + u32 *entryp = pt_cur_table(pts, u32) + pts->index; + u32 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. + */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} + +#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr)) + +static inline bool pt_feature(const struct pt_common *common, + unsigned int feature_nr) +{ + if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr)) + return true; + if (!PT_SUPPORTED_FEATURE(feature_nr)) + return false; + return common->features & BIT(feature_nr); +} + +static inline bool pts_feature(const struct pt_state *pts, + unsigned int feature_nr) +{ + return pt_feature(pts->range->common, feature_nr); +} + +/* + * PT_WARN_ON is used for invariants that the kunit should be checking can't + * happen. + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +#define PT_WARN_ON WARN_ON +#else +static inline bool PT_WARN_ON(bool condition) +{ + return false; +} +#endif + +/* These all work on the VA type */ +#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2) +#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2) +#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2) +#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2) +#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2) +#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2) +#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2) +#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2) +#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2) +#define vaffs(a) ffs_t(pt_vaddr_t, a) +#define vafls(a) fls_t(pt_vaddr_t, a) +#define vaffz(a) ffz_t(pt_vaddr_t, a) + +/* + * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and + * generate a useful defined result. The non-fva versions will malfunction at + * this extreme. 
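+ * e.g. when PT_FEAT_FULL_VA is supported, fvalog2_mod(a, PT_VADDR_MAX_LG2)
+ * simply returns a, whereas log2_mod() would end up shifting by the full
+ * width of pt_vaddr_t.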
+ */ +static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return 0; + return log2_div_t(pt_vaddr_t, a, b_lg2); +} + +static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return a; + return log2_mod_t(pt_vaddr_t, a, b_lg2); +} + +static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b, + unsigned int c_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2) + return true; + return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val, + unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return val; + return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return PT_VADDR_MAX; + return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2); +} + +/* These all work on the OA type */ +#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2) +#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2) +#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2) +#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2) +#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2) +#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2) +#define oaffs(a) ffs_t(pt_oaddr_t, a) +#define oafls(a) fls_t(pt_oaddr_t, a) +#define oaffz(a) ffz_t(pt_oaddr_t, a) + +static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem, + unsigned int top_level) +{ + return top_level | (uintptr_t)table_mem; +} + +static inline void pt_top_set(struct pt_common *common, + struct pt_table_p *table_mem, + unsigned int top_level) +{ + WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level)); +} + +static inline void pt_top_set_level(struct pt_common *common, + unsigned int top_level) +{ + pt_top_set(common, NULL, top_level); +} + +static inline unsigned int pt_top_get_level(const struct pt_common *common) +{ + return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS); +} + +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2); + +#endif diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h new file mode 100644 index 000000000000..69fb7c2314ca --- /dev/null +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Default definitions for formats that don't define these functions. + */ +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H +#define __GENERIC_PT_PT_FMT_DEFAULTS_H + +#include "pt_defs.h" +#include <linux/log2.h> + +/* Header self-compile default defines */ +#ifndef pt_load_entry_raw +#include "fmt/amdv1.h" +#endif + +/* + * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and + * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top. 
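+ * The default below derives the item size from them: each level adds
+ * PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE) bits of VA on top of
+ * PT_GRANULE_LG2SZ.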
+ */ +#ifndef pt_table_item_lg2sz +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts) +{ + return PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level; +} +#endif + +#ifndef pt_pgsz_lg2_to_level +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2) +{ + return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) / + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)); +} +#endif + +/* + * If not supplied by the format then contiguous pages are not supported. + * + * If contiguous pages are supported then the format must also provide + * pt_contig_count_lg2() if it supports a single contiguous size per level, + * or pt_possible_sizes() if it supports multiple sizes per level. + */ +#ifndef pt_entry_num_contig_lg2 +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} + +/* + * Return the number of contiguous OA items forming an entry at this table level + */ +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} +#endif + +/* If not supplied by the format then dirty tracking is not supported */ +#ifndef pt_entry_is_write_dirty +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts) +{ + return false; +} + +static inline void pt_entry_make_write_clean(struct pt_state *pts) +{ +} + +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return false; +} +#else +/* If not supplied then dirty tracking is always enabled */ +#ifndef pt_dirty_supported +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return true; +} +#endif +#endif + +#ifndef pt_entry_make_write_dirty +static inline bool pt_entry_make_write_dirty(struct pt_state *pts) +{ + return false; +} +#endif + +/* + * Format supplies either: + * pt_entry_oa - OA is at the start of a contiguous entry + * or + * pt_item_oa - OA is adjusted for every item in a contiguous entry + * + * Build the missing one + * + * The internal helper _pt_entry_oa_fast() allows generating + * an efficient pt_entry_oa_exact(), it doesn't care which + * option is selected. + */ +#ifdef pt_entry_oa +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts) +{ + return pt_entry_oa(pts) | + log2_mul(pts->index, pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_entry_oa +#endif + +#ifdef pt_item_oa +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts) +{ + return log2_set_mod(pt_item_oa(pts), 0, + pt_entry_num_contig_lg2(pts) + + pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_item_oa +#endif + +/* + * If not supplied by the format then use the constant + * PT_MAX_OUTPUT_ADDRESS_LG2. 
+ */ +#ifndef pt_max_oa_lg2 +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common) +{ + return PT_MAX_OUTPUT_ADDRESS_LG2; +} +#endif + +#ifndef pt_has_system_page_size +static inline bool pt_has_system_page_size(const struct pt_common *common) +{ + return PT_GRANULE_LG2SZ == PAGE_SHIFT; +} +#endif + +/* + * If not supplied by the format then assume only one contiguous size determined + * by pt_contig_count_lg2() + */ +#ifndef pt_possible_sizes +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts); + +/* Return a bitmap of possible leaf page sizes at this level */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!pt_can_have_leaf(pts)) + return 0; + return log2_to_int(isz_lg2) | + log2_to_int(pt_contig_count_lg2(pts) + isz_lg2); +} +#endif + +/* If not supplied by the format then use 0. */ +#ifndef pt_full_va_prefix +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common) +{ + return 0; +} +#endif + +/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */ +#ifndef pt_clear_entries +static inline void pt_clear_entries64(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries32(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u32 *tablep = pt_cur_table(pts, u32) + pts->index; + u32 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + if (PT_ITEM_WORD_SIZE == sizeof(u32)) + pt_clear_entries32(pts, num_contig_lg2); + else + pt_clear_entries64(pts, num_contig_lg2); +} +#define pt_clear_entries pt_clear_entries +#endif + +/* If not supplied then SW bits are not supported */ +#ifdef pt_sw_bit +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr) +{ + /* Acquire, pairs with pt_set_sw_bit_release() */ + smp_mb(); + /* For a contiguous entry the sw bit is only stored in the first item. 
*/ + return pts->entry & pt_sw_bit(bitnr); +} +#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire + +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr) +{ +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) + if (PT_ITEM_WORD_SIZE == sizeof(u64)) { + u64 *entryp = pt_cur_table(pts, u64) + pts->index; + u64 old_entry = pts->entry; + u64 new_entry; + + do { + new_entry = old_entry | pt_sw_bit(bitnr); + } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry)); + pts->entry = new_entry; + return; + } +#endif + if (PT_ITEM_WORD_SIZE == sizeof(u32)) { + u32 *entryp = pt_cur_table(pts, u32) + pts->index; + u32 old_entry = pts->entry; + u32 new_entry; + + do { + new_entry = old_entry | pt_sw_bit(bitnr); + } while (!try_cmpxchg_release(entryp, &old_entry, new_entry)); + pts->entry = new_entry; + } else + BUILD_BUG(); +} +#define pt_set_sw_bit_release pt_set_sw_bit_release +#else +static inline unsigned int pt_max_sw_bit(struct pt_common *common) +{ + return 0; +} + +extern void __pt_no_sw_bit(void); +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr) +{ + __pt_no_sw_bit(); + return false; +} + +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr) +{ + __pt_no_sw_bit(); +} +#endif + +/* + * Format can call in the pt_install_leaf_entry() to check the arguments are all + * aligned correctly. + */ +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2))) + return false; + +#ifdef pt_possible_sizes + if (PT_WARN_ON(isz_lg2 > oasz_lg2 || + oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts))) + return false; +#else + if (PT_WARN_ON(oasz_lg2 != isz_lg2 && + oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts))) + return false; +#endif + + if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2))) + return false; + return true; +} + +#endif diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h new file mode 100644 index 000000000000..c0d8617cce29 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -0,0 +1,636 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Iterators for Generic Page Table + */ +#ifndef __GENERIC_PT_PT_ITER_H +#define __GENERIC_PT_PT_ITER_H + +#include "pt_common.h" + +#include <linux/errno.h> + +/* + * Use to mangle symbols so that backtraces and the symbol table are + * understandable. Any non-inlined function should get mangled like this. + */ +#define NS(fn) CONCATENATE(PTPFX, fn) + +/** + * pt_check_range() - Validate the range can be iterated + * @range: Range to validate + * + * Check that VA and last_va fall within the permitted range of VAs. If the + * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension + * is correct. + */ +static inline int pt_check_range(struct pt_range *range) +{ + pt_vaddr_t prefix; + + PT_WARN_ON(!range->max_vasz_lg2); + + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) { + PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2); + prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ? 
+ PT_VADDR_MAX : + 0; + } else { + prefix = pt_full_va_prefix(range->common); + } + + if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) || + !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2)) + return -ERANGE; + return 0; +} + +/** + * pt_index_to_va() - Update range->va to the current pts->index + * @pts: Iteration State + * + * Adjust range->va to match the current index. This is done in a lazy manner + * since computing the VA takes several instructions and is rarely required. + */ +static inline void pt_index_to_va(struct pt_state *pts) +{ + pt_vaddr_t lower_va; + + lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts)); + pts->range->va = fvalog2_set_mod(pts->range->va, lower_va, + pt_table_oa_lg2sz(pts)); +} + +/* + * Add index_count_lg2 number of entries to pts's VA and index. The VA will be + * adjusted to the end of the contiguous block if it is currently in the middle. + */ +static inline void _pt_advance(struct pt_state *pts, + unsigned int index_count_lg2) +{ + pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0, + index_count_lg2); +} + +/** + * pt_entry_fully_covered() - Check if the item or entry is entirely contained + * within pts->range + * @pts: Iteration State + * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or + * pt_entry_oa_lg2sz() + * + * Returns: true if the item is fully enclosed by the pts->range. + */ +static inline bool pt_entry_fully_covered(const struct pt_state *pts, + unsigned int oasz_lg2) +{ + struct pt_range *range = pts->range; + + /* Range begins at the start of the entry */ + if (log2_mod(pts->range->va, oasz_lg2)) + return false; + + /* Range ends past the end of the entry */ + if (!log2_div_eq(range->va, range->last_va, oasz_lg2)) + return true; + + /* Range ends at the end of the entry */ + return log2_mod_eq_max(range->last_va, oasz_lg2); +} + +/** + * pt_range_to_index() - Starting index for an iteration + * @pts: Iteration State + * + * Return: the starting index for the iteration in pts. + */ +static inline unsigned int pt_range_to_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + PT_WARN_ON(pts->level > pts->range->top_level); + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->va, + pts->range->max_vasz_lg2), + isz_lg2); + return log2_mod(log2_div(pts->range->va, isz_lg2), + pt_num_items_lg2(pts)); +} + +/** + * pt_range_to_end_index() - Ending index iteration + * @pts: Iteration State + * + * Return: the last index for the iteration in pts. 
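As a concrete illustration of the index math above (the numbers are hypothetical and assume a non-top level with 4KiB items and a 9-bit stride): for range->va = 0x203000 and range->last_va = 0x205fff, pt_range_to_index() yields (0x203000 >> 12) & 511 = 3 and pt_range_to_end_index() yields ((0x205fff >> 12) & 511) + 1 = 6, one past the last index visited, so the iteration covers indexes 3, 4 and 5 of that table.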
+ */ +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct pt_range *range = pts->range; + unsigned int num_entries_lg2; + + if (range->va == range->last_va) + return pts->index + 1; + + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->last_va, + pts->range->max_vasz_lg2), + isz_lg2) + + 1; + + num_entries_lg2 = pt_num_items_lg2(pts); + + /* last_va falls within this table */ + if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2)) + return log2_mod(log2_div(pts->range->last_va, isz_lg2), + num_entries_lg2) + + 1; + + return log2_to_int(num_entries_lg2); +} + +static inline void _pt_iter_first(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pts->end_index = pt_range_to_end_index(pts); + PT_WARN_ON(pts->index > pts->end_index); +} + +static inline bool _pt_iter_load(struct pt_state *pts) +{ + if (pts->index >= pts->end_index) + return false; + pt_load_entry(pts); + return true; +} + +/** + * pt_next_entry() - Advance pts to the next entry + * @pts: Iteration State + * + * Update pts to go to the next index at this level. If pts is pointing at a + * contiguous entry then the index may advance by more than one. + */ +static inline void pt_next_entry(struct pt_state *pts) +{ + if (pts->type == PT_ENTRY_OA && + !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0)) + _pt_advance(pts, pt_entry_num_contig_lg2(pts)); + else + pts->index++; + pt_index_to_va(pts); +} + +/** + * for_each_pt_level_entry() - For loop wrapper over entries in the range + * @pts: Iteration State + * + * This is the basic iteration primitive. It iterates over all the entries in + * pts->range that fall within the pts's current table level. Each step does + * pt_load_entry(pts). + */ +#define for_each_pt_level_entry(pts) \ + for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts)) + +/** + * pt_load_single_entry() - Version of pt_load_entry() usable within a walker + * @pts: Iteration State + * + * Alternative to for_each_pt_level_entry() if the walker function uses only a + * single entry. + */ +static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pt_load_entry(pts); + return pts->type; +} + +static __always_inline struct pt_range _pt_top_range(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = { + .common = common, + .top_table = + (struct pt_table_p *)(top_of_table & + ~(uintptr_t)PT_TOP_LEVEL_MASK), + .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS), + }; + struct pt_state pts = { .range = &range, .level = range.top_level }; + unsigned int max_vasz_lg2; + + max_vasz_lg2 = common->max_vasz_lg2; + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + pts.level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2, + pt_num_items_lg2(&pts) + + pt_table_item_lg2sz(&pts)); + + /* + * The top range will default to the lower region only with sign extend. + */ + range.max_vasz_lg2 = max_vasz_lg2; + if (pt_feature(common, PT_FEAT_SIGN_EXTEND)) + max_vasz_lg2--; + + range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2); + range.last_va = + fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2); + return range; +} + +/** + * pt_top_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the + * total page table.
Otherwise it returns the entire page table. + */ +static __always_inline struct pt_range pt_top_range(struct pt_common *common) +{ + /* + * The top pointer can change without locking. We capture the value and + * its level here and are safe to walk it so long as both values are + * captured without tearing. + */ + return _pt_top_range(common, READ_ONCE(common->top_of_table)); +} + +/** + * pt_all_range() - Return a range that spans the entire page table + * @common: Table + * + * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND + * is supported, range->va and range->last_va will be incorrect during the + * iteration and must not be accessed. + */ +static inline struct pt_range pt_all_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + /* + * Pretend the table is linear from 0 without a sign extension. This + * generates the correct indexes for iteration. + */ + range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2); + return range; +} + +/** + * pt_upper_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the + * total page table. Otherwise it returns the entire page table. + */ +static inline struct pt_range pt_upper_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1); + range.last_va = PT_VADDR_MAX; + return range; +} + +/** + * pt_make_range() - Return a range that spans part of the table + * @common: Table + * @va: Start address + * @last_va: Last address + * + * The caller must validate the range with pt_check_range() before using it. + */ +static __always_inline struct pt_range +pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va) +{ + struct pt_range range = + _pt_top_range(common, READ_ONCE(common->top_of_table)); + + range.va = va; + range.last_va = last_va; + + return range; +} + +/* + * Span a slice of the table starting at a lower table level from an active + * walk. + */ +static __always_inline struct pt_range +pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va, + pt_vaddr_t last_va) +{ + struct pt_range range = *parent; + + range.va = va; + range.last_va = last_va; + + PT_WARN_ON(last_va < va); + PT_WARN_ON(pt_check_range(&range)); + + return range; +} + +/** + * pt_init() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * @level: Table level for the state + * @table: Pointer to the table memory at level + * + * Helper to initialize the on-stack pt_state from walker arguments. + */ +static __always_inline struct pt_state +pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = { + .range = range, + .table = table, + .level = level, + }; + return pts; +} + +/** + * pt_init_top() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * + * The pt_state points to the top most level.
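To make the three range constructors above concrete, here is a minimal fragment for a hypothetical format configured with max_vasz_lg2 = 48 and PT_FEAT_SIGN_EXTEND set; common is assumed to be a valid struct pt_common pointer and the exact VA values are illustrative, not taken from the patch.

	/* Lower canonical half: VAs 0x0 through 0x00007fffffffffff */
	struct pt_range lower = pt_top_range(common);

	/* Upper canonical half: VAs 0xffff800000000000 through PT_VADDR_MAX */
	struct pt_range upper = pt_upper_range(common);

	/*
	 * Both halves in a single walk; va/last_va are only internal
	 * iteration bounds here and must not be read as addresses.
	 */
	struct pt_range all = pt_all_range(common);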
+ */ +static __always_inline struct pt_state pt_init_top(struct pt_range *range) +{ + return pt_init(range, range->top_level, range->top_table); +} + +typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table); + +/** + * pt_descend() - Recursively invoke the walker for the lower level + * @pts: Iteration State + * @arg: Value to pass to the function + * @fn: Walker function to call + * + * pts must point to a table item. Invoke fn as a walker on the table + * pts points to. + */ +static __always_inline int pt_descend(struct pt_state *pts, void *arg, + pt_level_fn_t fn) +{ + int ret; + + if (PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower); + return ret; +} + +/** + * pt_walk_range() - Walk over a VA range + * @range: Range pointer + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * Walk over a VA range. The caller should have done a validity check, at + * least calling pt_check_range(), when building range. The walk will + * start at the top most table. + */ +static __always_inline int pt_walk_range(struct pt_range *range, + pt_level_fn_t fn, void *arg) +{ + return fn(range, arg, range->top_level, range->top_table); +} + +/* + * pt_walk_descend() - Recursively invoke the walker for a slice of a lower + * level + * @pts: Iteration State + * @va: Start address + * @last_va: Last address + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and walk over a slice of + * the lower table. The caller must ensure that va/last_va are within the table + * item. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int pt_walk_descend(const struct pt_state *pts, + pt_vaddr_t va, pt_vaddr_t last_va, + pt_level_fn_t fn, void *arg) +{ + struct pt_range range = pt_make_child_range(pts->range, va, last_va); + + if (PT_WARN_ON(!pt_can_have_table(pts)) || + PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + return fn(&range, arg, pts->level - 1, pts->table_lower); +} + +/* + * pt_walk_descend_all() - Recursively invoke the walker for a table item + * @parent_pts: Iteration State + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and walk over the entire + * lower table. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn, + void *arg) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts); + + return pt_walk_descend(parent_pts, + log2_set_mod(parent_pts->range->va, 0, isz_lg2), + log2_set_mod_max(parent_pts->range->va, isz_lg2), + fn, arg); +} + +/** + * pt_range_slice() - Return a range that spans indexes + * @pts: Iteration State + * @start_index: Starting index within pts + * @end_index: Ending index within pts + * + * Create a range that spans an index range of the current table level + * pt_state points at.
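A sketch of how these helpers are meant to be combined by a non-inlined entry point; the names pt_example_op and example_walker are hypothetical and only show the pt_make_range(), pt_check_range() and pt_walk_range() sequence.

	static int pt_example_op(struct pt_common *common, pt_vaddr_t va,
				 pt_vaddr_t len, void *arg)
	{
		struct pt_range range = pt_make_range(common, va, va + len - 1);
		int ret;

		/* Reject VAs outside the table or in the wrong canonical half */
		ret = pt_check_range(&range);
		if (ret)
			return ret;

		/* example_walker would typically be generated by PT_MAKE_LEVELS() */
		return pt_walk_range(&range, example_walker, arg);
	}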
+ */ +static inline struct pt_range pt_range_slice(const struct pt_state *pts, + unsigned int start_index, + unsigned int end_index) +{ + unsigned int table_lg2sz = pt_table_oa_lg2sz(pts); + pt_vaddr_t last_va; + pt_vaddr_t va; + + va = fvalog2_set_mod(pts->range->va, + log2_mul(start_index, pt_table_item_lg2sz(pts)), + table_lg2sz); + last_va = fvalog2_set_mod( + pts->range->va, + log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz); + return pt_make_child_range(pts->range, va, last_va); +} + +/** + * pt_top_memsize_lg2() + * @common: Table + * @top_of_table: Top of table value from _pt_top_set() + * + * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this + * will compute the top size assuming the table will grow. + */ +static inline unsigned int pt_top_memsize_lg2(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = _pt_top_range(common, top_of_table); + struct pt_state pts = pt_init_top(&range); + unsigned int num_items_lg2; + + num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts); + if (range.top_level != PT_MAX_TOP_LEVEL && + pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts)); + + /* Round up the allocation size to the minimum alignment */ + return max(ffs_t(u64, PT_TOP_PHYS_MASK), + num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE)); +} + +/** + * pt_compute_best_pgsize() - Determine the best page size for leaf entries + * @pgsz_bitmap: Permitted page sizes + * @va: Starting virtual address for the leaf entry + * @last_va: Last virtual address for the leaf entry, sets the max page size + * @oa: Starting output address for the leaf entry + * + * Compute the largest page size for va, last_va, and oa together and return it + * in lg2. The largest page size depends on the format's supported page sizes at + * this level, and the relative alignment of the VA and OA addresses. 0 means + * the OA cannot be stored with the provided pgsz_bitmap. + */ +static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, + pt_vaddr_t va, + pt_vaddr_t last_va, + pt_oaddr_t oa) +{ + unsigned int best_pgsz_lg2; + unsigned int pgsz_lg2; + pt_vaddr_t len = last_va - va + 1; + pt_vaddr_t mask; + + if (PT_WARN_ON(va >= last_va)) + return 0; + + /* + * Given a VA/OA pair the best page size is the largest page size + * where: + * + * 1) VA and OA start at the page. Bitwise this is the count of least + * significant 0 bits. + * This also implies that last_va/oa has the same prefix as va/oa. + */ + mask = va | oa; + + /* + * 2) The page size is not larger than the last_va (length). Since page + * sizes are always power of two this can't be larger than the + * largest power of two factor of the length. 
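 * As a worked illustration (editorial example, the values are not from
 * the patch): with pgsz_bitmap = SZ_4K | SZ_2M | SZ_1G, va = 0x200000,
 * last_va = va + SZ_4M - 1 and oa = 0x5400000, va | oa has its lowest
 * set bit at 21 and the 4MiB length contributes bit 22, so the bitmap
 * is clamped to bits <= 21, leaving SZ_4K | SZ_2M, and the function
 * returns 21, i.e. 2MiB leaf entries.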
+ */ + mask |= log2_to_int(vafls(len) - 1); + + best_pgsz_lg2 = vaffs(mask); + + /* Choose the highest bit <= best_pgsz_lg2 */ + if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1) + pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1); + + pgsz_lg2 = vafls(pgsz_bitmap); + if (!pgsz_lg2) + return 0; + + pgsz_lg2--; + + PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0); + PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0); + PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va); + PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + PT_WARN_ON( + !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + return pgsz_lg2; +} + +#define _PT_MAKE_CALL_LEVEL(fn) \ + static __always_inline int fn(struct pt_range *range, void *arg, \ + unsigned int level, \ + struct pt_table_p *table) \ + { \ + static_assert(PT_MAX_TOP_LEVEL <= 5); \ + if (level == 0) \ + return CONCATENATE(fn, 0)(range, arg, 0, table); \ + if (level == 1 || PT_MAX_TOP_LEVEL == 1) \ + return CONCATENATE(fn, 1)(range, arg, 1, table); \ + if (level == 2 || PT_MAX_TOP_LEVEL == 2) \ + return CONCATENATE(fn, 2)(range, arg, 2, table); \ + if (level == 3 || PT_MAX_TOP_LEVEL == 3) \ + return CONCATENATE(fn, 3)(range, arg, 3, table); \ + if (level == 4 || PT_MAX_TOP_LEVEL == 4) \ + return CONCATENATE(fn, 4)(range, arg, 4, table); \ + return CONCATENATE(fn, 5)(range, arg, 5, table); \ + } + +static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg, + unsigned int unused_level, + struct pt_table_p *table) +{ + static_assert(PT_MAX_TOP_LEVEL <= 5); + return -EPROTOTYPE; +} + +#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \ + static inline int fn(struct pt_range *range, void *arg, \ + unsigned int unused_level, \ + struct pt_table_p *table) \ + { \ + return do_fn(range, arg, level, table, descend_fn); \ + } + +/** + * PT_MAKE_LEVELS() - Build an unwound walker + * @fn: Name of the walker function + * @do_fn: Function to call at each level + * + * This builds a function call tree that can be fully inlined. + * The caller must provide a function body in an __always_inline function:: + * + * static __always_inline int do_fn(struct pt_range *range, void *arg, + * unsigned int level, struct pt_table_p *table, + * pt_level_fn_t descend_fn) + * + * An inline function will be created for each table level that calls do_fn with + * a compile time constant for level and a pointer to the next lower function. + * This generates an optimally inlined walk where each of the functions sees a + * constant level and can codegen the exact constants/etc for that level. + * + * Note this can produce a lot of code! 
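To make the pattern concrete, a minimal walker that counts mapped leaf entries could be assembled as sketched below. This is an editorial example, not part of the patch; it assumes the entry type enum also provides PT_ENTRY_TABLE for next-level pointers (PT_ENTRY_OA is the leaf case used above) and that the sketch lives inside a format unit where these helpers are visible.

	struct count_leaves_arg {
		u64 leaves;
	};

	static __always_inline int __count_leaves(struct pt_range *range,
						  void *arg, unsigned int level,
						  struct pt_table_p *table,
						  pt_level_fn_t descend_fn)
	{
		struct count_leaves_arg *count = arg;
		struct pt_state pts = pt_init(range, level, table);
		int ret;

		for_each_pt_level_entry(&pts) {
			if (pts.type == PT_ENTRY_TABLE) {
				/* Recurse; descend_fn has level - 1 baked in */
				ret = pt_descend(&pts, arg, descend_fn);
				if (ret)
					return ret;
			} else if (pts.type == PT_ENTRY_OA) {
				count->leaves++;
			}
		}
		return 0;
	}
	PT_MAKE_LEVELS(count_leaves, __count_leaves);

The generated count_leaves() has the pt_level_fn_t signature, so a caller hands it to pt_walk_range() on a validated pt_range, as in the earlier sketch.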
+ */ +#define PT_MAKE_LEVELS(fn, do_fn) \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \ + do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \ + _PT_MAKE_CALL_LEVEL(fn) + +#endif diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h new file mode 100644 index 000000000000..6dbbed119238 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_log2.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Helper macros for working with log2 values + * + */ +#ifndef __GENERIC_PT_LOG2_H +#define __GENERIC_PT_LOG2_H +#include <linux/bitops.h> +#include <linux/limits.h> + +/* Compute a */ +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2))) +static_assert(log2_to_int_t(unsigned int, 0) == 1); + +/* Compute a - 1 (aka all low bits set) */ +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1)) + +/* Compute a / b */ +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2))) +static_assert(log2_div_t(unsigned int, 4, 2) == 1); + +/* + * Compute: + * a / c == b / c + * aka the high bits are equal + */ +#define log2_div_eq_t(type, a, b, c_lg2) \ + (log2_div_t(type, (a) ^ (b), c_lg2) == 0) +static_assert(log2_div_eq_t(unsigned int, 1, 1, 2)); + +/* Compute a % b */ +#define log2_mod_t(type, a, b_lg2) \ + ((type)(((type)a) & log2_to_max_int_t(type, b_lg2))) +static_assert(log2_mod_t(unsigned int, 1, 2) == 1); + +/* + * Compute: + * a % b == b - 1 + * aka the low bits are all 1s + */ +#define log2_mod_eq_max_t(type, a, b_lg2) \ + (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2)) +static_assert(log2_mod_eq_max_t(unsigned int, 3, 2)); + +/* + * Return a value such that: + * a / b == ret / b + * ret % b == val + * aka set the low bits to val. val must be < b + */ +#define log2_set_mod_t(type, a, val, b_lg2) \ + ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val))) +static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1); + +/* Return a value such that: + * a / b == ret / b + * ret % b == b - 1 + * aka set the low bits to all 1s + */ +#define log2_set_mod_max_t(type, a, b_lg2) \ + (((type)(a)) | log2_to_max_int_t(type, b_lg2)) +static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3); + +/* Compute a * b */ +#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2))) +static_assert(log2_mul_t(unsigned int, 2, 2) == 8); + +#define _dispatch_sz(type, fn, a) \ + (sizeof(type) == 4 ? 
fn##32((u32)a) : fn##64(a)) + +/* + * Return the highest value such that: + * fls_t(u32, 0) == 0 + * fls_t(u3, 1) == 1 + * a >= log2_to_int(ret - 1) + * aka find last set bit + */ +static inline unsigned int fls32(u32 a) +{ + return fls(a); +} +#define fls_t(type, a) _dispatch_sz(type, fls, a) + +/* + * Return the highest value such that: + * ffs_t(u32, 0) == UNDEFINED + * ffs_t(u32, 1) == 0 + * log_mod(a, ret) == 0 + * aka find first set bit + */ +static inline unsigned int __ffs32(u32 a) +{ + return __ffs(a); +} +#define ffs_t(type, a) _dispatch_sz(type, __ffs, a) + +/* + * Return the highest value such that: + * ffz_t(u32, U32_MAX) == UNDEFINED + * ffz_t(u32, 0) == 0 + * ffz_t(u32, 1) == 1 + * log_mod(a, ret) == log_to_max_int(ret) + * aka find first zero bit + */ +static inline unsigned int ffz32(u32 a) +{ + return ffz(a); +} +static inline unsigned int ffz64(u64 a) +{ + if (sizeof(u64) == sizeof(unsigned long)) + return ffz(a); + + if ((u32)a == U32_MAX) + return ffz32(a >> 32) + 32; + return ffz32(a); +} +#define ffz_t(type, a) _dispatch_sz(type, ffz, a) + +#endif diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index f2f538c70650..5471f814e073 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -13,6 +13,10 @@ config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && X86 select IOMMU_API + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_X86_64 + select IOMMU_PT_VTDSS select IOMMU_IOVA select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD @@ -66,7 +70,7 @@ config INTEL_IOMMU_DEFAULT_ON config INTEL_IOMMU_FLOPPY_WA def_bool y - depends on X86 + depends on X86 && BLK_DEV_FD help Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. This diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e236c7ec221f..4e888867e85c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -45,16 +45,9 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 -#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) -#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) - -/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR - to match. That way, we can use 'unsigned long' for PFNs with impunity. */ -#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ - __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) -#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) - static void __init check_tylersburg_isoch(void); +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); static int rwbf_quirk; #define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) @@ -217,7 +210,6 @@ static int disable_igfx_iommu; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; -static const struct iommu_dirty_ops intel_dirty_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { @@ -285,13 +277,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) -{ - int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; - - return !(addr_width < BITS_PER_LONG && pfn >> addr_width); -} - /* * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of @@ -353,23 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -/* Return the super pagesize bitmap if supported. */ -static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) -{ - unsigned long bitmap = 0; - - /* - * 1-level super page supports page size of 2MiB, 2-level super page - * supports page size of both 2MiB and 1GiB. - */ - if (domain->iommu_superpage == 1) - bitmap |= SZ_2M; - else if (domain->iommu_superpage == 2) - bitmap |= SZ_2M | SZ_1G; - - return bitmap; -} - struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { @@ -556,13 +524,6 @@ out: return iommu; } -static void domain_flush_cache(struct dmar_domain *domain, - void *addr, int size) -{ - if (!domain->iommu_coherency) - clflush_cache_range(addr, size); -} - static void free_context_table(struct intel_iommu *iommu) { struct context_entry *context; @@ -707,280 +668,6 @@ pgtable_walk: } #endif -static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, - unsigned long pfn, int *target_level, - gfp_t gfp) -{ - struct dma_pte *parent, *pte; - int level = agaw_to_level(domain->agaw); - int offset; - - if (!domain_pfn_supported(domain, pfn)) - /* Address beyond IOMMU's addressing capabilities. */ - return NULL; - - parent = domain->pgd; - - while (1) { - void *tmp_page; - - offset = pfn_level_offset(pfn, level); - pte = &parent[offset]; - if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) - break; - if (level == *target_level) - break; - - if (!dma_pte_present(pte)) { - uint64_t pteval, tmp; - - tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp, - SZ_4K); - - if (!tmp_page) - return NULL; - - domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); - pteval = virt_to_phys(tmp_page) | DMA_PTE_READ | - DMA_PTE_WRITE; - if (domain->use_first_level) - pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - - tmp = 0ULL; - if (!try_cmpxchg64(&pte->val, &tmp, pteval)) - /* Someone else set it while we were thinking; use theirs. 
*/ - iommu_free_pages(tmp_page); - else - domain_flush_cache(domain, pte, sizeof(*pte)); - } - if (level == 1) - break; - - parent = phys_to_virt(dma_pte_addr(pte)); - level--; - } - - if (!*target_level) - *target_level = level; - - return pte; -} - -/* return address's pte at specific level */ -static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, - unsigned long pfn, - int level, int *large_page) -{ - struct dma_pte *parent, *pte; - int total = agaw_to_level(domain->agaw); - int offset; - - parent = domain->pgd; - while (level <= total) { - offset = pfn_level_offset(pfn, total); - pte = &parent[offset]; - if (level == total) - return pte; - - if (!dma_pte_present(pte)) { - *large_page = total; - break; - } - - if (dma_pte_superpage(pte)) { - *large_page = total; - return pte; - } - - parent = phys_to_virt(dma_pte_addr(pte)); - total--; - } - return NULL; -} - -/* clear last level pte, a tlb flush should be followed */ -static void dma_pte_clear_range(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long last_pfn) -{ - unsigned int large_page; - struct dma_pte *first_pte, *pte; - - if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || - WARN_ON(start_pfn > last_pfn)) - return; - - /* we don't need lock here; nobody else touches the iova range */ - do { - large_page = 1; - first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); - if (!pte) { - start_pfn = align_to_level(start_pfn + 1, large_page + 1); - continue; - } - do { - dma_clear_pte(pte); - start_pfn += lvl_to_nr_pages(large_page); - pte++; - } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); - - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - - } while (start_pfn && start_pfn <= last_pfn); -} - -static void dma_pte_free_level(struct dmar_domain *domain, int level, - int retain_level, struct dma_pte *pte, - unsigned long pfn, unsigned long start_pfn, - unsigned long last_pfn) -{ - pfn = max(start_pfn, pfn); - pte = &pte[pfn_level_offset(pfn, level)]; - - do { - unsigned long level_pfn; - struct dma_pte *level_pte; - - if (!dma_pte_present(pte) || dma_pte_superpage(pte)) - goto next; - - level_pfn = pfn & level_mask(level); - level_pte = phys_to_virt(dma_pte_addr(pte)); - - if (level > 2) { - dma_pte_free_level(domain, level - 1, retain_level, - level_pte, level_pfn, start_pfn, - last_pfn); - } - - /* - * Free the page table if we're below the level we want to - * retain and the range covers the entire table. - */ - if (level < retain_level && !(start_pfn > level_pfn || - last_pfn < level_pfn + level_size(level) - 1)) { - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - iommu_free_pages(level_pte); - } -next: - pfn += level_size(level); - } while (!first_pte_in_page(++pte) && pfn <= last_pfn); -} - -/* - * clear last level (leaf) ptes and free page table pages below the - * level we wish to keep intact. 
- */ -static void dma_pte_free_pagetable(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long last_pfn, - int retain_level) -{ - dma_pte_clear_range(domain, start_pfn, last_pfn); - - /* We don't need lock here; nobody else touches the iova range */ - dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, - domain->pgd, 0, start_pfn, last_pfn); - - /* free pgd */ - if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - iommu_free_pages(domain->pgd); - domain->pgd = NULL; - } -} - -/* When a page at a given level is being unlinked from its parent, we don't - need to *modify* it at all. All we need to do is make a list of all the - pages which can be freed just as soon as we've flushed the IOTLB and we - know the hardware page-walk will no longer touch them. - The 'pte' argument is the *parent* PTE, pointing to the page that is to - be freed. */ -static void dma_pte_list_pagetables(struct dmar_domain *domain, - int level, struct dma_pte *parent_pte, - struct iommu_pages_list *freelist) -{ - struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte)); - - iommu_pages_list_add(freelist, pte); - - if (level == 1) - return; - - do { - if (dma_pte_present(pte) && !dma_pte_superpage(pte)) - dma_pte_list_pagetables(domain, level - 1, pte, freelist); - pte++; - } while (!first_pte_in_page(pte)); -} - -static void dma_pte_clear_level(struct dmar_domain *domain, int level, - struct dma_pte *pte, unsigned long pfn, - unsigned long start_pfn, unsigned long last_pfn, - struct iommu_pages_list *freelist) -{ - struct dma_pte *first_pte = NULL, *last_pte = NULL; - - pfn = max(start_pfn, pfn); - pte = &pte[pfn_level_offset(pfn, level)]; - - do { - unsigned long level_pfn = pfn & level_mask(level); - - if (!dma_pte_present(pte)) - goto next; - - /* If range covers entire pagetable, free it */ - if (start_pfn <= level_pfn && - last_pfn >= level_pfn + level_size(level) - 1) { - /* These suborbinate page tables are going away entirely. Don't - bother to clear them; we're just going to *free* them. */ - if (level > 1 && !dma_pte_superpage(pte)) - dma_pte_list_pagetables(domain, level - 1, pte, freelist); - - dma_clear_pte(pte); - if (!first_pte) - first_pte = pte; - last_pte = pte; - } else if (level > 1) { - /* Recurse down into a level that isn't *entirely* obsolete */ - dma_pte_clear_level(domain, level - 1, - phys_to_virt(dma_pte_addr(pte)), - level_pfn, start_pfn, last_pfn, - freelist); - } -next: - pfn = level_pfn + level_size(level); - } while (!first_pte_in_page(++pte) && pfn <= last_pfn); - - if (first_pte) - domain_flush_cache(domain, first_pte, - (void *)++last_pte - (void *)first_pte); -} - -/* We can't just free the pages because the IOMMU may still be walking - the page tables, and may have cached the intermediate levels. The - pages can only be freed after the IOTLB flush has been done. 
*/ -static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, - unsigned long last_pfn, - struct iommu_pages_list *freelist) -{ - if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || - WARN_ON(start_pfn > last_pfn)) - return; - - /* we don't need lock here; nobody else touches the iova range */ - dma_pte_clear_level(domain, agaw_to_level(domain->agaw), - domain->pgd, 0, start_pfn, last_pfn, freelist); - - /* free pgd */ - if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - iommu_pages_list_add(freelist, domain->pgd); - domain->pgd = NULL; - } -} - /* iommu handling */ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { @@ -1460,13 +1147,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, domain_lookup_dev_info(domain, iommu, bus, devfn); u16 did = domain_id_iommu(domain, iommu); int translation = CONTEXT_TT_MULTI_LEVEL; - struct dma_pte *pgd = domain->pgd; + struct pt_iommu_vtdss_hw_info pt_info; struct context_entry *context; int ret; if (WARN_ON(!intel_domain_is_ss_paging(domain))) return -EINVAL; + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); + pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1489,8 +1178,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, else translation = CONTEXT_TT_MULTI_LEVEL; - context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, domain->agaw); + context_set_address_root(context, pt_info.ssptptr); + context_set_address_width(context, pt_info.aw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); @@ -1537,177 +1226,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) return 0; } -/* Return largest possible superpage level for a given mapping */ -static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phy_pfn, unsigned long pages) -{ - int support, level = 1; - unsigned long pfnmerge; - - support = domain->iommu_superpage; - - /* To use a large page, the virtual *and* physical addresses - must be aligned to 2MiB/1GiB/etc. Lower bits set in either - of them will mean we have to use smaller pages. So just - merge them and check both at once. */ - pfnmerge = iov_pfn | phy_pfn; - - while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { - pages >>= VTD_STRIDE_SHIFT; - if (!pages) - break; - pfnmerge >>= VTD_STRIDE_SHIFT; - level++; - support--; - } - return level; -} - -/* - * Ensure that old small page tables are removed to make room for superpage(s). - * We're going to add new large pages, so make sure we don't remove their parent - * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
- */ -static void switch_to_super_page(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long end_pfn, int level) -{ - unsigned long lvl_pages = lvl_to_nr_pages(level); - struct dma_pte *pte = NULL; - - if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) || - !IS_ALIGNED(end_pfn + 1, lvl_pages))) - return; - - while (start_pfn <= end_pfn) { - if (!pte) - pte = pfn_to_dma_pte(domain, start_pfn, &level, - GFP_ATOMIC); - - if (dma_pte_present(pte)) { - dma_pte_free_pagetable(domain, start_pfn, - start_pfn + lvl_pages - 1, - level + 1); - - cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, - end_pfn << VTD_PAGE_SHIFT, 0); - } - - pte++; - start_pfn += lvl_pages; - if (first_pte_in_page(pte)) - pte = NULL; - } -} - -static int -__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, int prot, - gfp_t gfp) -{ - struct dma_pte *first_pte = NULL, *pte = NULL; - unsigned int largepage_lvl = 0; - unsigned long lvl_pages = 0; - phys_addr_t pteval; - u64 attr; - - if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) - return -EINVAL; - - if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) - return -EINVAL; - - if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { - pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); - return -EINVAL; - } - - attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); - if (domain->use_first_level) { - attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - if (prot & DMA_PTE_WRITE) - attr |= DMA_FL_PTE_DIRTY; - } - - domain->has_mappings = true; - - pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; - - while (nr_pages > 0) { - uint64_t tmp; - - if (!pte) { - largepage_lvl = hardware_largepage_caps(domain, iov_pfn, - phys_pfn, nr_pages); - - pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, - gfp); - if (!pte) - return -ENOMEM; - first_pte = pte; - - lvl_pages = lvl_to_nr_pages(largepage_lvl); - - /* It is large page*/ - if (largepage_lvl > 1) { - unsigned long end_pfn; - unsigned long pages_to_remove; - - pteval |= DMA_PTE_LARGE_PAGE; - pages_to_remove = min_t(unsigned long, - round_down(nr_pages, lvl_pages), - nr_pte_to_next_page(pte) * lvl_pages); - end_pfn = iov_pfn + pages_to_remove - 1; - switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); - } else { - pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; - } - - } - /* We don't need lock here, nobody else - * touches the iova range - */ - tmp = 0ULL; - if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { - static int dumps = 5; - pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", - iov_pfn, tmp, (unsigned long long)pteval); - if (dumps) { - dumps--; - debug_dma_dump_mappings(NULL); - } - WARN_ON(1); - } - - nr_pages -= lvl_pages; - iov_pfn += lvl_pages; - phys_pfn += lvl_pages; - pteval += lvl_pages * VTD_PAGE_SIZE; - - /* If the next PTE would be the first in a new page, then we - * need to flush the cache on the entries we've just written. - * And then we'll need to recalculate 'pte', so clear it and - * let it get set again in the if (!pte) block above. - * - * If we're done (!nr_pages) we need to flush the cache too. - * - * Also if we've been setting superpages, we may need to - * recalculate 'pte' and switch back to smaller pages for the - * end of the mapping, if the trailing size is not enough to - * use another superpage (i.e. nr_pages < lvl_pages). 
- */ - pte++; - if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && nr_pages < lvl_pages)) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - pte = NULL; - } - } - - return 0; -} - static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) { struct intel_iommu *iommu = info->iommu; @@ -1769,22 +1287,26 @@ static int domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct iommu_domain *old) { - struct dma_pte *pgd = domain->pgd; - int level, flags = 0; + struct pt_iommu_x86_64_hw_info pt_info; + unsigned int flags = 0; - level = agaw_to_level(domain->agaw); - if (level != 4 && level != 5) + pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info); + if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5)) return -EINVAL; - if (level == 5) + if (pt_info.levels == 5) flags |= PASID_FLAG_FL5LP; if (domain->force_snooping) flags |= PASID_FLAG_PAGE_SNOOP; + if (!(domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + flags |= PASID_FLAG_PWSNP; + return __domain_setup_first_level(iommu, dev, pasid, domain_id_iommu(domain, iommu), - __pa(pgd), flags, old); + pt_info.gcr3_pt, flags, old); } static int dmar_domain_attach_device(struct dmar_domain *domain, @@ -3230,7 +2752,8 @@ void device_block_translation(struct device *dev) } static int blocking_domain_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); @@ -3251,23 +2774,9 @@ static struct iommu_domain blocking_domain = { } }; -static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage) -{ - if (!intel_iommu_superpage) - return 0; - - if (first_stage) - return cap_fl1gp_support(iommu->cap) ? 2 : 1; - - return fls(cap_super_page_val(iommu->cap)); -} - -static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage) +static struct dmar_domain *paging_domain_alloc(void) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; struct dmar_domain *domain; - int addr_width; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) @@ -3282,56 +2791,38 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st INIT_LIST_HEAD(&domain->s1_domains); spin_lock_init(&domain->s1_lock); - domain->nid = dev_to_node(dev); - domain->use_first_level = first_stage; - - domain->domain.type = IOMMU_DOMAIN_UNMANAGED; - - /* calculate the address width */ - addr_width = agaw_to_width(iommu->agaw); - if (addr_width > cap_mgaw(iommu->cap)) - addr_width = cap_mgaw(iommu->cap); - domain->gaw = addr_width; - domain->agaw = iommu->agaw; - domain->max_addr = __DOMAIN_MAX_ADDR(addr_width); - - /* iommu memory access coherency */ - domain->iommu_coherency = iommu_paging_structure_coherency(iommu); + return domain; +} - /* pagesize bitmap */ - domain->domain.pgsize_bitmap = SZ_4K; - domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage); - domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); +static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int mgaw = cap_mgaw(iommu->cap); /* - * IOVA aperture: First-level translation restricts the input-address - * to a canonical address (i.e., address bits 63:N have the same value - * as address bit [N-1], where N is 48-bits with 4-level paging and - * 57-bits with 5-level paging). 
Hence, skip bit [N-1]. + * Spec 3.6 First-Stage Translation: + * + * Software must limit addresses to less than the minimum of MGAW + * and the lower canonical address width implied by FSPM (i.e., + * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level). */ - domain->domain.geometry.force_aperture = true; - domain->domain.geometry.aperture_start = 0; - if (first_stage) - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); - else - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); - - /* always allocate the top pgd */ - domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K); - if (!domain->pgd) { - kfree(domain); - return ERR_PTR(-ENOMEM); + if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) { + *top_level = 4; + return min(57, mgaw); } - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return domain; + /* Four level is always supported */ + *top_level = 3; + return min(48, mgaw); } static struct iommu_domain * intel_iommu_domain_alloc_first_stage(struct device *dev, struct intel_iommu *iommu, u32 flags) { + struct pt_iommu_x86_64_cfg cfg = {}; struct dmar_domain *dmar_domain; + int ret; if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); @@ -3340,10 +2831,20 @@ intel_iommu_domain_alloc_first_stage(struct device *dev, if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); - dmar_domain = paging_domain_alloc(dev, true); + dmar_domain = paging_domain_alloc(); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); + cfg.common.hw_max_vasz_lg2 = + compute_vasz_lg2_fs(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) | + BIT(PT_FEAT_FLUSH_RANGE); + /* First stage always uses scalable mode */ + if (!ecap_smpwc(iommu->ecap)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); dmar_domain->domain.ops = &intel_fs_paging_domain_ops; /* * iotlb sync for map is only needed for legacy implementations that @@ -3353,14 +2854,58 @@ intel_iommu_domain_alloc_first_stage(struct device *dev, if (rwbf_required(iommu)) dmar_domain->iotlb_sync_map = true; + ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); + } + + if (!cap_fl1gp_support(iommu->cap)) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; + return &dmar_domain->domain; } +static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int sagaw = cap_sagaw(iommu->cap); + unsigned int mgaw = cap_mgaw(iommu->cap); + + /* + * Find the largest table size that both the mgaw and sagaw support. + * This sets the valid range of IOVA and the top starting level. + * Some HW may only support a 4 or 5 level walk but must limit IOVA to + * 3 levels. 
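 * As an editorial illustration (values not from the patch): hardware
 * reporting mgaw = 48 and sagaw = 0x4 (4-level only) takes the second
 * branch below, giving *top_level = 3 and a 48-bit IOVA space, while
 * mgaw = 39 with sagaw = 0x6 (3- and 4-level) falls through to the
 * third branch, which still selects a 4-level walk (*top_level = 3)
 * but limits the IOVA space to 39 bits.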
+ */ + if (mgaw > 48 && sagaw >= BIT(3)) { + *top_level = 4; + return min(57, mgaw); + } else if (mgaw > 39 && sagaw >= BIT(2)) { + *top_level = 3 + ffs(sagaw >> 3); + return min(48, mgaw); + } else if (mgaw > 30 && sagaw >= BIT(1)) { + *top_level = 2 + ffs(sagaw >> 2); + return min(39, mgaw); + } + return 0; +} + +static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { + IOMMU_PT_DIRTY_OPS(vtdss), + .set_dirty_tracking = intel_iommu_set_dirty_tracking, +}; + static struct iommu_domain * intel_iommu_domain_alloc_second_stage(struct device *dev, struct intel_iommu *iommu, u32 flags) { + struct pt_iommu_vtdss_cfg cfg = {}; struct dmar_domain *dmar_domain; + unsigned int sslps; + int ret; if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | @@ -3377,15 +2922,46 @@ intel_iommu_domain_alloc_second_stage(struct device *dev, if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); - dmar_domain = paging_domain_alloc(dev, false); + dmar_domain = paging_domain_alloc(); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); + cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE); + + /* + * Read-only mapping is disallowed on the domain which serves as the + * parent in a nested configuration, due to HW errata + * (ERRATA_772415_SPR17) + */ + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) + cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE); + + if (!iommu_paging_structure_coherency(iommu)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); dmar_domain->domain.ops = &intel_ss_paging_domain_ops; dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) - dmar_domain->domain.dirty_ops = &intel_dirty_ops; + dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops; + + ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); + } + + /* Adjust the supported page sizes to HW capability */ + sslps = cap_super_page_val(iommu->cap); + if (!(sslps & BIT(0))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M; + if (!(sslps & BIT(1))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; /* * Besides the internal write buffer flush, the caching mode used for @@ -3427,14 +3003,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) if (WARN_ON(!list_empty(&dmar_domain->devices))) return; - if (dmar_domain->pgd) { - struct iommu_pages_list freelist = - IOMMU_PAGES_LIST_INIT(freelist); - - domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw), - &freelist); - iommu_put_pages_list(&freelist); - } + pt_iommu_deinit(&dmar_domain->iommu); kfree(dmar_domain->qi_batch); kfree(dmar_domain); @@ -3451,6 +3020,16 @@ static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return -EINVAL; + if (!ecap_smpwc(iommu->ecap) && + !(dmar_domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; + + /* Supports the number of table levels */ + if (!cap_fl5lp_support(iommu->cap) && + dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48) + return -EINVAL; + /* Same page size support */ if (!cap_fl1gp_support(iommu->cap) && (dmar_domain->domain.pgsize_bitmap & 
SZ_1G)) @@ -3467,7 +3046,11 @@ static int paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, struct intel_iommu *iommu) { + unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2; unsigned int sslps = cap_super_page_val(iommu->cap); + struct pt_iommu_vtdss_hw_info pt_info; + + pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info); if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) return -EINVAL; @@ -3478,6 +3061,19 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) return -EINVAL; + if (!iommu_paging_structure_coherency(iommu) && + !(dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; + + /* Address width falls within the capability */ + if (cap_mgaw(iommu->cap) < vasz_lg2) + return -EINVAL; + + /* Page table level is supported. */ + if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw))) + return -EINVAL; + /* Same page size support */ if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) return -EINVAL; @@ -3489,6 +3085,14 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, !dmar_domain->iotlb_sync_map) return -EINVAL; + /* + * FIXME this is locked wrong, it needs to be under the + * dmar_domain->lock + */ + if ((dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) && + !ecap_sc_support(iommu->ecap)) + return -EINVAL; return 0; } @@ -3498,7 +3102,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; int ret = -EINVAL; - int addr_width; if (intel_domain_is_fs_paging(dmar_domain)) ret = paging_domain_compatible_first_stage(dmar_domain, iommu); @@ -3509,26 +3112,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) if (ret) return ret; - /* - * FIXME this is locked wrong, it needs to be under the - * dmar_domain->lock - */ - if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) - return -EINVAL; - - if (dmar_domain->iommu_coherency != - iommu_paging_structure_coherency(iommu)) - return -EINVAL; - - - /* check if this iommu agaw is sufficient for max mapped address */ - addr_width = agaw_to_width(iommu->agaw); - if (addr_width > cap_mgaw(iommu->cap)) - addr_width = cap_mgaw(iommu->cap); - - if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) - return -EINVAL; - if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && context_copied(iommu, info->bus, info->devfn)) return intel_pasid_setup_sm_context(dev); @@ -3537,7 +3120,8 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) } static int intel_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret; @@ -3558,110 +3142,6 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return ret; } -static int intel_iommu_map(struct iommu_domain *domain, - unsigned long iova, phys_addr_t hpa, - size_t size, int iommu_prot, gfp_t gfp) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - u64 max_addr; - int prot = 0; - - if (iommu_prot & IOMMU_READ) - prot |= DMA_PTE_READ; - if (iommu_prot & IOMMU_WRITE) - prot |= DMA_PTE_WRITE; - if (dmar_domain->set_pte_snp) - prot |= DMA_PTE_SNP; - - max_addr = iova + size; - if (dmar_domain->max_addr < max_addr) { - u64 end; - - /* check if minimum agaw is sufficient for mapped 
address */ - end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; - if (end < max_addr) { - pr_err("%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, dmar_domain->gaw, max_addr); - return -EFAULT; - } - dmar_domain->max_addr = max_addr; - } - /* Round up size to next multiple of PAGE_SIZE, if it and - the low bits of hpa would take us onto the next page */ - size = aligned_nrpages(hpa, size); - return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, - hpa >> VTD_PAGE_SHIFT, size, prot, gfp); -} - -static int intel_iommu_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; - int ret; - - if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) - return -EINVAL; - - if (!IS_ALIGNED(iova | paddr, pgsize)) - return -EINVAL; - - ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); - if (!ret && mapped) - *mapped = size; - - return ret; -} - -static size_t intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *gather) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long start_pfn, last_pfn; - int level = 0; - - /* Cope with horrid API which requires us to unmap more than the - size argument if it happens to be a large-page mapping. */ - if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, - &level, GFP_ATOMIC))) - return 0; - - if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) - size = VTD_PAGE_SIZE << level_to_offset_bits(level); - - start_pfn = iova >> VTD_PAGE_SHIFT; - last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; - - domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); - - if (dmar_domain->max_addr == iova + size) - dmar_domain->max_addr = iova; - - /* - * We do not use page-selective IOTLB invalidation in flush queue, - * so there is no need to track page and sync iotlb. 
- */ - if (!iommu_iotlb_gather_queued(gather)) - iommu_iotlb_gather_add_page(domain, gather, iova, size); - - return size; -} - -static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; - - return intel_iommu_unmap(domain, iova, size, gather); -} - static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { @@ -3671,24 +3151,6 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain, iommu_put_pages_list(&gather->freelist); } -static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct dma_pte *pte; - int level = 0; - u64 phys = 0; - - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, - GFP_ATOMIC); - if (pte && dma_pte_present(pte)) - phys = dma_pte_addr(pte) + - (iova & (BIT_MASK(level_to_offset_bits(level) + - VTD_PAGE_SHIFT) - 1)); - - return phys; -} - static bool domain_support_force_snooping(struct dmar_domain *domain) { struct device_domain_info *info; @@ -3730,15 +3192,15 @@ static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain) struct dmar_domain *dmar_domain = to_dmar_domain(domain); guard(spinlock_irqsave)(&dmar_domain->lock); - if (!domain_support_force_snooping(dmar_domain) || - dmar_domain->has_mappings) + if (!domain_support_force_snooping(dmar_domain)) return false; /* * Second level page table supports per-PTE snoop control. The * iommu_map() interface will handle this by setting SNP bit. */ - dmar_domain->set_pte_snp = true; + dmar_domain->sspt.vtdss_pt.common.features |= + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE); dmar_domain->force_snooping = true; return true; } @@ -4302,49 +3764,6 @@ err_unwind: return ret; } -static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long end = iova + size - 1; - unsigned long pgsize; - - /* - * IOMMUFD core calls into a dirty tracking disabled domain without an - * IOVA bitmap set in order to clean dirty bits in all PTEs that might - * have occurred when we stopped dirty tracking. This ensures that we - * never inherit dirtied bits from a previous cycle. 
- */ - if (!dmar_domain->dirty_tracking && dirty->bitmap) - return -EINVAL; - - do { - struct dma_pte *pte; - int lvl = 0; - - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, - GFP_ATOMIC); - pgsize = level_size(lvl) << VTD_PAGE_SHIFT; - if (!pte || !dma_pte_present(pte)) { - iova += pgsize; - continue; - } - - if (dma_sl_pte_test_and_clear_dirty(pte, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -static const struct iommu_dirty_ops intel_dirty_ops = { - .set_dirty_tracking = intel_iommu_set_dirty_tracking, - .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, -}; - static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) { struct device_domain_info *info = dev_iommu_priv_get(dev); @@ -4401,7 +3820,9 @@ static int device_setup_pass_through(struct device *dev) context_setup_pass_through_cb, dev); } -static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) +static int identity_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; @@ -4462,27 +3883,23 @@ static struct iommu_domain identity_domain = { }; const struct iommu_domain_ops intel_fs_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, - .map_pages = intel_iommu_map_pages, - .unmap_pages = intel_iommu_unmap_pages, .iotlb_sync_map = intel_iommu_iotlb_sync_map, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, - .iova_to_phys = intel_iommu_iova_to_phys, .free = intel_iommu_domain_free, .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs, }; const struct iommu_domain_ops intel_ss_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(vtdss), .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, - .map_pages = intel_iommu_map_pages, - .unmap_pages = intel_iommu_unmap_pages, .iotlb_sync_map = intel_iommu_iotlb_sync_map, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, - .iova_to_phys = intel_iommu_iova_to_phys, .free = intel_iommu_domain_free, .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss, }; @@ -4797,3 +4214,5 @@ err: return ret; } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 3056583d7f56..25c5e22096d4 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -23,8 +23,8 @@ #include <linux/xarray.h> #include <linux/perf_event.h> #include <linux/pci.h> +#include <linux/generic_pt/iommu.h> -#include <asm/cacheflush.h> #include <asm/iommu.h> #include <uapi/linux/iommufd.h> @@ -595,22 +595,20 @@ struct qi_batch { }; struct dmar_domain { - int nid; /* node id */ + union { + struct iommu_domain domain; + struct pt_iommu iommu; + /* First stage page table */ + struct pt_iommu_x86_64 fspt; + /* Second stage page table */ + struct pt_iommu_vtdss sspt; + }; + struct xarray iommu_array; /* Attached IOMMU array */ - u8 iommu_coherency: 1; /* indicate coherency of iommu access */ - u8 force_snooping : 1; /* Create IOPTEs with snoop control */ - u8 set_pte_snp:1; - u8 use_first_level:1; /* DMA translation for the domain goes - * through the first level page table, - * otherwise, goes through the second - * level. 
- */ + u8 force_snooping:1; /* Create PASID entry with snoop control */ u8 dirty_tracking:1; /* Dirty tracking is enabled */ u8 nested_parent:1; /* Has other domains nested on it */ - u8 has_mappings:1; /* Has mappings configured through - * iommu_map() interface. - */ u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write * buffer when creating mappings. */ @@ -623,26 +621,9 @@ struct dmar_domain { struct list_head cache_tags; /* Cache tag list */ struct qi_batch *qi_batch; /* Batched QI descriptors */ - int iommu_superpage;/* Level of superpages supported: - 0 == 4KiB (no superpages), 1 == 2MiB, - 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ union { /* DMA remapping domain */ struct { - /* virtual address */ - struct dma_pte *pgd; - /* max guest address width */ - int gaw; - /* - * adjusted guest address width: - * 0: level 2 30-bit - * 1: level 3 39-bit - * 2: level 4 48-bit - * 3: level 5 57-bit - */ - int agaw; - /* maximum mapped address */ - u64 max_addr; /* Protect the s1_domains list */ spinlock_t s1_lock; /* Track s1_domains nested on this domain */ @@ -664,10 +645,10 @@ struct dmar_domain { struct mmu_notifier notifier; }; }; - - struct iommu_domain domain; /* generic domain data structure for - iommu core */ }; +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain); /* * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters. @@ -866,11 +847,6 @@ struct dma_pte { u64 val; }; -static inline void dma_clear_pte(struct dma_pte *pte) -{ - pte->val = 0; -} - static inline u64 dma_pte_addr(struct dma_pte *pte) { #ifdef CONFIG_64BIT @@ -886,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } -static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte, - unsigned long flags) -{ - if (flags & IOMMU_DIRTY_NO_CLEAR) - return (pte->val & DMA_SL_PTE_DIRTY) != 0; - - return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT, - (unsigned long *)&pte->val); -} - static inline bool dma_pte_superpage(struct dma_pte *pte) { return (pte->val & DMA_PTE_LARGE_PAGE); } -static inline bool first_pte_in_page(struct dma_pte *pte) -{ - return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); -} - -static inline int nr_pte_to_next_page(struct dma_pte *pte) -{ - return first_pte_in_page(pte) ? 
BIT_ULL(VTD_STRIDE_SHIFT) : - (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; -} - static inline bool context_present(struct context_entry *context) { return (context->lo & 1); @@ -927,11 +882,6 @@ static inline int agaw_to_level(int agaw) return agaw + 2; } -static inline int agaw_to_width(int agaw) -{ - return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); -} - static inline int width_to_agaw(int width) { return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); @@ -947,25 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level) return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; } -static inline u64 level_mask(int level) -{ - return -1ULL << level_to_offset_bits(level); -} - -static inline u64 level_size(int level) -{ - return 1ULL << level_to_offset_bits(level); -} - -static inline u64 align_to_level(u64 pfn, int level) -{ - return (pfn + level_size(level) - 1) & level_mask(level); -} - -static inline unsigned long lvl_to_nr_pages(unsigned int lvl) -{ - return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); -} static inline void context_set_present(struct context_entry *context) { @@ -1097,7 +1028,7 @@ static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, struct qi_desc *desc) { u8 dw = 0, dr = 0; - int ih = 0; + int ih = addr & 1; if (cap_write_drain(iommu->cap)) dw = 1; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 1b6ad9c900a5..a3fb8c193ca6 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -19,7 +19,7 @@ #include "pasid.h" static int intel_nested_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -29,11 +29,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, device_block_translation(dev); - if (iommu->agaw < dmar_domain->s2_domain->agaw) { - dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); - return -ENODEV; - } - /* * Stage-1 domain cannot work alone, it is nested on a s2_domain. 
* The s2_domain will be used in nested translation, hence needs diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 52f678975da7..3e2255057079 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -366,7 +366,7 @@ static void pasid_pte_config_first_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP); /* Setup Present and PASID Granular Transfer Type: */ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); @@ -461,19 +461,22 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu, */ static void pasid_pte_config_second_level(struct intel_iommu *iommu, struct pasid_entry *pte, - u64 pgd_val, int agaw, u16 did, - bool dirty_tracking) + struct dmar_domain *domain, u16 did) { + struct pt_iommu_vtdss_hw_info pt_info; + lockdep_assert_held(&iommu->lock); + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); pasid_clear_entry(pte); pasid_set_domain_id(pte, did); - pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, agaw); + pasid_set_slptr(pte, pt_info.ssptptr); + pasid_set_address_width(pte, pt_info.aw); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (dirty_tracking) + pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); + if (domain->dirty_tracking) pasid_set_ssade(pte); pasid_set_present(pte); @@ -484,10 +487,9 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct device *dev, u32 pasid) { struct pasid_entry *pte; - struct dma_pte *pgd; - u64 pgd_val; u16 did; + /* * If hardware advertises no support for second level * translation, return directly. 
@@ -498,8 +500,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EINVAL; } - pgd = domain->pgd; - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); spin_lock(&iommu->lock); @@ -514,8 +514,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw, - did, domain->dirty_tracking); + pasid_pte_config_second_level(iommu, pte, domain, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -529,8 +528,6 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu, u32 pasid) { struct pasid_entry *pte, new_pte; - struct dma_pte *pgd; - u64 pgd_val; u16 did; /* @@ -543,13 +540,9 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu, return -EINVAL; } - pgd = domain->pgd; - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); - pasid_pte_config_second_level(iommu, &new_pte, pgd_val, - domain->agaw, did, - domain->dirty_tracking); + pasid_pte_config_second_level(iommu, &new_pte, domain, did); spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); @@ -747,10 +740,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu, struct dmar_domain *s2_domain, u16 did) { - struct dma_pte *pgd = s2_domain->pgd; + struct pt_iommu_vtdss_hw_info pt_info; lockdep_assert_held(&iommu->lock); + pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info); + pasid_clear_entry(pte); if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) @@ -770,11 +765,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu, if (s2_domain->force_snooping) pasid_set_pgsnp(pte); - pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_slptr(pte, pt_info.ssptptr); pasid_set_fault_enable(pte); pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, s2_domain->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_address_width(pte, pt_info.aw); + pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); if (s2_domain->dirty_tracking) pasid_set_ssade(pte); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index a771a77d4239..b4c85242dc79 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -24,6 +24,7 @@ #define PASID_FLAG_NESTED BIT(1) #define PASID_FLAG_PAGE_SNOOP BIT(2) +#define PASID_FLAG_PWSNP BIT(2) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e147f71f91b7..71de7947971f 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -170,6 +170,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; + sflags |= PASID_FLAG_PWSNP; ret = __domain_setup_first_level(iommu, dev, pasid, FLPT_DEFAULT_DID, __pa(mm->pgd), sflags, old); diff --git a/drivers/iommu/io-pgtable-arm-selftests.c b/drivers/iommu/io-pgtable-arm-selftests.c new file mode 100644 index 000000000000..334e70350924 --- /dev/null +++ b/drivers/iommu/io-pgtable-arm-selftests.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CPU-agnostic ARM page table allocator. 
+ * + * Copyright (C) 2014 ARM Limited + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt + +#include <kunit/device.h> +#include <kunit/test.h> +#include <linux/io-pgtable.h> +#include <linux/kernel.h> + +#include "io-pgtable-arm.h" + +static struct io_pgtable_cfg *cfg_cookie; + +static void dummy_tlb_flush_all(void *cookie) +{ + WARN_ON(cookie != cfg_cookie); +} + +static void dummy_tlb_flush(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + WARN_ON(cookie != cfg_cookie); + WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); +} + +static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ + dummy_tlb_flush(iova, granule, granule, cookie); +} + +static const struct iommu_flush_ops dummy_tlb_ops = { + .tlb_flush_all = dummy_tlb_flush_all, + .tlb_flush_walk = dummy_tlb_flush, + .tlb_add_page = dummy_tlb_add_page, +}; + +#define __FAIL(test, i) ({ \ + KUNIT_FAIL(test, "test failed for fmt idx %d\n", (i)); \ + -EFAULT; \ +}) + +static int arm_lpae_run_tests(struct kunit *test, struct io_pgtable_cfg *cfg) +{ + static const enum io_pgtable_fmt fmts[] = { + ARM_64_LPAE_S1, + ARM_64_LPAE_S2, + }; + + int i, j; + unsigned long iova; + size_t size, mapped; + struct io_pgtable_ops *ops; + + for (i = 0; i < ARRAY_SIZE(fmts); ++i) { + cfg_cookie = cfg; + ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg); + if (!ops) { + kunit_err(test, "failed to allocate io pgtable ops\n"); + return -ENOMEM; + } + + /* + * Initial sanity checks. + * Empty page tables shouldn't provide any translations. + */ + if (ops->iova_to_phys(ops, 42)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, SZ_1G + 42)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, SZ_2G + 42)) + return __FAIL(test, i); + + /* + * Distinct mappings of different granule sizes. + */ + iova = 0; + for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { + size = 1UL << j; + + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + /* Overlapping mappings */ + if (!ops->map_pages(ops, iova, iova + size, size, 1, + IOMMU_READ | IOMMU_NOEXEC, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) + return __FAIL(test, i); + + iova += SZ_1G; + } + + /* Full unmap */ + iova = 0; + for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { + size = 1UL << j; + + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42)) + return __FAIL(test, i); + + /* Remap full block */ + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_WRITE, GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) + return __FAIL(test, i); + + iova += SZ_1G; + } + + /* + * Map/unmap the last largest supported page of the IAS, this can + * trigger corner cases in the concatednated page tables. 
+ */ + mapped = 0; + size = 1UL << __fls(cfg->pgsize_bitmap); + iova = (1UL << cfg->ias) - size; + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + if (mapped != size) + return __FAIL(test, i); + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) + return __FAIL(test, i); + + free_io_pgtable_ops(ops); + } + + return 0; +} + +static void arm_lpae_do_selftests(struct kunit *test) +{ + static const unsigned long pgsize[] = { + SZ_4K | SZ_2M | SZ_1G, + SZ_16K | SZ_32M, + SZ_64K | SZ_512M, + }; + + static const unsigned int address_size[] = { + 32, 36, 40, 42, 44, 48, + }; + + int i, j, k, pass = 0, fail = 0; + struct device *dev; + struct io_pgtable_cfg cfg = { + .tlb = &dummy_tlb_ops, + .coherent_walk = true, + .quirks = IO_PGTABLE_QUIRK_NO_WARN, + }; + + dev = kunit_device_register(test, "io-pgtable-test"); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); + if (IS_ERR_OR_NULL(dev)) + return; + + cfg.iommu_dev = dev; + + for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { + for (j = 0; j < ARRAY_SIZE(address_size); ++j) { + /* Don't use ias > oas as it is not valid for stage-2. */ + for (k = 0; k <= j; ++k) { + cfg.pgsize_bitmap = pgsize[i]; + cfg.ias = address_size[k]; + cfg.oas = address_size[j]; + kunit_info(test, "pgsize_bitmap 0x%08lx, IAS %u OAS %u\n", + pgsize[i], cfg.ias, cfg.oas); + if (arm_lpae_run_tests(test, &cfg)) + fail++; + else + pass++; + } + } + } + + kunit_info(test, "completed with %d PASS %d FAIL\n", pass, fail); +} + +static struct kunit_case io_pgtable_arm_test_cases[] = { + KUNIT_CASE(arm_lpae_do_selftests), + {}, +}; + +static struct kunit_suite io_pgtable_arm_test = { + .name = "io-pgtable-arm-test", + .test_cases = io_pgtable_arm_test_cases, +}; + +kunit_test_suite(io_pgtable_arm_test); + +MODULE_DESCRIPTION("io-pgtable-arm library kunit tests"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7e8e2216c294..e6626004b323 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -12,8 +12,6 @@ #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/io-pgtable.h> -#include <linux/kernel.h> -#include <linux/device/faux.h> #include <linux/sizes.h> #include <linux/slab.h> #include <linux/types.h> @@ -1267,204 +1265,3 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = { .alloc = arm_mali_lpae_alloc_pgtable, .free = arm_lpae_free_pgtable, }; - -#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST - -static struct io_pgtable_cfg *cfg_cookie __initdata; - -static void __init dummy_tlb_flush_all(void *cookie) -{ - WARN_ON(cookie != cfg_cookie); -} - -static void __init dummy_tlb_flush(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - WARN_ON(cookie != cfg_cookie); - WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); -} - -static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather, - unsigned long iova, size_t granule, - void *cookie) -{ - dummy_tlb_flush(iova, granule, granule, cookie); -} - -static const struct iommu_flush_ops dummy_tlb_ops __initconst = { - .tlb_flush_all = dummy_tlb_flush_all, - .tlb_flush_walk = dummy_tlb_flush, - .tlb_add_page = dummy_tlb_add_page, -}; - -static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops) -{ - struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &data->iop.cfg; - - pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n", - cfg->pgsize_bitmap, cfg->ias); - 
pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n", - ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data), - ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd); -} - -#define __FAIL(ops, i) ({ \ - WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \ - arm_lpae_dump_ops(ops); \ - -EFAULT; \ -}) - -static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) -{ - static const enum io_pgtable_fmt fmts[] __initconst = { - ARM_64_LPAE_S1, - ARM_64_LPAE_S2, - }; - - int i, j; - unsigned long iova; - size_t size, mapped; - struct io_pgtable_ops *ops; - - for (i = 0; i < ARRAY_SIZE(fmts); ++i) { - cfg_cookie = cfg; - ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg); - if (!ops) { - pr_err("selftest: failed to allocate io pgtable ops\n"); - return -ENOMEM; - } - - /* - * Initial sanity checks. - * Empty page tables shouldn't provide any translations. - */ - if (ops->iova_to_phys(ops, 42)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_1G + 42)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_2G + 42)) - return __FAIL(ops, i); - - /* - * Distinct mappings of different granule sizes. - */ - iova = 0; - for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { - size = 1UL << j; - - if (ops->map_pages(ops, iova, iova, size, 1, - IOMMU_READ | IOMMU_WRITE | - IOMMU_NOEXEC | IOMMU_CACHE, - GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - /* Overlapping mappings */ - if (!ops->map_pages(ops, iova, iova + size, size, 1, - IOMMU_READ | IOMMU_NOEXEC, - GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) - return __FAIL(ops, i); - - iova += SZ_1G; - } - - /* Full unmap */ - iova = 0; - for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { - size = 1UL << j; - - if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42)) - return __FAIL(ops, i); - - /* Remap full block */ - if (ops->map_pages(ops, iova, iova, size, 1, - IOMMU_WRITE, GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) - return __FAIL(ops, i); - - iova += SZ_1G; - } - - /* - * Map/unmap the last largest supported page of the IAS, this can - * trigger corner cases in the concatednated page tables. - */ - mapped = 0; - size = 1UL << __fls(cfg->pgsize_bitmap); - iova = (1UL << cfg->ias) - size; - if (ops->map_pages(ops, iova, iova, size, 1, - IOMMU_READ | IOMMU_WRITE | - IOMMU_NOEXEC | IOMMU_CACHE, - GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - if (mapped != size) - return __FAIL(ops, i); - if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) - return __FAIL(ops, i); - - free_io_pgtable_ops(ops); - } - - return 0; -} - -static int __init arm_lpae_do_selftests(void) -{ - static const unsigned long pgsize[] __initconst = { - SZ_4K | SZ_2M | SZ_1G, - SZ_16K | SZ_32M, - SZ_64K | SZ_512M, - }; - - static const unsigned int address_size[] __initconst = { - 32, 36, 40, 42, 44, 48, - }; - - int i, j, k, pass = 0, fail = 0; - struct faux_device *dev; - struct io_pgtable_cfg cfg = { - .tlb = &dummy_tlb_ops, - .coherent_walk = true, - .quirks = IO_PGTABLE_QUIRK_NO_WARN, - }; - - dev = faux_device_create("io-pgtable-test", NULL, 0); - if (!dev) - return -ENOMEM; - - cfg.iommu_dev = &dev->dev; - - for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { - for (j = 0; j < ARRAY_SIZE(address_size); ++j) { - /* Don't use ias > oas as it is not valid for stage-2. 
*/ - for (k = 0; k <= j; ++k) { - cfg.pgsize_bitmap = pgsize[i]; - cfg.ias = address_size[k]; - cfg.oas = address_size[j]; - pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u OAS %u\n", - pgsize[i], cfg.ias, cfg.oas); - if (arm_lpae_run_tests(&cfg)) - fail++; - else - pass++; - } - } - } - - pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail); - faux_device_destroy(dev); - - return fail ? -EFAULT : 0; -} -subsys_initcall(arm_lpae_do_selftests); -#endif diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c index 8841c1487f00..843fec8e8a51 100644 --- a/drivers/iommu/io-pgtable.c +++ b/drivers/iommu/io-pgtable.c @@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = { #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S [ARM_V7S] = &io_pgtable_arm_v7s_init_fns, #endif -#ifdef CONFIG_AMD_IOMMU - [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns, - [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns, -#endif }; static int check_custom_allocator(enum io_pgtable_fmt fmt, diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c index 238c09e5166b..3bab175d8557 100644 --- a/drivers/iommu/iommu-pages.c +++ b/drivers/iommu/iommu-pages.c @@ -4,6 +4,7 @@ * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include "iommu-pages.h" +#include <linux/dma-mapping.h> #include <linux/gfp.h> #include <linux/mm.h> @@ -22,6 +23,11 @@ IOPTDESC_MATCH(memcg_data, memcg_data); #undef IOPTDESC_MATCH static_assert(sizeof(struct ioptdesc) <= sizeof(struct page)); +static inline size_t ioptdesc_mem_size(struct ioptdesc *desc) +{ + return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT); +} + /** * iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from * specific NUMA node @@ -36,6 +42,7 @@ static_assert(sizeof(struct ioptdesc) <= sizeof(struct page)); */ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size) { + struct ioptdesc *iopt; unsigned long pgcnt; struct folio *folio; unsigned int order; @@ -60,6 +67,9 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size) if (unlikely(!folio)) return NULL; + iopt = folio_ioptdesc(folio); + iopt->incoherent = false; + /* * All page allocations that should be reported to as "iommu-pagetables" * to userspace must use one of the functions below. This includes @@ -80,7 +90,10 @@ EXPORT_SYMBOL_GPL(iommu_alloc_pages_node_sz); static void __iommu_free_desc(struct ioptdesc *iopt) { struct folio *folio = ioptdesc_folio(iopt); - const unsigned long pgcnt = 1UL << folio_order(folio); + const unsigned long pgcnt = folio_nr_pages(folio); + + if (IOMMU_PAGES_USE_DMA_API) + WARN_ON_ONCE(iopt->incoherent); mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt); lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt); @@ -117,3 +130,124 @@ void iommu_put_pages_list(struct iommu_pages_list *list) __iommu_free_desc(iopt); } EXPORT_SYMBOL_GPL(iommu_put_pages_list); + +/** + * iommu_pages_start_incoherent - Setup the page for cache incoherent operation + * @virt: The page to setup + * @dma_dev: The iommu device + * + * For incoherent memory this will use the DMA API to manage the cache flushing + * on some arches. This is a lot of complexity compared to just calling + * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers + * have been doing. The DMA API requires keeping track of the DMA map and + * freeing it when required. This keeps track of the dma map inside the ioptdesc + * so that error paths are simple for the caller. 
+ */ +int iommu_pages_start_incoherent(void *virt, struct device *dma_dev) +{ + struct ioptdesc *iopt = virt_to_ioptdesc(virt); + dma_addr_t dma; + + if (WARN_ON(iopt->incoherent)) + return -EINVAL; + + if (!IOMMU_PAGES_USE_DMA_API) { + iommu_pages_flush_incoherent(dma_dev, virt, 0, + ioptdesc_mem_size(iopt)); + } else { + dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt), + DMA_TO_DEVICE); + if (dma_mapping_error(dma_dev, dma)) + return -EINVAL; + + /* + * The DMA API is not allowed to do anything other than DMA + * direct. It would be nice to also check + * dev_is_dma_coherent(dma_dev)); + */ + if (WARN_ON(dma != virt_to_phys(virt))) { + dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt), + DMA_TO_DEVICE); + return -EOPNOTSUPP; + } + } + + iopt->incoherent = 1; + return 0; +} +EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent); + +/** + * iommu_pages_start_incoherent_list - Make a list of pages incoherent + * @list: The list of pages to setup + * @dma_dev: The iommu device + * + * Perform iommu_pages_start_incoherent() across all of list. + * + * If this fails the caller must call iommu_pages_stop_incoherent_list(). + */ +int iommu_pages_start_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) +{ + struct ioptdesc *cur; + int ret; + + list_for_each_entry(cur, &list->pages, iopt_freelist_elm) { + if (WARN_ON(cur->incoherent)) + continue; + + ret = iommu_pages_start_incoherent( + folio_address(ioptdesc_folio(cur)), dma_dev); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list); + +/** + * iommu_pages_stop_incoherent_list - Undo incoherence across a list + * @list: The list of pages to release + * @dma_dev: The iommu device + * + * Revert iommu_pages_start_incoherent() across all of the list. Pages that did + * not call or succeed iommu_pages_start_incoherent() will be ignored. + */ +#if IOMMU_PAGES_USE_DMA_API +void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) +{ + struct ioptdesc *cur; + + list_for_each_entry(cur, &list->pages, iopt_freelist_elm) { + struct folio *folio = ioptdesc_folio(cur); + + if (!cur->incoherent) + continue; + dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)), + ioptdesc_mem_size(cur), DMA_TO_DEVICE); + cur->incoherent = 0; + } +} +EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list); + +/** + * iommu_pages_free_incoherent - Free an incoherent page + * @virt: virtual address of the page to be freed. + * @dma_dev: The iommu device + * + * If the page is incoherent it made coherent again then freed. 
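[Editorial aside] The helpers introduced above — iommu_pages_start_incoherent(), iommu_pages_flush_incoherent() and iommu_pages_free_incoherent(), whose body follows — are meant for page-table code running against an IOMMU whose table walker may not snoop CPU caches. A minimal usage sketch under stated assumptions: the my_*() wrappers and the coherent_walk flag are invented for illustration; only the iommu_*() calls come from this patch.

static u64 *my_alloc_table(struct device *iommu_dev, bool coherent_walk)
{
        u64 *table = iommu_alloc_pages_sz(GFP_KERNEL, SZ_4K);

        if (!table)
                return NULL;

        /* For a non-coherent walker, flush/map the zeroed table now */
        if (!coherent_walk &&
            iommu_pages_start_incoherent(table, iommu_dev)) {
                /* start_incoherent failed, the page is still "coherent" */
                iommu_free_pages(table);
                return NULL;
        }
        return table;
}

static void my_set_pte(struct device *iommu_dev, u64 *table,
                       unsigned int idx, u64 pte, bool coherent_walk)
{
        table[idx] = pte;
        /* Push the updated entry out of the CPU cache if needed */
        if (!coherent_walk)
                iommu_pages_flush_incoherent(iommu_dev, table,
                                             idx * sizeof(u64), sizeof(u64));
}

static void my_free_table(struct device *iommu_dev, u64 *table)
{
        /* Undoes any DMA mapping made by start_incoherent, then frees */
        iommu_pages_free_incoherent(table, iommu_dev);
}

On x86 this degrades to clflush-based flushing with no DMA map bookkeeping, while other architectures go through dma_map_single()/dma_sync_single_for_device(), matching the two IOMMU_PAGES_USE_DMA_API branches in iommu-pages.h below.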
+ */ +void iommu_pages_free_incoherent(void *virt, struct device *dma_dev) +{ + struct ioptdesc *iopt = virt_to_ioptdesc(virt); + + if (iopt->incoherent) { + dma_unmap_single(dma_dev, virt_to_phys(virt), + ioptdesc_mem_size(iopt), DMA_TO_DEVICE); + iopt->incoherent = 0; + } + __iommu_free_desc(iopt); +} +EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent); +#endif diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index b3af2813ed0c..ae9da4f571f6 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -21,7 +21,10 @@ struct ioptdesc { struct list_head iopt_freelist_elm; unsigned long __page_mapping; - pgoff_t __index; + union { + u8 incoherent; + pgoff_t __index; + }; void *_private; unsigned int __page_type; @@ -98,4 +101,48 @@ static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size) return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size); } -#endif /* __IOMMU_PAGES_H */ +int iommu_pages_start_incoherent(void *virt, struct device *dma_dev); +int iommu_pages_start_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev); + +#ifdef CONFIG_X86 +#define IOMMU_PAGES_USE_DMA_API 0 +#include <linux/cacheflush.h> + +static inline void iommu_pages_flush_incoherent(struct device *dma_dev, + void *virt, size_t offset, + size_t len) +{ + clflush_cache_range(virt + offset, len); +} +static inline void +iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) +{ + /* + * For performance leave the incoherent flag alone which turns this into + * a NOP. For X86 the rest of the stop/free flow ignores the flag. + */ +} +static inline void iommu_pages_free_incoherent(void *virt, + struct device *dma_dev) +{ + iommu_free_pages(virt); +} +#else +#define IOMMU_PAGES_USE_DMA_API 1 +#include <linux/dma-mapping.h> + +static inline void iommu_pages_flush_incoherent(struct device *dma_dev, + void *virt, size_t offset, + size_t len) +{ + dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len, + DMA_TO_DEVICE); +} +void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev); +void iommu_pages_free_incoherent(void *virt, struct device *dma_dev); +#endif + +#endif /* __IOMMU_PAGES_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 59244c744eab..2ca990dfbb88 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -100,7 +100,7 @@ static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev); + struct device *dev, struct iommu_domain *old); static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, @@ -114,6 +114,7 @@ enum { static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags); static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, @@ -542,8 +543,21 @@ static void iommu_deinit_device(struct device *dev) * Regardless, if a delayed attach never occurred, then the release * should still avoid touching any hardware configuration either. 
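[Editorial aside] The drivers/iommu/iommu.c hunks around here thread the previously attached domain into the attach_dev callback, as already seen for the Intel identity and nested domains earlier in the diff. As an illustrative sketch only — mydrv_attach_dev() and mydrv_replace_translation() are invented names, only the three-argument prototype comes from this series — a driver callback now looks like:

static int mydrv_attach_dev(struct iommu_domain *domain, struct device *dev,
                            struct iommu_domain *old)
{
        /*
         * "old" is the domain the core considers previously attached to
         * this device (NULL for the first or a deferred attach), so a
         * driver no longer needs to read hardware back to know what it
         * is replacing.
         */
        return mydrv_replace_translation(dev, old, domain);
}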
*/ - if (!dev->iommu->attach_deferred && ops->release_domain) - ops->release_domain->ops->attach_dev(ops->release_domain, dev); + if (!dev->iommu->attach_deferred && ops->release_domain) { + struct iommu_domain *release_domain = ops->release_domain; + + /* + * If the device requires direct mappings then it should not + * be parked on a BLOCKED domain during release as that would + * break the direct mappings. + */ + if (dev->iommu->require_direct && ops->identity_domain && + release_domain == ops->blocked_domain) + release_domain = ops->identity_domain; + + release_domain->ops->attach_dev(release_domain, dev, + group->domain); + } if (ops->release_device) ops->release_device(dev); @@ -628,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (group->default_domain) iommu_create_device_direct_mappings(group->default_domain, dev); if (group->domain) { - ret = __iommu_device_set_domain(group, dev, group->domain, 0); + ret = __iommu_device_set_domain(group, dev, group->domain, NULL, + 0); if (ret) goto err_remove_gdev; } else if (!group->default_domain && !group_list) { @@ -2115,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group) } static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { int ret; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; - ret = domain->ops->attach_dev(domain, dev); + ret = domain->ops->attach_dev(domain, dev, old); if (ret) return ret; dev->iommu->attach_deferred = 0; @@ -2171,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { if (dev->iommu && dev->iommu->attach_deferred) - return __iommu_attach_device(domain, dev); + return __iommu_attach_device(domain, dev, NULL); return 0; } @@ -2284,6 +2299,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_group); static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags) { int ret; @@ -2309,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group, dev->iommu->attach_deferred = 0; } - ret = __iommu_attach_device(new_domain, dev); + ret = __iommu_attach_device(new_domain, dev, old_domain); if (ret) { /* * If we have a blocking domain then try to attach that in hopes @@ -2319,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group, if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) && group->blocking_domain && group->blocking_domain != new_domain) - __iommu_attach_device(group->blocking_domain, dev); + __iommu_attach_device(group->blocking_domain, dev, + old_domain); return ret; } return 0; @@ -2366,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, result = 0; for_each_group_device(group, gdev) { ret = __iommu_device_set_domain(group, gdev->dev, new_domain, - flags); + group->domain, flags); if (ret) { result = ret; /* @@ -2391,6 +2408,9 @@ err_revert: */ last_gdev = gdev; for_each_group_device(group, gdev) { + /* No need to revert the last gdev that failed to set domain */ + if (gdev == last_gdev) + break; /* * A NULL domain can happen only for first probe, in which case * we leave group->domain as NULL and let release clean @@ -2398,10 +2418,8 @@ err_revert: */ if (group->domain) WARN_ON(__iommu_device_set_domain( - group, gdev->dev, group->domain, + group, gdev->dev, group->domain, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); - 
if (gdev == last_gdev) - break; } return ret; } diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 2beeb4f60ee5..eae3f03629b0 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -41,6 +41,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + depends on IOMMU_PT_AMDV1 select IOMMUFD_DRIVER default n help diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 6f1010da221c..21d4a35538f6 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -161,8 +161,8 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu, vevent = &veventq->lost_events_header; goto out_set_header; } - memcpy(vevent->event_data, event_data, data_len); vevent->data_len = data_len; + memcpy(vevent->event_data, event_data, data_len); veventq->num_events++; out_set_header: diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index c0360c450880..54cf4d856179 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -8,8 +8,10 @@ * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. */ +#include <linux/dma-buf.h> #include <linux/err.h> #include <linux/errno.h> +#include <linux/file.h> #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/lockdep.h> @@ -284,6 +286,9 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, case IOPT_ADDRESS_FILE: start = elm->start_byte + elm->pages->start; break; + case IOPT_ADDRESS_DMABUF: + start = elm->start_byte + elm->pages->dmabuf.start; + break; } rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) @@ -468,25 +473,53 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, * @iopt: io_pagetable to act on * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains * the chosen iova on output. 
Otherwise is the iova to map to on input - * @file: file to map + * @fd: fdno of a file to map * @start: map file starting at this byte offset * @length: Number of bytes to map * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping * @flags: IOPT_ALLOC_IOVA or zero */ int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, - unsigned long *iova, struct file *file, - unsigned long start, unsigned long length, - int iommu_prot, unsigned int flags) + unsigned long *iova, int fd, unsigned long start, + unsigned long length, int iommu_prot, + unsigned int flags) { struct iopt_pages *pages; + struct dma_buf *dmabuf; + unsigned long start_byte; + unsigned long last; + + if (!length) + return -EINVAL; + if (check_add_overflow(start, length - 1, &last)) + return -EOVERFLOW; + + start_byte = start - ALIGN_DOWN(start, PAGE_SIZE); + dmabuf = dma_buf_get(fd); + if (!IS_ERR(dmabuf)) { + pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start, + length, + iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) { + dma_buf_put(dmabuf); + return PTR_ERR(pages); + } + } else { + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; + + pages = iopt_alloc_file_pages(file, start_byte, start, length, + iommu_prot & IOMMU_WRITE); + fput(file); + if (IS_ERR(pages)) + return PTR_ERR(pages); + } - pages = iopt_alloc_file_pages(file, start, length, - iommu_prot & IOMMU_WRITE); - if (IS_ERR(pages)) - return PTR_ERR(pages); return iopt_map_common(ictx, iopt, pages, iova, length, - start - pages->start, iommu_prot, flags); + start_byte, iommu_prot, flags); } struct iova_bitmap_fn_arg { @@ -707,7 +740,8 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, struct iopt_area *area; unsigned long unmapped_bytes = 0; unsigned int tries = 0; - int rc = -ENOENT; + /* If there are no mapped entries then success */ + int rc = 0; /* * The domains_rwsem must be held in read mode any time any area->pages @@ -777,8 +811,6 @@ again: down_write(&iopt->iova_rwsem); } - if (unmapped_bytes) - rc = 0; out_unlock_iova: up_write(&iopt->iova_rwsem); @@ -815,13 +847,8 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) { - int rc; - - rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); /* If the IOVAs are empty then unmap all succeeds */ - if (rc == -ENOENT) - return 0; - return rc; + return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); } /* The caller must always free all the nodes in the allowed_iova rb_root. 
*/ @@ -967,9 +994,15 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(!area->storage_domain); if (area->storage_domain == domain) area->storage_domain = storage_domain; + if (iopt_is_dmabuf(pages)) { + if (!iopt_dmabuf_revoked(pages)) + iopt_area_unmap_domain(area, domain); + iopt_dmabuf_untrack_domain(pages, area, domain); + } mutex_unlock(&pages->mutex); - iopt_area_unmap_domain(area, domain); + if (!iopt_is_dmabuf(pages)) + iopt_area_unmap_domain(area, domain); } return; } @@ -986,6 +1019,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(area->storage_domain != domain); area->storage_domain = NULL; iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } } @@ -1015,10 +1050,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt, if (!pages) continue; - mutex_lock(&pages->mutex); + guard(mutex)(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto out_unfill; + } rc = iopt_area_fill_domain(area, domain); if (rc) { - mutex_unlock(&pages->mutex); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); goto out_unfill; } if (!area->storage_domain) { @@ -1027,7 +1068,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt, interval_tree_insert(&area->pages_node, &pages->domains_itree); } - mutex_unlock(&pages->mutex); } return 0; @@ -1048,6 +1088,8 @@ out_unfill: area->storage_domain = NULL; } iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } return rc; @@ -1258,6 +1300,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) if (!pages || area->prevent_access) return -EBUSY; + /* Maintaining the domains_itree below is a bit complicated */ + if (iopt_is_dmabuf(pages)) + return -EOPNOTSUPP; + if (new_start & (alignment - 1) || iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index b6064f4ce4af..14cd052fd320 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -5,6 +5,7 @@ #ifndef __IO_PAGETABLE_H #define __IO_PAGETABLE_H +#include <linux/dma-buf.h> #include <linux/interval_tree.h> #include <linux/kref.h> #include <linux/mutex.h> @@ -69,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain); +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain); +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain); +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages); +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages); + static inline unsigned long iopt_area_index(struct iopt_area *area) { return area->pages_node.start; @@ -179,7 +190,22 @@ enum { enum iopt_address_type { IOPT_ADDRESS_USER = 0, - IOPT_ADDRESS_FILE = 1, + IOPT_ADDRESS_FILE, + IOPT_ADDRESS_DMABUF, +}; + +struct iopt_pages_dmabuf_track { + struct iommu_domain *domain; + struct iopt_area *area; + struct list_head elm; +}; + +struct iopt_pages_dmabuf { + struct dma_buf_attachment *attach; + struct dma_buf_phys_vec phys; + /* Always 
PAGE_SIZE aligned */ + unsigned long start; + struct list_head tracker; }; /* @@ -209,6 +235,8 @@ struct iopt_pages { struct file *file; unsigned long start; }; + /* IOPT_ADDRESS_DMABUF */ + struct iopt_pages_dmabuf dmabuf; }; bool writable:1; u8 account_mode; @@ -220,10 +248,32 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; +static inline bool iopt_is_dmabuf(struct iopt_pages *pages) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return false; + return pages->type == IOPT_ADDRESS_DMABUF; +} + +static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages) +{ + lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) + return pages->dmabuf.phys.len == 0; + return false; +} + struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, unsigned long length, bool writable); -struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 1542c5fd10a8..f4721afedadc 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -207,7 +207,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) unsigned long iova = cmd->iova; struct iommufd_ioas *ioas; unsigned int flags = 0; - struct file *file; int rc; if (cmd->flags & @@ -229,11 +228,7 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) flags = IOPT_ALLOC_IOVA; - file = fget(cmd->fd); - if (!file) - return -EBADF; - - rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file, + rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd, cmd->start, cmd->length, conv_iommu_prot(cmd->flags), flags); if (rc) @@ -243,7 +238,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: iommufd_put_object(ucmd->ictx, &ioas->obj); - fput(file); return rc; } @@ -367,6 +361,10 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) &unmapped); if (rc) goto out_put; + if (!unmapped) { + rc = -ENOENT; + goto out_put; + } } cmd->length = unmapped; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 627f9b78483a..eb6d1a70f673 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 +19,8 @@ struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; +struct dma_buf_attachment; +struct dma_buf_phys_vec; struct iommufd_sw_msi_map { struct list_head sw_msi_item; @@ -108,7 +110,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, - unsigned long *iova, struct file *file, + unsigned long *iova, int fd, unsigned long start, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, @@ -504,6 +506,8 @@ void iommufd_device_pre_destroy(struct iommufd_object *obj); void iommufd_device_destroy(struct iommufd_object *obj); 
int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); +struct device *iommufd_global_device(void); + struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; @@ -614,7 +618,6 @@ struct iommufd_veventq { struct iommufd_eventq common; struct iommufd_viommu *viommu; struct list_head node; /* for iommufd_viommu::veventqs */ - struct iommufd_vevent lost_events_header; enum iommu_veventq_type type; unsigned int depth; @@ -622,6 +625,9 @@ struct iommufd_veventq { /* Use common.lock for protection */ u32 num_events; u32 sequence; + + /* Must be last as it ends in a flexible-array member. */ + struct iommufd_vevent lost_events_header; }; static inline struct iommufd_veventq * @@ -711,6 +717,8 @@ bool iommufd_should_fail(void); int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys); #else static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, @@ -732,5 +740,11 @@ static inline bool iommufd_selftest_is_mock_dev(struct device *dev) { return false; } +static inline int +iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + return -EOPNOTSUPP; +} #endif #endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 8fc618b2bcf9..73e73e1ec158 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -29,11 +29,22 @@ enum { IOMMU_TEST_OP_PASID_REPLACE, IOMMU_TEST_OP_PASID_DETACH, IOMMU_TEST_OP_PASID_CHECK_HWPT, + IOMMU_TEST_OP_DMABUF_GET, + IOMMU_TEST_OP_DMABUF_REVOKE, }; enum { + MOCK_IOMMUPT_DEFAULT = 0, + MOCK_IOMMUPT_HUGE, + MOCK_IOMMUPT_AMDV1, +}; + +/* These values are true for MOCK_IOMMUPT_DEFAULT */ +enum { MOCK_APERTURE_START = 1UL << 24, MOCK_APERTURE_LAST = (1UL << 31) - 1, + MOCK_PAGE_SIZE = 2048, + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE, }; enum { @@ -52,7 +63,6 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, - MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, MOCK_FLAGS_DEVICE_PASID = 1 << 2, }; @@ -176,6 +186,14 @@ struct iommu_test_cmd { __u32 hwpt_id; /* @id is stdev_id */ } pasid_check; + struct { + __u32 length; + __u32 open_flags; + } dmabuf_get; + struct { + __s32 dmabuf_fd; + __u32 revoked; + } dmabuf_revoke; }; __u32 last; }; @@ -205,6 +223,7 @@ struct iommu_test_hw_info { */ struct iommu_hwpt_selftest { __u32 iotlb; + __u32 pagetable_type; }; /* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */ diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 4514575818fc..b5b67a9d3fb3 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -130,9 +130,8 @@ struct iova_bitmap { static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap, unsigned long iova) { - unsigned long pgsize = 1UL << bitmap->mapped.pgshift; - - return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize); + return (iova >> bitmap->mapped.pgshift) / + BITS_PER_TYPE(*bitmap->bitmap); } /* diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ce775fbbae94..5cc4b08c25f5 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -751,6 +751,15 @@ static struct miscdevice vfio_misc_dev = { .mode = 0666, }; +/* + * Used only by DMABUF, returns a valid struct device to use as a dummy struct + * device 
for attachment. + */ +struct device *iommufd_global_device(void) +{ + return iommu_misc_dev.this_device; +} + static int __init iommufd_init(void) { int ret; @@ -794,5 +803,6 @@ MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); MODULE_IMPORT_NS("IOMMUFD"); +MODULE_IMPORT_NS("DMA_BUF"); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index c3433b845561..dbe51ecb9a20 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,6 +45,8 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> #include <linux/file.h> #include <linux/highmem.h> #include <linux/iommu.h> @@ -53,6 +55,7 @@ #include <linux/overflow.h> #include <linux/slab.h> #include <linux/sched/mm.h> +#include <linux/vfio_pci_core.h> #include "double_span.h" #include "io_pagetable.h" @@ -258,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, return container_of(node, struct iopt_area, pages_node); } +enum batch_kind { + BATCH_CPU_MEMORY = 0, + BATCH_MMIO, +}; + /* * A simple datastructure to hold a vector of PFNs, optimized for contiguous * PFNs. This is used as a temporary holding memory for shuttling pfns from one @@ -271,7 +279,9 @@ struct pfn_batch { unsigned int array_size; unsigned int end; unsigned int total_pfns; + enum batch_kind kind; }; +enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) }; static void batch_clear(struct pfn_batch *batch) { @@ -348,11 +358,17 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) } static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, - u32 nr) + u32 nr, enum batch_kind kind) { - const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); unsigned int end = batch->end; + if (batch->kind != kind) { + /* One kind per batch */ + if (batch->end != 0) + return false; + batch->kind = kind; + } + if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && nr <= MAX_NPFNS - batch->npfns[end - 1]) { batch->npfns[end - 1] += nr; @@ -379,7 +395,7 @@ static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) /* true if the pfn was added, false otherwise */ static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) { - return batch_add_pfn_num(batch, pfn, 1); + return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY); } /* @@ -492,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, { bool disable_large_pages = area->iopt->disable_large_pages; unsigned long last_iova = iopt_area_last_iova(area); + int iommu_prot = area->iommu_prot; unsigned int page_offset = 0; unsigned long start_iova; unsigned long next_iova; @@ -499,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, unsigned long iova; int rc; + if (batch->kind == BATCH_MMIO) { + iommu_prot &= ~IOMMU_CACHE; + iommu_prot |= IOMMU_MMIO; + } + /* The first index might be a partial page */ if (start_index == iopt_area_index(area)) page_offset = area->page_offset; @@ -512,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, rc = batch_iommu_map_small( domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot); + next_iova - iova, iommu_prot); else rc = 
iommu_map(domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot, + next_iova - iova, iommu_prot, GFP_KERNEL_ACCOUNT); if (rc) goto err_unmap; @@ -652,7 +674,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, nr = min(nr, npages); npages -= nr; - if (!batch_add_pfn_num(batch, pfn, nr)) + if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY)) break; if (nr > 1) { rc = folio_add_pins(folio, nr - 1); @@ -1054,6 +1076,41 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, return iopt_pages_update_pinned(pages, npages, inc, user); } +struct pfn_reader_dmabuf { + struct dma_buf_phys_vec phys; + unsigned long start_offset; +}; + +static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf, + struct iopt_pages *pages) +{ + /* Callers must not get here if the dmabuf was already revoked */ + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return -EINVAL; + + dmabuf->phys = pages->dmabuf.phys; + dmabuf->start_offset = pages->dmabuf.start; + return 0; +} + +static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf, + struct pfn_batch *batch, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE; + + /* + * start/last_index and start are all PAGE_SIZE aligned, the batch is + * always filled using page size aligned PFNs just like the other types. + * If the dmabuf has been sliced on a sub page offset then the common + * batch to domain code will adjust it before mapping to the domain. + */ + batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start), + last_index - start_index + 1, BATCH_MMIO); + return 0; +} + /* * PFNs are stored in three places, in order of preference: * - The iopt_pages xarray. 
This is only populated if there is a @@ -1072,7 +1129,10 @@ struct pfn_reader { unsigned long batch_end_index; unsigned long last_index; - struct pfn_reader_user user; + union { + struct pfn_reader_user user; + struct pfn_reader_dmabuf dmabuf; + }; }; static int pfn_reader_update_pinned(struct pfn_reader *pfns) @@ -1108,7 +1168,7 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; - struct pfn_reader_user *user = &pfns->user; + struct pfn_reader_user *user; unsigned long npages; struct iopt_area *area; int rc; @@ -1140,8 +1200,13 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) return 0; } - if (start_index >= pfns->user.upages_end) { - rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + if (iopt_is_dmabuf(pfns->pages)) + return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch, + start_index, span->last_hole); + + user = &pfns->user; + if (start_index >= user->upages_end) { + rc = pfn_reader_user_pin(user, pfns->pages, start_index, span->last_hole); if (rc) return rc; @@ -1209,7 +1274,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, pfns->batch_start_index = start_index; pfns->batch_end_index = start_index; pfns->last_index = last_index; - pfn_reader_user_init(&pfns->user, pages); + if (iopt_is_dmabuf(pages)) + pfn_reader_dmabuf_init(&pfns->dmabuf, pages); + else + pfn_reader_user_init(&pfns->user, pages); rc = batch_init(&pfns->batch, last_index - start_index + 1); if (rc) return rc; @@ -1230,8 +1298,12 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; - struct pfn_reader_user *user = &pfns->user; + struct pfn_reader_user *user; + + if (iopt_is_dmabuf(pages)) + return; + user = &pfns->user; if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ @@ -1261,7 +1333,8 @@ static void pfn_reader_destroy(struct pfn_reader *pfns) struct iopt_pages *pages = pfns->pages; pfn_reader_release_pins(pfns); - pfn_reader_user_destroy(&pfns->user, pfns->pages); + if (!iopt_is_dmabuf(pfns->pages)) + pfn_reader_user_destroy(&pfns->user, pfns->pages); batch_destroy(&pfns->batch, NULL); WARN_ON(pages->last_npinned != pages->npinned); } @@ -1340,26 +1413,234 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, return pages; } -struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, unsigned long length, bool writable) { struct iopt_pages *pages; - unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE); - unsigned long end; - if (length && check_add_overflow(start, length - 1, &end)) - return ERR_PTR(-EOVERFLOW); - - pages = iopt_alloc_pages(start - start_down, length, writable); + pages = iopt_alloc_pages(start_byte, length, writable); if (IS_ERR(pages)) return pages; pages->file = get_file(file); - pages->start = start_down; + pages->start = start - start_byte; pages->type = IOPT_ADDRESS_FILE; return pages; } +static void iopt_revoke_notify(struct dma_buf_attachment *attach) +{ + struct iopt_pages *pages = attach->importer_priv; + struct iopt_pages_dmabuf_track *track; + + guard(mutex)(&pages->mutex); + if (iopt_dmabuf_revoked(pages)) + return; + + list_for_each_entry(track, &pages->dmabuf.tracker, 
elm) { + struct iopt_area *area = track->area; + + iopt_area_unmap_domain_range(area, track->domain, + iopt_area_index(area), + iopt_area_last_index(area)); + } + pages->dmabuf.phys.len = 0; +} + +static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = { + .allow_peer2peer = true, + .move_notify = iopt_revoke_notify, +}; + +/* + * iommufd and vfio have a circular dependency. Future work for a phys + * based private interconnect will remove this. + */ +static int +sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + typeof(&vfio_pci_dma_buf_iommufd_map) fn; + int rc; + + rc = iommufd_test_dma_buf_iommufd_map(attachment, phys); + if (rc != -EOPNOTSUPP) + return rc; + + if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)) + return -EOPNOTSUPP; + + fn = symbol_get(vfio_pci_dma_buf_iommufd_map); + if (!fn) + return -EOPNOTSUPP; + rc = fn(attachment, phys); + symbol_put(vfio_pci_dma_buf_iommufd_map); + return rc; +} + +static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages, + struct dma_buf *dmabuf) +{ + struct dma_buf_attachment *attach; + int rc; + + attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(), + &iopt_dmabuf_attach_revoke_ops, pages); + if (IS_ERR(attach)) + return PTR_ERR(attach); + + dma_resv_lock(dmabuf->resv, NULL); + /* + * Lock ordering requires the mutex to be taken inside the reservation, + * make sure lockdep sees this. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + mutex_lock(&pages->mutex); + mutex_unlock(&pages->mutex); + } + + rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys); + if (rc) + goto err_detach; + + dma_resv_unlock(dmabuf->resv); + + /* On success iopt_release_pages() will detach and put the dmabuf. */ + pages->dmabuf.attach = attach; + return 0; + +err_detach: + dma_resv_unlock(dmabuf->resv); + dma_buf_detach(dmabuf, attach); + return rc; +} + +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable) +{ + static struct lock_class_key pages_dmabuf_mutex_key; + struct iopt_pages *pages; + int rc; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return ERR_PTR(-EOPNOTSUPP); + + if (dmabuf->size <= (start + length - 1) || + length / PAGE_SIZE >= MAX_NPFNS) + return ERR_PTR(-EINVAL); + + pages = iopt_alloc_pages(start_byte, length, writable); + if (IS_ERR(pages)) + return pages; + + /* + * The mmap_lock can be held when obtaining the dmabuf reservation lock + * which creates a locking cycle with the pages mutex which is held + * while obtaining the mmap_lock. This locking path is not present for + * IOPT_ADDRESS_DMABUF so split the lock class. + */ + lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key); + + /* dmabuf does not use pinned page accounting. 
*/ + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + pages->type = IOPT_ADDRESS_DMABUF; + pages->dmabuf.start = start - start_byte; + INIT_LIST_HEAD(&pages->dmabuf.tracker); + + rc = iopt_map_dmabuf(ictx, pages, dmabuf); + if (rc) { + iopt_put_pages(pages); + return ERR_PTR(rc); + } + + return pages; +} + +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + if (WARN_ON(!iopt_is_dmabuf(pages))) + return -EINVAL; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->domain == domain && track->area == area)) + return -EINVAL; + + track = kzalloc(sizeof(*track), GFP_KERNEL); + if (!track) + return -ENOMEM; + track->domain = domain; + track->area = area; + list_add_tail(&track->elm, &pages->dmabuf.tracker); + + return 0; +} + +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + WARN_ON(!iopt_is_dmabuf(pages)); + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + if (track->domain == domain && track->area == area) { + list_del(&track->elm); + kfree(track); + return; + } + } + WARN_ON(true); +} + +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iommu_domain *domain; + unsigned long index; + int rc; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->area == area)) + return -EINVAL; + + xa_for_each(&area->iopt->domains, index, domain) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto err_untrack; + } + return 0; +err_untrack: + iopt_dmabuf_untrack_all_domains(area, pages); + return rc; +} + +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iopt_pages_dmabuf_track *tmp; + + list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker, + elm) { + if (track->area == area) { + list_del(&track->elm); + kfree(track); + } + } +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1372,8 +1653,15 @@ void iopt_release_pages(struct kref *kref) mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); - if (pages->type == IOPT_ADDRESS_FILE) + if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) { + struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf; + + dma_buf_detach(dmabuf, pages->dmabuf.attach); + dma_buf_put(dmabuf); + WARN_ON(!list_empty(&pages->dmabuf.tracker)); + } else if (pages->type == IOPT_ADDRESS_FILE) { fput(pages->file); + } kfree(pages); } @@ -1451,6 +1739,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area, lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return; + iopt_area_unmap_domain_range(area, domain, start_index, + last_index); + return; + } + /* * For security we must not unpin something that is still DMA mapped, * so this must unmap any IOVA before we go ahead and unpin the pages. 
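
A brief illustration of the two lockdep idioms the dmabuf paths above depend on: iopt_alloc_dmabuf_pages() moves pages->mutex into a separate lock class with lockdep_set_class() because the mmap_lock ordering that applies to the other address types never occurs for IOPT_ADDRESS_DMABUF, and iopt_map_dmabuf() briefly takes and releases the mutex while holding the dma-buf reservation purely so lockdep records the reservation -> mutex ordering. The sketch below is illustrative only; the importer/prime_lock_order names and the plain mutex standing in for the dma_resv lock are assumptions, not code from this patch.

#include <linux/lockdep.h>
#include <linux/mutex.h>

struct importer {
	struct mutex mutex;	/* normally nests inside mmap_lock users */
};

static struct lock_class_key importer_dmabuf_mutex_key;

static void importer_init_dmabuf(struct importer *imp)
{
	mutex_init(&imp->mutex);
	/* Split the lock class so dmabuf-only ordering rules stay separate */
	lockdep_set_class(&imp->mutex, &importer_dmabuf_mutex_key);
}

static void prime_lock_order(struct importer *imp, struct mutex *resv_like)
{
	mutex_lock(resv_like);
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		/* Record the resv_like -> imp->mutex ordering even though
		 * this path never needs the inner lock for real work.
		 */
		mutex_lock(&imp->mutex);
		mutex_unlock(&imp->mutex);
	}
	mutex_unlock(resv_like);
}
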
@@ -1526,6 +1822,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain) { + if (iopt_dmabuf_revoked(pages)) + return; + __iopt_area_unfill_domain(area, pages, domain, iopt_area_last_index(area)); } @@ -1546,6 +1845,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) lockdep_assert_held(&area->pages->mutex); + if (iopt_dmabuf_revoked(area->pages)) + return 0; + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), iopt_area_last_index(area)); if (rc) @@ -1605,33 +1907,44 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) return 0; mutex_lock(&pages->mutex); - rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), - iopt_area_last_index(area)); - if (rc) - goto out_unlock; + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_all_domains(area, pages); + if (rc) + goto out_unlock; + } - while (!pfn_reader_done(&pfns)) { - done_first_end_index = pfns.batch_end_index; - done_all_end_index = pfns.batch_start_index; - xa_for_each(&area->iopt->domains, index, domain) { - rc = batch_to_domain(&pfns.batch, domain, area, - pfns.batch_start_index); + if (!iopt_dmabuf_revoked(pages)) { + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_untrack; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; + + rc = pfn_reader_next(&pfns); if (rc) goto out_unmap; } - done_all_end_index = done_first_end_index; - - rc = pfn_reader_next(&pfns); + rc = pfn_reader_update_pinned(&pfns); if (rc) goto out_unmap; + + pfn_reader_destroy(&pfns); } - rc = pfn_reader_update_pinned(&pfns); - if (rc) - goto out_unmap; area->storage_domain = xa_load(&area->iopt->domains, 0); interval_tree_insert(&area->pages_node, &pages->domains_itree); - goto out_destroy; + mutex_unlock(&pages->mutex); + return 0; out_unmap: pfn_reader_release_pins(&pfns); @@ -1658,8 +1971,10 @@ out_unmap: end_index); } } -out_destroy: pfn_reader_destroy(&pfns); +out_untrack: + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); out_unlock: mutex_unlock(&pages->mutex); return rc; @@ -1685,16 +2000,22 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) if (!area->storage_domain) goto out_unlock; - xa_for_each(&iopt->domains, index, domain) - if (domain != area->storage_domain) + xa_for_each(&iopt->domains, index, domain) { + if (domain == area->storage_domain) + continue; + + if (!iopt_dmabuf_revoked(pages)) iopt_area_unmap_domain_range( area, domain, iopt_area_index(area), iopt_area_last_index(area)); + } if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); interval_tree_remove(&area->pages_node, &pages->domains_itree); iopt_area_unfill_domain(area, pages, area->storage_domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); area->storage_domain = NULL; out_unlock: mutex_unlock(&pages->mutex); @@ -2031,15 +2352,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; - if (pages->type == 
IOPT_ADDRESS_FILE) + if (iopt_is_dmabuf(pages)) + return -EINVAL; + + if (pages->type != IOPT_ADDRESS_USER) return iopt_pages_rw_slow(pages, start_index, last_index, start_byte % PAGE_SIZE, data, length, flags); - if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && - WARN_ON(pages->type != IOPT_ADDRESS_USER)) - return -EINVAL; - if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index de178827a078..c4322fd26f93 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -5,6 +5,8 @@ */ #include <linux/anon_inodes.h> #include <linux/debugfs.h> +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> #include <linux/fault-inject.h> #include <linux/file.h> #include <linux/iommu.h> @@ -12,6 +14,8 @@ #include <linux/slab.h> #include <linux/xarray.h> #include <uapi/linux/iommufd.h> +#include <linux/generic_pt/iommu.h> +#include "../iommu-pages.h" #include "../iommu-priv.h" #include "io_pagetable.h" @@ -41,21 +45,6 @@ static DEFINE_IDA(mock_dev_ida); enum { MOCK_DIRTY_TRACK = 1, - MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, - MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, - - /* - * Like a real page table alignment requires the low bits of the address - * to be zero. xarray also requires the high bit to be zero, so we store - * the pfns shifted. The upper bits are used for metadata. - */ - MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, - - _MOCK_PFN_START = MOCK_PFN_MASK + 1, - MOCK_PFN_START_IOVA = _MOCK_PFN_START, - MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, - MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, - MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain); @@ -124,10 +113,15 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, } struct mock_iommu_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + }; unsigned long flags; - struct iommu_domain domain; - struct xarray pfns; }; +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain); static inline struct mock_iommu_domain * to_mock_domain(struct iommu_domain *domain) @@ -216,7 +210,7 @@ static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj) } static int mock_domain_nop_attach(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mock_dev *mdev = to_mock_dev(dev); struct mock_viommu *new_viommu = NULL; @@ -344,74 +338,6 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, - unsigned long iova, size_t page_size, - unsigned long flags) -{ - unsigned long cur, end = iova + page_size - 1; - bool dirty = false; - void *ent, *old; - - for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); - if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) - continue; - - dirty = true; - /* Clear dirty */ - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { - unsigned long val; - - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - } - } - - return dirty; -} - -static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long 
iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct mock_iommu_domain *mock = to_mock_domain(domain); - unsigned long end = iova + size; - void *ent; - - if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) - return -EINVAL; - - do { - unsigned long pgsize = MOCK_IO_PAGE_SIZE; - unsigned long head; - - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent) { - iova += pgsize; - continue; - } - - if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) - pgsize = MOCK_HUGE_PAGE_SIZE; - head = iova & ~(pgsize - 1); - - /* Clear dirty */ - if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -static const struct iommu_dirty_ops dirty_ops = { - .set_dirty_tracking = mock_domain_set_dirty_tracking, - .read_and_clear_dirty = mock_domain_read_and_clear_dirty, -}; - static struct mock_iommu_domain_nested * __mock_domain_alloc_nested(const struct iommu_user_data *user_data) { @@ -446,7 +372,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); - if (!parent || parent->ops != mock_ops.default_domain_ops) + if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING)) return ERR_PTR(-EINVAL); mock_parent = to_mock_domain(parent); @@ -459,159 +385,170 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, return &mock_nested->domain; } -static struct iommu_domain * -mock_domain_alloc_paging_flags(struct device *dev, u32 flags, - const struct iommu_user_data *user_data) -{ - bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_PASID; - struct mock_dev *mdev = to_mock_dev(dev); - bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; - struct mock_iommu_domain *mock; - - if (user_data) - return ERR_PTR(-EOPNOTSUPP); - if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) - return ERR_PTR(-EOPNOTSUPP); - - mock = kzalloc(sizeof(*mock), GFP_KERNEL); - if (!mock) - return ERR_PTR(-ENOMEM); - mock->domain.geometry.aperture_start = MOCK_APERTURE_START; - mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; - mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; - if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) - mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; - mock->domain.ops = mock_ops.default_domain_ops; - mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - xa_init(&mock->pfns); - - if (has_dirty_flag) - mock->domain.dirty_ops = &dirty_ops; - return &mock->domain; -} - static void mock_domain_free(struct iommu_domain *domain) { struct mock_iommu_domain *mock = to_mock_domain(domain); - WARN_ON(!xa_empty(&mock->pfns)); + pt_iommu_deinit(&mock->iommu); kfree(mock); } -static int mock_domain_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, int prot, - gfp_t gfp, size_t *mapped) +static void mock_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - unsigned long flags = MOCK_PFN_START_IOVA; - unsigned long start_iova = iova; + iommu_put_pages_list(&gather->freelist); +} - /* - * xarray does not reliably work with fault injection because it does a - * retry allocation, so put our own failure point. 
- */ - if (iommufd_should_fail()) - return -ENOENT; +static const struct iommu_domain_ops amdv1_mock_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); - for (; pgcount; pgcount--) { - size_t cur; +static const struct iommu_domain_ops amdv1_mock_huge_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; +#undef pt_iommu_amdv1_mock_map_pages - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - void *old; +static const struct iommu_dirty_ops amdv1_mock_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1_mock), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; - if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - flags = MOCK_PFN_LAST_IOVA; - if (pgsize != MOCK_IO_PAGE_SIZE) { - flags |= MOCK_PFN_HUGE_IOVA; - } - old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, - xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | - flags), - gfp); - if (xa_is_err(old)) { - for (; start_iova != iova; - start_iova += MOCK_IO_PAGE_SIZE) - xa_erase(&mock->pfns, - start_iova / - MOCK_IO_PAGE_SIZE); - return xa_err(old); - } - WARN_ON(old); - iova += MOCK_IO_PAGE_SIZE; - paddr += MOCK_IO_PAGE_SIZE; - *mapped += MOCK_IO_PAGE_SIZE; - flags = 0; - } - } - return 0; -} +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; -static size_t mock_domain_unmap_pages(struct iommu_domain *domain, - unsigned long iova, size_t pgsize, - size_t pgcount, - struct iommu_iotlb_gather *iotlb_gather) +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; + +static struct mock_iommu_domain * +mock_domain_alloc_pgtable(struct device *dev, + const struct iommu_hwpt_selftest *user_cfg, u32 flags) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - bool first = true; - size_t ret = 0; - void *ent; + struct mock_iommu_domain *mock; + int rc; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return ERR_PTR(-ENOMEM); + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - for (; pgcount; pgcount--) { - size_t cur; + mock->amdv1.iommu.nid = NUMA_NO_NODE; + + switch (user_cfg->pagetable_type) { + case MOCK_IOMMUPT_DEFAULT: + case MOCK_IOMMUPT_HUGE: { + struct pt_iommu_amdv1_cfg cfg = {}; + + /* The mock version has a 2k page size */ + cfg.common.hw_max_vasz_lg2 = 56; + cfg.common.hw_max_oasz_lg2 = 51; + cfg.starting_level = 2; + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.ops = &amdv1_mock_huge_ops; + else + mock->domain.ops = &amdv1_mock_ops; + rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + + /* + * In huge mode userspace should only provide huge pages, we + * have to include PAGE_SIZE for the domain to be accepted by + * iommufd. 
+ */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE | + PAGE_SIZE; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_mock_dirty_ops; + break; + } - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + case MOCK_IOMMUPT_AMDV1: { + struct pt_iommu_amdv1_cfg cfg = {}; + + cfg.common.hw_max_vasz_lg2 = 64; + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); + cfg.starting_level = 2; + mock->domain.ops = &amdv1_ops; + rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_dirty_ops; + break; + } + default: + rc = -EOPNOTSUPP; + goto err_free; + } - /* - * iommufd generates unmaps that must be a strict - * superset of the map's performend So every - * starting/ending IOVA should have been an iova passed - * to map. - * - * This simple logic doesn't work when the HUGE_PAGE is - * turned on since the core code will automatically - * switch between the two page sizes creating a break in - * the unmap calls. The break can land in the middle of - * contiguous IOVA. - */ - if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) { - if (first) { - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); - first = false; - } - if (pgcount == 1 && - cur + MOCK_IO_PAGE_SIZE == pgsize) - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); - } + /* + * Override the real aperture to the MOCK aperture for test purposes. + */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) { + WARN_ON(mock->domain.geometry.aperture_start != 0); + WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST); - iova += MOCK_IO_PAGE_SIZE; - ret += MOCK_IO_PAGE_SIZE; - } + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; } - return ret; + + return mock; +err_free: + kfree(mock); + return ERR_PTR(rc); } -static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) +static struct iommu_domain * +mock_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - void *ent; + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID; + struct mock_dev *mdev = to_mock_dev(dev); + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_hwpt_selftest user_cfg = {}; + struct mock_iommu_domain *mock; + int rc; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - WARN_ON(!ent); - return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; + if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + + if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST && + user_data->type != IOMMU_HWPT_DATA_NONE)) + return ERR_PTR(-EOPNOTSUPP); + + if (user_data) { + rc = iommu_copy_struct_from_user( + &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); + } + + mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags); + if (IS_ERR(mock)) + return ERR_CAST(mock); + return &mock->domain; } static bool 
mock_domain_capable(struct device *dev, enum iommu_cap cap) @@ -955,15 +892,6 @@ static const struct iommu_ops mock_ops = { .user_pasid_table = true, .get_viommu_size = mock_get_viommu_size, .viommu_init = mock_viommu_init, - .default_domain_ops = - &(struct iommu_domain_ops){ - .free = mock_domain_free, - .attach_dev = mock_domain_nop_attach, - .map_pages = mock_domain_map_pages, - .unmap_pages = mock_domain_unmap_pages, - .iova_to_phys = mock_domain_iova_to_phys, - .set_dev_pasid = mock_domain_set_dev_pasid_nop, - }, }; static void mock_domain_free_nested(struct iommu_domain *domain) @@ -1047,7 +975,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, if (IS_ERR(hwpt)) return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || - hwpt->domain->ops != mock_ops.default_domain_ops) { + hwpt->domain->owner != &mock_ops) { iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } @@ -1088,7 +1016,6 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) {}, }; const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY | - MOCK_FLAGS_DEVICE_HUGE_IOVA | MOCK_FLAGS_DEVICE_PASID; struct mock_dev *mdev; int rc, i; @@ -1277,23 +1204,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, { struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; + unsigned int page_size; uintptr_t end; int rc; - if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || - (uintptr_t)uptr % MOCK_IO_PAGE_SIZE || - check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) - return -EINVAL; - hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - for (; length; length -= MOCK_IO_PAGE_SIZE) { + page_size = 1 << __ffs(mock->domain.pgsize_bitmap); + if (iova % page_size || length % page_size || + (uintptr_t)uptr % page_size || + check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) + return -EINVAL; + + for (; length; length -= page_size) { struct page *pages[1]; + phys_addr_t io_phys; unsigned long pfn; long npages; - void *ent; npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, pages); @@ -1308,15 +1237,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, pfn = page_to_pfn(pages[0]); put_page(pages[0]); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent || - (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != - pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova); + if (io_phys != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { rc = -EINVAL; goto out_put; } - iova += MOCK_IO_PAGE_SIZE; - uptr += MOCK_IO_PAGE_SIZE; + iova += page_size; + uptr += page_size; } rc = 0; @@ -1795,7 +1723,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - if (!(mock->flags & MOCK_DIRTY_TRACK)) { + if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) { rc = -EINVAL; goto out_put; } @@ -1814,22 +1742,10 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, } for (i = 0; i < max; i++) { - unsigned long cur = iova + i * page_size; - void *ent, *old; - if (!test_bit(i, (unsigned long *)tmp)) continue; - - ent = xa_load(&mock->pfns, cur / page_size); - if (ent) { - unsigned long val; - - val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / page_size, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - count++; - } + mock->iommu.ops->set_dirty(&mock->iommu, iova + i * 
page_size); + count++; } cmd->dirty.out_nr_dirty = count; @@ -2031,6 +1947,140 @@ void iommufd_selftest_destroy(struct iommufd_object *obj) } } +struct iommufd_test_dma_buf { + void *memory; + size_t length; + bool revoked; +}; + +static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + return 0; +} + +static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ +} + +static struct sg_table * +iommufd_test_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ +} + +static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf) +{ + struct iommufd_test_dma_buf *priv = dmabuf->priv; + + kfree(priv->memory); + kfree(priv); +} + +static const struct dma_buf_ops iommufd_test_dmabuf_ops = { + .attach = iommufd_test_dma_buf_attach, + .detach = iommufd_test_dma_buf_detach, + .map_dma_buf = iommufd_test_dma_buf_map, + .release = iommufd_test_dma_buf_release, + .unmap_dma_buf = iommufd_test_dma_buf_unmap, +}; + +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(attachment->dmabuf->resv); + + if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + phys->paddr = virt_to_phys(priv->memory); + phys->len = priv->length; + return 0; +} + +static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd, + unsigned int open_flags, + size_t len) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc; + + len = ALIGN(len, PAGE_SIZE); + if (len == 0 || len > PAGE_SIZE * 512) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->length = len; + priv->memory = kzalloc(len, GFP_KERNEL); + if (!priv->memory) { + rc = -ENOMEM; + goto err_free; + } + + exp_info.ops = &iommufd_test_dmabuf_ops; + exp_info.size = len; + exp_info.flags = open_flags; + exp_info.priv = priv; + + dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(dmabuf)) { + rc = PTR_ERR(dmabuf); + goto err_free; + } + + return dma_buf_fd(dmabuf, open_flags); + +err_free: + kfree(priv->memory); + kfree(priv); + return rc; +} + +static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd, + bool revoked) +{ + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc = 0; + + dmabuf = dma_buf_get(fd); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + if (dmabuf->ops != &iommufd_test_dmabuf_ops) { + rc = -EOPNOTSUPP; + goto err_put; + } + + priv = dmabuf->priv; + dma_resv_lock(dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(dmabuf); + dma_resv_unlock(dmabuf->resv); + +err_put: + dma_buf_put(dmabuf); + return rc; +} + int iommufd_test(struct iommufd_ucmd *ucmd) { struct iommu_test_cmd *cmd = ucmd->cmd; @@ -2109,6 +2159,13 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_pasid_detach(ucmd, cmd); case IOMMU_TEST_OP_PASID_CHECK_HWPT: return iommufd_test_pasid_check_hwpt(ucmd, cmd); + case IOMMU_TEST_OP_DMABUF_GET: + return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags, + cmd->dmabuf_get.length); + case IOMMU_TEST_OP_DMABUF_REVOKE: + return 
iommufd_test_dmabuf_revoke(ucmd, + cmd->dmabuf_revoke.dmabuf_fd, + cmd->dmabuf_revoke.revoked); default: return -EOPNOTSUPP; } @@ -2202,3 +2259,5 @@ void iommufd_test_exit(void) platform_device_unregister(selftest_iommu_dev); debugfs_remove_recursive(dbgfs_root); } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index ffa892f65714..ca848288dbf2 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -590,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) } static int ipmmu_attach_device(struct iommu_domain *io_domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); @@ -637,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, } static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_domain *domain; unsigned int i; - if (io_domain == identity_domain || !io_domain) + if (old == identity_domain || !old) return 0; - domain = to_vmsa_domain(io_domain); + domain = to_vmsa_domain(old); for (i = 0; i < fwspec->num_ids; ++i) ipmmu_utlb_disable(domain, fwspec->ids[i]); @@ -720,6 +720,8 @@ static int ipmmu_init_platform_device(struct device *dev, dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev)); + put_device(&ipmmu_pdev->dev); + return 0; } diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 43a61ba021a5..819add75a665 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev) return &iommu->iommu; } -static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; unsigned long flags; @@ -441,19 +442,19 @@ fail: } static int msm_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct msm_priv *priv; unsigned long flags; struct msm_iommu_dev *iommu; struct msm_iommu_ctx_dev *master; int ret = 0; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - priv = to_msm_priv(domain); + priv = to_msm_priv(old); free_io_pgtable_ops(priv->iop); spin_lock_irqsave(&msm_iommu_lock, flags); diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 0e0285348d2b..60fcd3d3b5eb 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -139,6 +139,7 @@ /* 2 bits: iommu type */ #define MTK_IOMMU_TYPE_MM (0x0 << 13) #define MTK_IOMMU_TYPE_INFRA (0x1 << 13) +#define MTK_IOMMU_TYPE_APU (0x2 << 13) #define MTK_IOMMU_TYPE_MASK (0x3 << 13) /* PM and clock always on. e.g. 
infra iommu */ #define PM_CLK_AO BIT(15) @@ -147,6 +148,7 @@ #define TF_PORT_TO_ADDR_MT8173 BIT(18) #define INT_ID_PORT_WIDTH_6 BIT(19) #define CFG_IFA_MASTER_IN_ATF BIT(20) +#define DL_WITH_MULTI_LARB BIT(21) #define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \ ((((pdata)->flags) & (mask)) == (_x)) @@ -172,6 +174,7 @@ enum mtk_iommu_plat { M4U_MT8183, M4U_MT8186, M4U_MT8188, + M4U_MT8189, M4U_MT8192, M4U_MT8195, M4U_MT8365, @@ -335,6 +338,8 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data, unsigned int ban */ #define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL +static LIST_HEAD(apulist); /* List the apu iommu HWs */ +static LIST_HEAD(infralist); /* List the iommu_infra HW */ static LIST_HEAD(m4ulist); /* List all the M4U HWs */ #define for_each_m4u(data, head) list_for_each_entry(data, head, list) @@ -350,6 +355,15 @@ static const struct mtk_iommu_iova_region single_domain[] = { #define MT8192_MULTI_REGION_NR (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) ? \ MT8192_MULTI_REGION_NR_MAX : 1) +static const struct mtk_iommu_iova_region mt8189_multi_dom_apu[] = { + { .iova_base = 0x200000ULL, .size = SZ_512M}, /* APU SECURE */ +#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) + { .iova_base = SZ_1G, .size = 0xc0000000}, /* APU CODE */ + { .iova_base = 0x70000000ULL, .size = 0x12600000}, /* APU VLM */ + { .iova_base = SZ_4G, .size = SZ_4G * 3}, /* APU VPU */ +#endif +}; + static const struct mtk_iommu_iova_region mt8192_multi_dom[MT8192_MULTI_REGION_NR] = { { .iova_base = 0x0, .size = MTK_IOMMU_IOVA_SZ_4G}, /* 0 ~ 4G, */ #if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) @@ -705,7 +719,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain) } static int mtk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata; struct mtk_iommu_domain *dom = to_mtk_domain(domain); @@ -773,12 +787,12 @@ err_unlock: } static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct mtk_iommu_data *data = dev_iommu_priv_get(dev); - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; mtk_iommu_config(data, dev, false, 0); @@ -865,6 +879,7 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev) struct mtk_iommu_data *data = dev_iommu_priv_get(dev); struct device_link *link; struct device *larbdev; + unsigned long larbid_msk = 0; unsigned int larbid, larbidx, i; if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) @@ -872,30 +887,50 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev) /* * Link the consumer device with the smi-larb device(supplier). - * The device that connects with each a larb is a independent HW. - * All the ports in each a device should be in the same larbs. + * w/DL_WITH_MULTI_LARB: the master may connect with multi larbs, + * we should create device link with each larb. + * w/o DL_WITH_MULTI_LARB: the master must connect with one larb, + * otherwise fail. 
*/ larbid = MTK_M4U_TO_LARB(fwspec->ids[0]); if (larbid >= MTK_LARB_NR_MAX) return ERR_PTR(-EINVAL); + larbid_msk |= BIT(larbid); + for (i = 1; i < fwspec->num_ids; i++) { larbidx = MTK_M4U_TO_LARB(fwspec->ids[i]); - if (larbid != larbidx) { + if (MTK_IOMMU_HAS_FLAG(data->plat_data, DL_WITH_MULTI_LARB)) { + larbid_msk |= BIT(larbidx); + } else if (larbid != larbidx) { dev_err(dev, "Can only use one larb. Fail@larb%d-%d.\n", larbid, larbidx); return ERR_PTR(-EINVAL); } } - larbdev = data->larb_imu[larbid].dev; - if (!larbdev) - return ERR_PTR(-EINVAL); - link = device_link_add(dev, larbdev, - DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); - if (!link) - dev_err(dev, "Unable to link %s\n", dev_name(larbdev)); + for_each_set_bit(larbid, &larbid_msk, 32) { + larbdev = data->larb_imu[larbid].dev; + if (!larbdev) + return ERR_PTR(-EINVAL); + + link = device_link_add(dev, larbdev, + DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); + if (!link) { + dev_err(dev, "Unable to link %s\n", dev_name(larbdev)); + goto link_remove; + } + } + return &data->iommu; + +link_remove: + for_each_set_bit(i, &larbid_msk, larbid) { + larbdev = data->larb_imu[i].dev; + device_link_remove(dev, larbdev); + } + + return ERR_PTR(-ENODEV); } static void mtk_iommu_release_device(struct device *dev) @@ -903,11 +938,19 @@ static void mtk_iommu_release_device(struct device *dev) struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; struct device *larbdev; - unsigned int larbid; + unsigned int larbid, i; + unsigned long larbid_msk = 0; data = dev_iommu_priv_get(dev); - if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { - larbid = MTK_M4U_TO_LARB(fwspec->ids[0]); + if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) + return; + + for (i = 0; i < fwspec->num_ids; i++) { + larbid = MTK_M4U_TO_LARB(fwspec->ids[i]); + larbid_msk |= BIT(larbid); + } + + for_each_set_bit(larbid, &larbid_msk, 32) { larbdev = data->larb_imu[larbid].dev; device_link_remove(dev, larbdev); } @@ -974,6 +1017,8 @@ static int mtk_iommu_of_xlate(struct device *dev, return -EINVAL; dev_iommu_priv_set(dev, platform_get_drvdata(m4updev)); + + put_device(&m4updev->dev); } return iommu_fwspec_add_ids(dev, args->args, 1); @@ -1211,16 +1256,19 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m } component_match_add(dev, match, component_compare_dev, &plarbdev->dev); - platform_device_put(plarbdev); } - if (!frst_avail_smicomm_node) - return -EINVAL; + if (!frst_avail_smicomm_node) { + ret = -EINVAL; + goto err_larbdev_put; + } pcommdev = of_find_device_by_node(frst_avail_smicomm_node); of_node_put(frst_avail_smicomm_node); - if (!pcommdev) - return -ENODEV; + if (!pcommdev) { + ret = -ENODEV; + goto err_larbdev_put; + } data->smicomm_dev = &pcommdev->dev; link = device_link_add(data->smicomm_dev, dev, @@ -1228,16 +1276,16 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m platform_device_put(pcommdev); if (!link) { dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev)); - return -EINVAL; + ret = -EINVAL; + goto err_larbdev_put; } return 0; err_larbdev_put: - for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) { - if (!data->larb_imu[i].dev) - continue; + /* id mapping may not be linear, loop the whole array */ + for (i = 0; i < MTK_LARB_NR_MAX; i++) put_device(data->larb_imu[i].dev); - } + return ret; } @@ -1400,8 +1448,12 @@ out_sysfs_remove: iommu_device_sysfs_remove(&data->iommu); out_list_del: list_del(&data->list); - if (MTK_IOMMU_IS_TYPE(data->plat_data, 
MTK_IOMMU_TYPE_MM)) + if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { device_link_remove(data->smicomm_dev, dev); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); + } out_runtime_disable: pm_runtime_disable(dev); return ret; @@ -1421,6 +1473,9 @@ static void mtk_iommu_remove(struct platform_device *pdev) if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { device_link_remove(data->smicomm_dev, &pdev->dev); component_master_del(&pdev->dev, &mtk_iommu_com_ops); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); } pm_runtime_disable(&pdev->dev); for (i = 0; i < data->plat_data->banks_num; i++) { @@ -1695,6 +1750,66 @@ static const struct mtk_iommu_plat_data mt8188_data_vpp = { 27, 28 /* ccu0 */, MTK_INVALID_LARBID}, {4, 6}}, }; +static const unsigned int mt8189_apu_region_msk[][MTK_LARB_NR_MAX] = { + [0] = {[0] = BIT(2)}, /* Region0: fake larb 0 APU_SECURE */ + [1] = {[0] = BIT(1)}, /* Region1: fake larb 0 APU_CODE */ + [2] = {[0] = BIT(3)}, /* Region2: fake larb 0 APU_VLM */ + [3] = {[0] = BIT(0)}, /* Region3: fake larb 0 APU_DATA */ +}; + +static const struct mtk_iommu_plat_data mt8189_data_apu = { + .m4u_plat = M4U_MT8189, + .flags = IOVA_34_EN | DCM_DISABLE | + MTK_IOMMU_TYPE_APU | PGTABLE_PA_35_EN, + .hw_list = &apulist, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .banks_num = 1, + .banks_enable = {true}, + .iova_region = mt8189_multi_dom_apu, + .iova_region_nr = ARRAY_SIZE(mt8189_multi_dom_apu), + .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}}, + .iova_region_larb_msk = mt8189_apu_region_msk, +}; + +static const struct mtk_iommu_plat_data mt8189_data_infra = { + .m4u_plat = M4U_MT8189, + .flags = WR_THROT_EN | DCM_DISABLE | MTK_IOMMU_TYPE_INFRA | + CFG_IFA_MASTER_IN_ATF | SHARE_PGTABLE | PGTABLE_PA_35_EN, + .hw_list = &infralist, + .banks_num = 1, + .banks_enable = {true}, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .iova_region = single_domain, + .iova_region_nr = ARRAY_SIZE(single_domain), +}; + +static const u32 mt8189_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = { + [0] = {~0, ~0, ~0, [22] = BIT(0)}, /* Region0: all ports for larb0/1/2 */ + [1] = {[3] = ~0, [4] = ~0}, /* Region1: all ports for larb4(3)/7(4) */ + [2] = {[5] = ~0, [6] = ~0, /* Region2: all ports for larb9(5)/11(6) */ + [7] = ~0, [8] = ~0, /* Region2: all ports for larb13(7)/14(8) */ + [9] = ~0, [10] = ~0, /* Region2: all ports for larb16(9)/17(10) */ + [11] = ~0, [12] = ~0, /* Region2: all ports for larb19(11)/20(12) */ + [21] = ~0}, /* Region2: larb21 fake GCE larb */ +}; + +static const struct mtk_iommu_plat_data mt8189_data_mm = { + .m4u_plat = M4U_MT8189, + .flags = HAS_BCLK | HAS_SUB_COMM_3BITS | OUT_ORDER_WR_EN | + WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM | + PGTABLE_PA_35_EN | DL_WITH_MULTI_LARB, + .hw_list = &m4ulist, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .banks_num = 5, + .banks_enable = {true, false, false, false, false}, + .iova_region = mt8192_multi_dom, + .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom), + .iova_region_larb_msk = mt8189_larb_region_msk, + .larbid_remap = {{0}, {1}, {21/* GCE_D */, 21/* GCE_M */, 2}, + {19, 20, 9, 11}, {7}, {4}, + {13, 17}, {14, 16}}, +}; + static const struct mtk_iommu_plat_data mt8192_data = { .m4u_plat = M4U_MT8192, .flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN | @@ -1796,6 +1911,9 @@ static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt8188-iommu-infra", .data = &mt8188_data_infra}, { .compatible = 
"mediatek,mt8188-iommu-vdo", .data = &mt8188_data_vdo}, { .compatible = "mediatek,mt8188-iommu-vpp", .data = &mt8188_data_vpp}, + { .compatible = "mediatek,mt8189-iommu-apu", .data = &mt8189_data_apu}, + { .compatible = "mediatek,mt8189-iommu-infra", .data = &mt8189_data_infra}, + { .compatible = "mediatek,mt8189-iommu-mm", .data = &mt8189_data_mm}, { .compatible = "mediatek,mt8192-m4u", .data = &mt8192_data}, { .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra}, { .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo}, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 10cc0b1197e8..c8d8eff5373d 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -303,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain) kfree(to_mtk_domain(domain)); } -static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev) +static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain); @@ -329,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device } static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); @@ -435,6 +438,8 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, return -EINVAL; dev_iommu_priv_set(dev, platform_get_drvdata(m4updev)); + + put_device(&m4updev->dev); } ret = iommu_fwspec_add_ids(dev, args->args, 1); @@ -641,13 +646,18 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) if (larb_nr < 0) return larb_nr; + if (larb_nr > MTK_LARB_NR_MAX) + return -EINVAL; + for (i = 0; i < larb_nr; i++) { struct device_node *larbnode; struct platform_device *plarbdev; larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i); - if (!larbnode) - return -EINVAL; + if (!larbnode) { + ret = -EINVAL; + goto out_put_larbs; + } if (!of_device_is_available(larbnode)) { of_node_put(larbnode); @@ -657,11 +667,14 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) plarbdev = of_find_device_by_node(larbnode); if (!plarbdev) { of_node_put(larbnode); - return -ENODEV; + ret = -ENODEV; + goto out_put_larbs; } if (!plarbdev->dev.driver) { of_node_put(larbnode); - return -EPROBE_DEFER; + put_device(&plarbdev->dev); + ret = -EPROBE_DEFER; + goto out_put_larbs; } data->larb_imu[i].dev = &plarbdev->dev; @@ -673,7 +686,7 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) ret = mtk_iommu_v1_hw_init(data); if (ret) - return ret; + goto out_put_larbs; ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL, dev_name(&pdev->dev)); @@ -695,12 +708,17 @@ out_sysfs_remove: iommu_device_sysfs_remove(&data->iommu); out_clk_unprepare: clk_disable_unprepare(data->bclk); +out_put_larbs: + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); + return ret; } static void mtk_iommu_v1_remove(struct platform_device *pdev) { struct mtk_iommu_v1_data *data = platform_get_drvdata(pdev); + int i; iommu_device_sysfs_remove(&data->iommu); iommu_device_unregister(&data->iommu); @@ -708,6 +726,9 @@ static void mtk_iommu_v1_remove(struct platform_device *pdev) clk_disable_unprepare(data->bclk); devm_free_irq(&pdev->dev, data->irq, data); component_master_del(&pdev->dev, 
&mtk_iommu_v1_com_ops); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); } static int __maybe_unused mtk_iommu_v1_suspend(struct device *dev) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 5c6f5943f44b..768973b7e511 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1431,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain) odomain->iommus = NULL; } -static int -omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int omap_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev); struct omap_iommu_domain *omap_domain = to_omap_domain(domain); @@ -1536,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, } static int omap_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct omap_iommu_domain *omap_domain; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - omap_domain = to_omap_domain(domain); + omap_domain = to_omap_domain(old); spin_lock(&omap_domain->lock); _omap_iommu_detach_dev(omap_domain, dev); spin_unlock(&omap_domain->lock); @@ -1668,23 +1668,20 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev) } pdev = of_find_device_by_node(np); + of_node_put(np); if (!pdev) { - of_node_put(np); kfree(arch_data); return ERR_PTR(-ENODEV); } oiommu = platform_get_drvdata(pdev); + put_device(&pdev->dev); if (!oiommu) { - of_node_put(np); kfree(arch_data); return ERR_PTR(-EINVAL); } tmp->iommu_dev = oiommu; - tmp->dev = &pdev->dev; - - of_node_put(np); } dev_iommu_priv_set(dev, arch_data); diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h index 27697109ec79..50b39be61abc 100644 --- a/drivers/iommu/omap-iommu.h +++ b/drivers/iommu/omap-iommu.h @@ -88,7 +88,6 @@ struct omap_iommu { /** * struct omap_iommu_arch_data - omap iommu private data * @iommu_dev: handle of the OMAP iommu device - * @dev: handle of the iommu device * * This is an omap iommu private data object, which binds an iommu user * to its iommu device. 
This object should be placed at the iommu user's @@ -97,7 +96,6 @@ struct omap_iommu { */ struct omap_iommu_arch_data { struct omap_iommu *iommu_dev; - struct device *dev; }; struct cr_regs { diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index ebb22979075d..d9429097a2b5 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -1321,7 +1321,8 @@ static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_m } static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); struct riscv_iommu_device *iommu = dev_to_iommu(dev); @@ -1426,7 +1427,8 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) } static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); @@ -1447,7 +1449,8 @@ static struct iommu_domain riscv_iommu_blocking_domain = { }; static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 0861dd469bd8..85f3667e797c 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -960,7 +960,8 @@ out_disable_clocks: } static int rk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain; @@ -1005,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = { }; static int rk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain = to_rk_domain(domain); @@ -1026,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - ret = rk_iommu_identity_attach(&rk_identity_domain, dev); + ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old); if (ret) return ret; @@ -1041,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, return 0; ret = rk_iommu_enable(iommu); - if (ret) - WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev)); + if (ret) { + /* + * Note rk_iommu_identity_attach() might fail before physically + * attaching the dev to iommu->domain, in which case the actual + * old domain for this revert should be rk_identity_domain v.s. + * iommu->domain. Since rk_iommu_identity_attach() does not care + * about the old domain argument for now, this is not a problem. 
+ */ + WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev, + iommu->domain)); + } pm_runtime_put(iommu->dev); diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index aa576736d60b..fe679850af28 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -670,7 +670,8 @@ int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status) } static int blocking_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct zpci_dev *zdev = to_zpci_dev(dev); struct s390_domain *s390_domain; @@ -694,7 +695,8 @@ static int blocking_domain_attach_device(struct iommu_domain *domain, } static int s390_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); @@ -709,7 +711,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, domain->geometry.aperture_end < zdev->start_dma)) return -EINVAL; - blocking_domain_attach_device(&blocking_domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); @@ -1131,13 +1133,14 @@ static int __init s390_iommu_init(void) subsys_initcall(s390_iommu_init); static int s390_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct zpci_dev *zdev = to_zpci_dev(dev); u8 status; int cc; - blocking_domain_attach_device(&blocking_domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index c7ca1d8a0b15..555d4505c747 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -247,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain) } static int sprd_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); struct sprd_iommu_domain *dom = to_sprd_domain(domain); diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index de10b569d9a9..90b26fe21817 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -771,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu, } static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); struct sun50i_iommu_domain *sun50i_domain; @@ -797,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = { }; static int sun50i_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu; @@ -813,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev); + sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old); sun50i_iommu_attach_domain(iommu, sun50i_domain); @@ -839,6 +841,8 @@ static int 
sun50i_iommu_of_xlate(struct device *dev, dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev)); + put_device(&iommu_pdev->dev); + return iommu_fwspec_add_ids(dev, &id, 1); } diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 36cdd5fbab07..c391e7f2cde6 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -490,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu, } static int tegra_smmu_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu *smmu = dev_iommu_priv_get(dev); @@ -524,9 +524,9 @@ disable: } static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu_as *as; struct tegra_smmu *smmu; @@ -535,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, if (!fwspec) return -ENODEV; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - as = to_smmu_as(domain); + as = to_smmu_as(old); smmu = as->smmu; for (index = 0; index < fwspec->num_ids; index++) { tegra_smmu_disable(smmu, fwspec->ids[index], as->id); @@ -830,10 +830,9 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np) return NULL; mc = platform_get_drvdata(pdev); - if (!mc) { - put_device(&pdev->dev); + put_device(&pdev->dev); + if (!mc) return NULL; - } return mc->smmu; } diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index b39d6f134ab2..d314fa5cd847 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -730,7 +730,8 @@ static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev) return domain; } -static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; struct virtio_iommu_req_attach req; @@ -781,7 +782,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int viommu_attach_identity_domain(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret = 0; struct virtio_iommu_req_attach req; |
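
All of the driver hunks from ipmmu-vmsa.c onward follow one pattern: attach-style domain ops grow a third parameter carrying the previously attached domain, and identity/blocking attach paths test that parameter instead of calling iommu_get_domain_for_dev(). A minimal sketch of the new callback shape follows, assuming hypothetical foo_* driver types; it is not code from any of the drivers in this diff.

#include <linux/container_of.h>
#include <linux/iommu.h>

struct foo_domain {
	struct iommu_domain domain;
};

static struct foo_domain *to_foo_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct foo_domain, domain);
}

static void foo_disable_translation(struct foo_domain *fdom, struct device *dev)
{
	/* hardware-specific teardown of the old translation would go here */
}

static int foo_iommu_identity_attach(struct iommu_domain *identity_domain,
				     struct device *dev,
				     struct iommu_domain *old)
{
	struct foo_domain *fdom;

	/* Coming from identity or from no domain at all: nothing to undo */
	if (old == identity_domain || !old)
		return 0;

	/* 'old' replaces the former iommu_get_domain_for_dev(dev) lookup */
	fdom = to_foo_domain(old);
	foo_disable_translation(fdom, dev);
	return 0;
}

The core now hands the old domain down, so drivers no longer re-derive it and can handle the !old (first attach) case explicitly, as the msm, omap and tegra-smmu hunks above do.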
