Diffstat (limited to 'arch/arm64/mm/mmu.c')
 arch/arm64/mm/mmu.c | 494 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 465 insertions(+), 29 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 183801520740..3a444a5fe469 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -27,6 +27,8 @@
#include <linux/kfence.h>
#include <linux/pkeys.h>
#include <linux/mm_inline.h>
+#include <linux/pagewalk.h>
+#include <linux/stop_machine.h>
#include <asm/barrier.h>
#include <asm/cputype.h>
@@ -47,6 +49,8 @@
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
+DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
+
u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);
@@ -474,14 +478,18 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
int flags);
#endif
-static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
+#define INVALID_PHYS_ADDR (-1ULL)
+
+static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
enum pgtable_type pgtable_type)
{
/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
- struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
+ struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
phys_addr_t pa;
- BUG_ON(!ptdesc);
+ if (!ptdesc)
+ return INVALID_PHYS_ADDR;
+
pa = page_to_phys(ptdesc_page(ptdesc));
switch (pgtable_type) {
@@ -502,16 +510,392 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
return pa;
}
+static phys_addr_t
+try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type, gfp_t gfp)
+{
+ return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type);
+}
+
static phys_addr_t __maybe_unused
pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
{
- return __pgd_pgtable_alloc(&init_mm, pgtable_type);
+ phys_addr_t pa;
+
+ pa = __pgd_pgtable_alloc(&init_mm, GFP_PGTABLE_KERNEL, pgtable_type);
+ BUG_ON(pa == INVALID_PHYS_ADDR);
+ return pa;
}
static phys_addr_t
pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
{
- return __pgd_pgtable_alloc(NULL, pgtable_type);
+ phys_addr_t pa;
+
+ pa = __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
+ BUG_ON(pa == INVALID_PHYS_ADDR);
+ return pa;
+}
+
+static void split_contpte(pte_t *ptep)
+{
+ int i;
+
+ ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
+ for (i = 0; i < CONT_PTES; i++, ptep++)
+ __set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
+}
+
+static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
+{
+ pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
+ unsigned long pfn = pmd_pfn(pmd);
+ pgprot_t prot = pmd_pgprot(pmd);
+ phys_addr_t pte_phys;
+ pte_t *ptep;
+ int i;
+
+ pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE, gfp);
+ if (pte_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
+ ptep = (pte_t *)phys_to_virt(pte_phys);
+
+ if (pgprot_val(prot) & PMD_SECT_PXN)
+ tableprot |= PMD_TABLE_PXN;
+
+ prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE);
+ prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
+ if (to_cont)
+ prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+
+ for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
+ __set_pte(ptep, pfn_pte(pfn, prot));
+
+ /*
+ * Ensure the pte entries are visible to the table walker by the time
+ * the pmd entry that points to the ptes is visible.
+ */
+ dsb(ishst);
+ __pmd_populate(pmdp, pte_phys, tableprot);
+
+ return 0;
+}
+
+static void split_contpmd(pmd_t *pmdp)
+{
+ int i;
+
+ pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
+ for (i = 0; i < CONT_PMDS; i++, pmdp++)
+ set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
+}
+
+static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
+{
+ pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
+ unsigned int step = PMD_SIZE >> PAGE_SHIFT;
+ unsigned long pfn = pud_pfn(pud);
+ pgprot_t prot = pud_pgprot(pud);
+ phys_addr_t pmd_phys;
+ pmd_t *pmdp;
+ int i;
+
+ pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD, gfp);
+ if (pmd_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
+ pmdp = (pmd_t *)phys_to_virt(pmd_phys);
+
+ if (pgprot_val(prot) & PMD_SECT_PXN)
+ tableprot |= PUD_TABLE_PXN;
+
+ prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
+ prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
+ if (to_cont)
+ prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+
+ for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
+ set_pmd(pmdp, pfn_pmd(pfn, prot));
+
+ /*
+ * Ensure the pmd entries are visible to the table walker by the time
+ * the pud entry that points to the pmds is visible.
+ */
+ dsb(ishst);
+ __pud_populate(pudp, pmd_phys, tableprot);
+
+ return 0;
+}
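For reference, assuming a 4K granule: step = PMD_SIZE >> PAGE_SHIFT = 2M / 4K = 512 pfns per PMD entry, and PTRS_PER_PMD x PMD_SIZE = 512 x 2M = 1G = PUD_SIZE, so the new PMD table covers exactly the range the original PUD block mapped, with each entry advancing the pfn by one 2M block.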
+
+static int split_kernel_leaf_mapping_locked(unsigned long addr)
+{
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+ int ret = 0;
+
+ /*
+ * PGD: If addr is PGD aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split.
+ */
+ if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr)
+ goto out;
+ pgdp = pgd_offset_k(addr);
+ pgd = pgdp_get(pgdp);
+ if (!pgd_present(pgd))
+ goto out;
+
+ /*
+ * P4D: If addr is P4D aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split.
+ */
+ if (ALIGN_DOWN(addr, P4D_SIZE) == addr)
+ goto out;
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ if (!p4d_present(p4d))
+ goto out;
+
+ /*
+ * PUD: If addr is PUD aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split. Otherwise,
+ * if we have a pud leaf, split to contpmd.
+ */
+ if (ALIGN_DOWN(addr, PUD_SIZE) == addr)
+ goto out;
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ if (!pud_present(pud))
+ goto out;
+ if (pud_leaf(pud)) {
+ ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * CONTPMD: If addr is CONTPMD aligned then addr already describes a
+ * leaf boundary. If not present then there is nothing to split.
+ * Otherwise, if we have a contpmd leaf, split to pmd.
+ */
+ if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr)
+ goto out;
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ if (!pmd_present(pmd))
+ goto out;
+ if (pmd_leaf(pmd)) {
+ if (pmd_cont(pmd))
+ split_contpmd(pmdp);
+ /*
+ * PMD: If addr is PMD aligned then addr already describes a
+ * leaf boundary. Otherwise, split to contpte.
+ */
+ if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
+ goto out;
+ ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * CONTPTE: If addr is CONTPTE aligned then addr already describes a
+ * leaf boundary. If not present then there is nothing to split.
+ * Otherwise, if we have a contpte leaf, split to pte.
+ */
+ if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr)
+ goto out;
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = __ptep_get(ptep);
+ if (!pte_present(pte))
+ goto out;
+ if (pte_cont(pte))
+ split_contpte(ptep);
+
+out:
+ return ret;
+}
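A minimal standalone sketch (not part of the patch, and assuming the 4K-granule block sizes: PUD 1G, CONTPMD 32M, PMD 2M, CONTPTE 64K) of the check split_kernel_leaf_mapping_locked() performs at each level: an address only forces a split at the levels whose block size it is not aligned to.

#include <stdio.h>

int main(void)
{
	static const struct { const char *name; unsigned long size; } lvl[] = {
		{ "PUD",     1UL << 30 },	/* 1G  */
		{ "CONTPMD", 1UL << 25 },	/* 32M */
		{ "PMD",     1UL << 21 },	/* 2M  */
		{ "CONTPTE", 1UL << 16 },	/* 64K */
	};
	unsigned long addr = 0x40201000UL;	/* page-aligned only */

	for (unsigned int i = 0; i < 4; i++)
		printf("%-8s %s\n", lvl[i].name,
		       addr & (lvl[i].size - 1) ? "may need a split"
					       : "already a leaf boundary");
	return 0;
}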
+
+static DEFINE_MUTEX(pgtable_split_lock);
+
+int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /*
+ * !BBML2_NOABORT systems should not be trying to change permissions on
+ * anything that is not pte-mapped in the first place. Just return early
+ * and let the permission change code raise a warning if not already
+ * pte-mapped.
+ */
+ if (!system_supports_bbml2_noabort())
+ return 0;
+
+ /*
+ * Ensure start and end are at least page-aligned since this is the
+ * finest granularity we can split to.
+ */
+ if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
+ return -EINVAL;
+
+ mutex_lock(&pgtable_split_lock);
+ arch_enter_lazy_mmu_mode();
+
+ /*
+	 * split_kernel_leaf_mapping_locked() may sleep; that is not a problem
+	 * on arm64, since arm64's lazy MMU implementation allows
+	 * sleeping.
+	 *
+	 * Optimize for the common case of splitting out a single page from a
+	 * larger mapping. Here we can just split on the "least aligned" of
+	 * start and end, and this guarantees that there must also be a split
+	 * at the more aligned address, since both addresses must lie in the
+	 * same contpte block, which must have been split to ptes.
+ */
+ if (end - start == PAGE_SIZE) {
+ start = __ffs(start) < __ffs(end) ? start : end;
+ ret = split_kernel_leaf_mapping_locked(start);
+ } else {
+ ret = split_kernel_leaf_mapping_locked(start);
+ if (!ret)
+ ret = split_kernel_leaf_mapping_locked(end);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ mutex_unlock(&pgtable_split_lock);
+ return ret;
+}
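For the single-page fast path above, only the "least aligned" of start and end is split; the code compares __ffs() values, i.e. the index of the lowest set bit, so fewer trailing zero bits means coarser block boundaries run through the address. A minimal standalone sketch of that selection (least_aligned() is a hypothetical helper; __builtin_ctzl() stands in for the kernel's __ffs()):

#include <stdio.h>

static unsigned long least_aligned(unsigned long a, unsigned long b)
{
	/* Fewer trailing zero bits == less aligned == more levels to split. */
	return __builtin_ctzl(a) < __builtin_ctzl(b) ? a : b;
}

int main(void)
{
	unsigned long start = 0x40201000UL;	/* 4K-aligned only            */
	unsigned long end   = 0x40202000UL;	/* 8K-aligned, one page later */

	/*
	 * Prints 40201000: splitting there already breaks the shared contpte
	 * block down to ptes, so the more aligned end needs no further split.
	 */
	printf("%lx\n", least_aligned(start, end));
	return 0;
}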
+
+static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
+ unsigned long next,
+ struct mm_walk *walk)
+{
+ pud_t pud = pudp_get(pudp);
+ int ret = 0;
+
+ if (pud_leaf(pud))
+ ret = split_pud(pudp, pud, GFP_ATOMIC, false);
+
+ return ret;
+}
+
+static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ unsigned long next,
+ struct mm_walk *walk)
+{
+ pmd_t pmd = pmdp_get(pmdp);
+ int ret = 0;
+
+ if (pmd_leaf(pmd)) {
+ if (pmd_cont(pmd))
+ split_contpmd(pmdp);
+ ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false);
+
+ /*
+ * We have split the pmd directly to ptes so there is no need to
+ * visit each pte to check if they are contpte.
+ */
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return ret;
+}
+
+static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
+ unsigned long next,
+ struct mm_walk *walk)
+{
+ pte_t pte = __ptep_get(ptep);
+
+ if (pte_cont(pte))
+ split_contpte(ptep);
+
+ return 0;
+}
+
+static const struct mm_walk_ops split_to_ptes_ops __initconst = {
+ .pud_entry = split_to_ptes_pud_entry,
+ .pmd_entry = split_to_ptes_pmd_entry,
+ .pte_entry = split_to_ptes_pte_entry,
+};
+
+static bool linear_map_requires_bbml2 __initdata;
+
+u32 idmap_kpti_bbml2_flag;
+
+void __init init_idmap_kpti_bbml2_flag(void)
+{
+ WRITE_ONCE(idmap_kpti_bbml2_flag, 1);
+ /* Must be visible to other CPUs before stop_machine() is called. */
+ smp_mb();
+}
+
+static int __init linear_map_split_to_ptes(void *__unused)
+{
+ /*
+ * Repainting the linear map must be done by CPU0 (the boot CPU) because
+ * that's the only CPU that we know supports BBML2. The other CPUs will
+ * be held in a waiting area with the idmap active.
+ */
+ if (!smp_processor_id()) {
+ unsigned long lstart = _PAGE_OFFSET(vabits_actual);
+ unsigned long lend = PAGE_END;
+ unsigned long kstart = (unsigned long)lm_alias(_stext);
+ unsigned long kend = (unsigned long)lm_alias(__init_begin);
+ int ret;
+
+ /*
+ * Wait for all secondary CPUs to be put into the waiting area.
+ */
+ smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus());
+
+ /*
+ * Walk all of the linear map [lstart, lend), except the kernel
+ * linear map alias [kstart, kend), and split all mappings to
+		 * PTE. The kernel alias remains static throughout runtime, so
+		 * it can continue to be safely mapped with large mappings.
+ */
+ ret = walk_kernel_page_table_range_lockless(lstart, kstart,
+ &split_to_ptes_ops, NULL, NULL);
+ if (!ret)
+ ret = walk_kernel_page_table_range_lockless(kend, lend,
+ &split_to_ptes_ops, NULL, NULL);
+ if (ret)
+ panic("Failed to split linear map\n");
+ flush_tlb_kernel_range(lstart, lend);
+
+ /*
+ * Relies on dsb in flush_tlb_kernel_range() to avoid reordering
+ * before any page table split operations.
+ */
+ WRITE_ONCE(idmap_kpti_bbml2_flag, 0);
+ } else {
+ typedef void (wait_split_fn)(void);
+ extern wait_split_fn wait_linear_map_split_to_ptes;
+ wait_split_fn *wait_fn;
+
+ wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes);
+
+ /*
+ * At least one secondary CPU doesn't support BBML2 so cannot
+ * tolerate the size of the live mappings changing. So have the
+ * secondary CPUs wait for the boot CPU to make the changes
+ * with the idmap active and init_mm inactive.
+ */
+ cpu_install_idmap();
+ wait_fn();
+ cpu_uninstall_idmap();
+ }
+
+ return 0;
+}
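The handshake above relies on idmap_kpti_bbml2_flag: it starts at 1 (counting the boot CPU), each secondary is assumed to bump it from the idmap waiting routine and then spin until it is cleared (analogous to the existing KPTI secondary path, whose code is not in this hunk), and the boot CPU repaints the linear map only once the count reaches num_online_cpus(). A minimal user-space model of that rendezvous, with hypothetical boot_cpu_side()/secondary_cpu_side() helpers:

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint rendezvous = 1;		/* the boot CPU counts as one */

static void boot_cpu_side(unsigned int nr_cpus)
{
	/* Wait until every secondary has parked itself. */
	while (atomic_load_explicit(&rendezvous, memory_order_acquire) != nr_cpus)
		sched_yield();

	/* ... split the linear map to ptes and flush the TLB here ... */

	/* Release the secondaries. */
	atomic_store_explicit(&rendezvous, 0, memory_order_release);
}

static void secondary_cpu_side(void)
{
	atomic_fetch_add_explicit(&rendezvous, 1, memory_order_acq_rel);
	while (atomic_load_explicit(&rendezvous, memory_order_acquire))
		sched_yield();
}

static void *secondary_thread(void *arg)
{
	(void)arg;
	secondary_cpu_side();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, secondary_thread, NULL);
	boot_cpu_side(2);			/* boot CPU + one secondary */
	pthread_join(t, NULL);
	puts("rendezvous complete");
	return 0;
}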
+
+void __init linear_map_maybe_split_to_ptes(void)
+{
+ if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) {
+ init_idmap_kpti_bbml2_flag();
+ stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
+ }
}
/*
@@ -574,8 +958,8 @@ void __init mark_linear_text_alias_ro(void)
/*
* Remove the write permissions from the linear alias of .text/.rodata
*/
- update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
- (unsigned long)__init_begin - (unsigned long)_stext,
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
+ (unsigned long)__init_begin - (unsigned long)_text,
PAGE_KERNEL_RO);
}
@@ -633,10 +1017,20 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
#endif /* CONFIG_KFENCE */
+static inline bool force_pte_mapping(void)
+{
+ bool bbml2 = system_capabilities_finalized() ?
+ system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
+
+ return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
+ is_realm_world())) ||
+ debug_pagealloc_enabled();
+}
+
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
- phys_addr_t kernel_start = __pa_symbol(_stext);
+ phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
phys_addr_t start, end;
phys_addr_t early_kfence_pool;
@@ -658,7 +1052,9 @@ static void __init map_mem(pgd_t *pgdp)
early_kfence_pool = arm64_kfence_alloc_pool();
- if (can_set_direct_map())
+ linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map();
+
+ if (force_pte_mapping())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
/*
@@ -683,7 +1079,7 @@ static void __init map_mem(pgd_t *pgdp)
}
/*
- * Map the linear alias of the [_stext, __init_begin) interval
+ * Map the linear alias of the [_text, __init_begin) interval
* as non-executable now, and remove the write permission in
* mark_linear_text_alias_ro() below (which will be called after
* alternative patching has completed). This makes the contents
@@ -710,6 +1106,10 @@ void mark_rodata_ro(void)
WRITE_ONCE(rodata_is_rw, false);
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
+ /* mark the range between _text and _stext as read only. */
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
+ (unsigned long)_stext - (unsigned long)_text,
+ PAGE_KERNEL_RO);
}
static void __init declare_vma(struct vm_struct *vma,
@@ -780,38 +1180,41 @@ static void __init declare_kernel_vmas(void)
{
static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
- declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
+ declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[4], _data, _end, 0);
}
-void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
- int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
+void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
+ pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,
+ u64 va_offset);
static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
- kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
+ kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
static void __init create_idmap(void)
{
- u64 start = __pa_symbol(__idmap_text_start);
- u64 end = __pa_symbol(__idmap_text_end);
- u64 ptep = __pa_symbol(idmap_ptes);
+ phys_addr_t start = __pa_symbol(__idmap_text_start);
+ phys_addr_t end = __pa_symbol(__idmap_text_end);
+ phys_addr_t ptep = __pa_symbol(idmap_ptes);
__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
__phys_to_virt(ptep) - ptep);
- if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) {
- extern u32 __idmap_kpti_flag;
- u64 pa = __pa_symbol(&__idmap_kpti_flag);
+ if (linear_map_requires_bbml2 ||
+ (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) {
+ phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag);
/*
* The KPTI G-to-nG conversion code needs a read-write mapping
- * of its synchronization flag in the ID map.
+ * of its synchronization flag in the ID map. This is also used
+ * when splitting the linear map to ptes if a secondary CPU
+ * doesn't support bbml2.
*/
- ptep = __pa_symbol(kpti_ptes);
+ ptep = __pa_symbol(kpti_bbml2_ptes);
__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
__phys_to_virt(ptep) - ptep);
@@ -1261,7 +1664,8 @@ int pmd_clear_huge(pmd_t *pmdp)
return 1;
}
-int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr,
+ bool acquire_mmap_lock)
{
pte_t *table;
pmd_t pmd;
@@ -1273,13 +1677,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
return 1;
}
+ /* See comment in pud_free_pmd_page for static key logic */
table = pte_offset_kernel(pmdp, addr);
pmd_clear(pmdp);
__flush_tlb_kernel_pgtable(addr);
+ if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) {
+ mmap_read_lock(&init_mm);
+ mmap_read_unlock(&init_mm);
+ }
+
pte_free_kernel(NULL, table);
return 1;
}
+int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+{
+ /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */
+ return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true);
+}
+
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
pmd_t *table;
@@ -1295,16 +1711,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
}
table = pmd_offset(pudp, addr);
+
+ /*
+ * Our objective is to prevent ptdump from reading a PMD table which has
+ * been freed. In this race, if pud_free_pmd_page observes the key on
+ * (which got flipped by ptdump) then the mmap lock sequence here will,
+ * as a result of the mmap write lock/unlock sequence in ptdump, give
+ * us the correct synchronization. If not, this means that ptdump has
+ * yet not started walking the pagetables - the sequence of barriers
+	 * us the correct synchronization. If not, this means that ptdump has
+	 * not yet started walking the pagetables - the sequence of barriers
+ */
+ pud_clear(pudp);
+ __flush_tlb_kernel_pgtable(addr);
+ if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
+ mmap_read_lock(&init_mm);
+ mmap_read_unlock(&init_mm);
+ }
+
pmdp = table;
next = addr;
end = addr + PUD_SIZE;
do {
if (pmd_present(pmdp_get(pmdp)))
- pmd_free_pte_page(pmdp, next);
+ /*
+ * PMD has been isolated, so ptdump won't see it. No
+ * need to acquire init_mm.mmap_lock.
+ */
+ __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false);
} while (pmdp++, next += PMD_SIZE, next != end);
- pud_clear(pudp);
- __flush_tlb_kernel_pgtable(addr);
pmd_free(NULL, table);
return 1;
}
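The static key dance above assumes the ptdump side (not part of this hunk) enables arm64_ptdump_lock_key before walking init_mm under the mmap write lock. A minimal sketch of that assumed reader-side sequence (ptdump_walk_init_mm_sketch() is a hypothetical name):

/* Assumed ptdump-side sequence (not in this hunk; key and lock as used above). */
static void ptdump_walk_init_mm_sketch(void)
{
	/*
	 * Enable the key first, then take the write lock: a concurrent
	 * pud_free_pmd_page() that observes the key enabled will serialize on
	 * the mmap read lock before freeing the old table; one that observes
	 * it disabled has already cleared and flushed the PUD, so the walk
	 * sees an empty entry rather than a table about to be freed.
	 */
	static_branch_enable(&arm64_ptdump_lock_key);

	mmap_write_lock(&init_mm);
	/* ... walk init_mm's page tables ... */
	mmap_write_unlock(&init_mm);

	static_branch_disable(&arm64_ptdump_lock_key);
}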
@@ -1324,8 +1760,8 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
struct range arch_get_mappable_range(void)
{
struct range mhp_range;
- u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
- u64 end_linear_pa = __pa(PAGE_END - 1);
+ phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
+ phys_addr_t end_linear_pa = __pa(PAGE_END - 1);
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
/*
@@ -1360,7 +1796,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
VM_BUG_ON(!mhp_range_allowed(start, size, true));
- if (can_set_direct_map())
+ if (force_pte_mapping())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),