From f2be745071ffd6793c032ca8443348c3ce0e3e18 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:14 +0000 Subject: mm: clarify lazy_mmu sleeping constraints The lazy MMU mode documentation makes clear that an implementation should not assume that preemption is disabled or any lock is held upon entry to the mode; however it says nothing about what code using the lazy MMU interface should expect. In practice sleeping is forbidden (for generic code) while the lazy MMU mode is active: say it explicitly. Link: https://lkml.kernel.org/r/20251215150323.2218608-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Yeoreum Yun Acked-by: David Hildenbrand (Red Hat) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 652f287c1ef6..1abc4a1c3d72 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -225,11 +225,15 @@ static inline int pmd_dirty(pmd_t pmd) * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit - * of the lazy mode. So the implementation must assume preemption may be enabled - * and cpu migration is possible; it must take steps to be robust against this. - * (In practice, for user PTE updates, the appropriate page table lock(s) are - * held, but for kernel PTE updates, no lock is held). Nesting is not permitted - * and the mode cannot be used in interrupt context. + * of the lazy mode. (In practice, for user PTE updates, the appropriate page + * table lock(s) are held, but for kernel PTE updates, no lock is held). + * The implementation must therefore assume preemption may be enabled upon + * entry to the mode and cpu migration is possible; it must take steps to be + * robust against this. An implementation may handle this by disabling + * preemption, as a consequence generic code may not sleep while the lazy MMU + * mode is active. + * + * Nesting is not permitted and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} -- cgit v1.2.3 From 7303ecbfe4f46c00191b9b66acaa918784bad210 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:15 +0000 Subject: mm: introduce CONFIG_ARCH_HAS_LAZY_MMU_MODE Architectures currently opt in for implementing lazy_mmu helpers by defining __HAVE_ARCH_ENTER_LAZY_MMU_MODE. In preparation for introducing a generic lazy_mmu layer that will require storage in task_struct, let's switch to a cleaner approach: instead of defining a macro, select a CONFIG option. This patch introduces CONFIG_ARCH_HAS_LAZY_MMU_MODE and has each arch select it when it implements lazy_mmu helpers. __HAVE_ARCH_ENTER_LAZY_MMU_MODE is removed and relies on the new CONFIG instead. On x86, lazy_mmu helpers are only implemented if PARAVIRT_XXL is selected. 
This creates some complications in arch/x86/boot/, because a few files manually undefine PARAVIRT* options. As a result does not define the lazy_mmu helpers, but this breaks the build as only defines them if !CONFIG_ARCH_HAS_LAZY_MMU_MODE. There does not seem to be a clean way out of this - let's just undefine that new CONFIG too. Link: https://lkml.kernel.org/r/20251215150323.2218608-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Ryan Roberts Reviewed-by: Yeoreum Yun Acked-by: Andreas Larsson [sparc] Cc: Alexander Gordeev Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 2 -- arch/powerpc/platforms/Kconfig.cputype | 1 + arch/sparc/Kconfig | 1 + arch/sparc/include/asm/tlbflush_64.h | 2 -- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/misc.h | 1 + arch/x86/boot/startup/sme.c | 1 + arch/x86/include/asm/paravirt.h | 1 - include/linux/pgtable.h | 2 +- mm/Kconfig | 7 +++++++ 12 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 93173f0a09c7..3fb4603c0e16 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -35,6 +35,7 @@ config ARM64 select ARCH_HAS_KCOV select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON select ARCH_HAS_KEEPINITRD + select ARCH_HAS_LAZY_MMU_MODE select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEM_ENCRYPT select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 64d5f1d9cce9..f7d66c261347 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -80,7 +80,6 @@ static inline void queue_pte_barriers(void) } } -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { /* diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 2d45f57df169..565c1b7c3eae 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -24,8 +24,6 @@ DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE - static inline void arch_enter_lazy_mmu_mode(void) { struct ppc64_tlb_batch *batch; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 4c321a8ea896..f399917c17bd 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -93,6 +93,7 @@ config PPC_BOOK3S_64 select IRQ_WORK select PPC_64S_HASH_MMU if !PPC_RADIX_MMU select KASAN_VMALLOC if KASAN + select ARCH_HAS_LAZY_MMU_MODE config PPC_BOOK3E_64 bool "Embedded processors" diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index a630d373e645..2bad14744ca4 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ 
-112,6 +112,7 @@ config SPARC64 select NEED_PER_CPU_PAGE_FIRST_CHUNK select ARCH_SUPPORTS_SCHED_SMT if SMP select ARCH_SUPPORTS_SCHED_MC if SMP + select ARCH_HAS_LAZY_MMU_MODE config ARCH_PROC_KCORE_TEXT def_bool y diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index 925bb5d7a4e1..4e1036728e2f 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -39,8 +39,6 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, void flush_tlb_kernel_range(unsigned long start, unsigned long end); -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE - void flush_tlb_pending(void); void arch_enter_lazy_mmu_mode(void); void arch_flush_lazy_mmu_mode(void); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 80527299f859..2427a66cb0fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -808,6 +808,7 @@ config PARAVIRT config PARAVIRT_XXL bool depends on X86_64 + select ARCH_HAS_LAZY_MMU_MODE config PARAVIRT_DEBUG bool "paravirt-ops debugging" diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index fd855e32c9b9..4f86c5903e03 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -11,6 +11,7 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_ARCH_HAS_LAZY_MMU_MODE #undef CONFIG_KASAN #undef CONFIG_KASAN_GENERIC diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index e7ea65f3f1d6..b76a7c95dfe1 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -24,6 +24,7 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_ARCH_HAS_LAZY_MMU_MODE /* * This code runs before CPU feature bits are set. By default, the diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index b5e59a7ba0d0..13f9cd31c8f8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -526,7 +526,6 @@ static inline void arch_end_context_switch(struct task_struct *next) PVOP_VCALL1(cpu.end_context_switch, next); } -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 1abc4a1c3d72..d46d86959bd6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -235,7 +235,7 @@ static inline int pmd_dirty(pmd_t pmd) * * Nesting is not permitted and the mode cannot be used in interrupt context. */ -#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} static inline void arch_leave_lazy_mmu_mode(void) {} static inline void arch_flush_lazy_mmu_mode(void) {} diff --git a/mm/Kconfig b/mm/Kconfig index a992f2203eb9..7c2520e6a6b3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1468,6 +1468,13 @@ config PT_RECLAIM config FIND_NORMAL_PAGE def_bool n +config ARCH_HAS_LAZY_MMU_MODE + bool + help + The architecture uses the lazy MMU mode. This allows changes to + MMU-related architectural state to be deferred until the mode is + exited. See for details. 
+ source "mm/damon/Kconfig" endmenu -- cgit v1.2.3 From 0a096ab7a3a6e2859c3c88988e548c5c213138bc Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:16 +0000 Subject: mm: introduce generic lazy_mmu helpers The implementation of the lazy MMU mode is currently entirely arch-specific; core code directly calls arch helpers: arch_{enter,leave}_lazy_mmu_mode(). We are about to introduce support for nested lazy MMU sections. As things stand we'd have to duplicate that logic in every arch implementing lazy_mmu - adding to a fair amount of logic already duplicated across lazy_mmu implementations. This patch therefore introduces a new generic layer that calls the existing arch_* helpers. Two pair of calls are introduced: * lazy_mmu_mode_enable() ... lazy_mmu_mode_disable() This is the standard case where the mode is enabled for a given block of code by surrounding it with enable() and disable() calls. * lazy_mmu_mode_pause() ... lazy_mmu_mode_resume() This is for situations where the mode is temporarily disabled by first calling pause() and then resume() (e.g. to prevent any batching from occurring in a critical section). The documentation in will be updated in a subsequent patch. No functional change should be introduced at this stage. The implementation of enable()/resume() and disable()/pause() is currently identical, but nesting support will change that. Most of the call sites have been updated using the following Coccinelle script: @@ @@ { ... - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ... - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); ... } @@ @@ { ... - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); ... - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); ... } A couple of notes regarding x86: * Xen is currently the only case where explicit handling is required for lazy MMU when context-switching. This is purely an implementation detail and using the generic lazy_mmu_mode_* functions would cause trouble when nesting support is introduced, because the generic functions must be called from the current task. For that reason we still use arch_leave() and arch_enter() there. * x86 calls arch_flush_lazy_mmu_mode() unconditionally in a few places, but only defines it if PARAVIRT_XXL is selected, and we are removing the fallback in . Add a new fallback definition to to keep things building. Link: https://lkml.kernel.org/r/20251215150323.2218608-8-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 8 ++++---- arch/arm64/mm/pageattr.c | 4 ++-- arch/powerpc/mm/book3s64/hash_tlb.c | 8 ++++---- arch/powerpc/mm/book3s64/subpage_prot.c | 4 ++-- arch/x86/include/asm/pgtable.h | 1 + fs/proc/task_mmu.c | 4 ++-- include/linux/pgtable.h | 29 +++++++++++++++++++++++++---- mm/kasan/shadow.c | 8 ++++---- mm/madvise.c | 18 +++++++++--------- mm/memory.c | 16 ++++++++-------- mm/migrate_device.c | 8 ++++---- mm/mprotect.c | 4 ++-- mm/mremap.c | 4 ++-- mm/userfaultfd.c | 4 ++-- mm/vmalloc.c | 12 ++++++------ mm/vmscan.c | 12 ++++++------ 16 files changed, 83 insertions(+), 61 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8e1d80a7033e..a6a00accf4f9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -800,7 +800,7 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) return -EINVAL; mutex_lock(&pgtable_split_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); /* * The split_kernel_leaf_mapping_locked() may sleep, it is not a @@ -822,7 +822,7 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) ret = split_kernel_leaf_mapping_locked(end); } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); mutex_unlock(&pgtable_split_lock); return ret; } @@ -883,10 +883,10 @@ static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp { int ret; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ret = walk_kernel_page_table_range_lockless(start, end, &split_to_ptes_ops, NULL, &gfp); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); return ret; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 7176ff39cb87..358d1dc9a576 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -110,7 +110,7 @@ static int update_range_prot(unsigned long start, unsigned long size, if (WARN_ON_ONCE(ret)) return ret; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); /* * The caller must ensure that the range we are operating on does not @@ -119,7 +119,7 @@ static int update_range_prot(unsigned long start, unsigned long size, */ ret = walk_kernel_page_table_range_lockless(start, start + size, &pageattr_ops, NULL, &data); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); return ret; } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 21fcad97ae80..787f7a0e27f0 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -205,7 +205,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) * way to do things but is fine for our needs here. 
*/ local_irq_save(flags); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; start < end; start += PAGE_SIZE) { pte_t *ptep = find_init_mm_pte(start, &hugepage_shift); unsigned long pte; @@ -217,7 +217,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) continue; hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift); } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); local_irq_restore(flags); } @@ -237,7 +237,7 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long * way to do things but is fine for our needs here. */ local_irq_save(flags); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); start_pte = pte_offset_map(pmd, addr); if (!start_pte) goto out; @@ -249,6 +249,6 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long } pte_unmap(start_pte); out: - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index ec98e526167e..07c47673bba2 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -73,13 +73,13 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; npages > 0; --npages) { pte_update(mm, addr, pte, 0, 0, 0); addr += PAGE_SIZE; ++pte; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e33df3da6980..2842fa1f7a2c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -118,6 +118,7 @@ extern pmdval_t early_pmd_flags; #define __pte(x) native_make_pte(x) #define arch_end_context_switch(prev) do {} while(0) +static inline void arch_flush_lazy_mmu_mode(void) {} #endif /* CONFIG_PARAVIRT_XXL */ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 81dfc26bfae8..480db575553e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2739,7 +2739,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, return 0; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ @@ -2809,7 +2809,7 @@ flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d46d86959bd6..116a18b7916c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -235,10 +235,31 @@ static inline int pmd_dirty(pmd_t pmd) * * Nesting is not permitted and the mode cannot be used in interrupt context. 
*/ -#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE -static inline void arch_enter_lazy_mmu_mode(void) {} -static inline void arch_leave_lazy_mmu_mode(void) {} -static inline void arch_flush_lazy_mmu_mode(void) {} +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +static inline void lazy_mmu_mode_enable(void) +{ + arch_enter_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_disable(void) +{ + arch_leave_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_pause(void) +{ + arch_leave_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_resume(void) +{ + arch_enter_lazy_mmu_mode(); +} +#else +static inline void lazy_mmu_mode_enable(void) {} +static inline void lazy_mmu_mode_disable(void) {} +static inline void lazy_mmu_mode_pause(void) {} +static inline void lazy_mmu_mode_resume(void) {} #endif #ifndef pte_batch_hint diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 32fbdf759ea2..d286e0a04543 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -305,7 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); index = PFN_DOWN(addr - data->start); page = data->pages[index]; @@ -319,7 +319,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, } spin_unlock(&init_mm.page_table_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } @@ -471,7 +471,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int none; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); spin_lock(&init_mm.page_table_lock); pte = ptep_get(ptep); @@ -483,7 +483,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, if (likely(!none)) __free_page(pfn_to_page(pte_pfn(pte))); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index b617b1be0f53..6bf7009fa5ce 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -453,7 +453,7 @@ restart: if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); @@ -461,7 +461,7 @@ restart: if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; @@ -497,7 +497,7 @@ restart: if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -508,7 +508,7 @@ restart: if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -556,7 +556,7 @@ restart: } if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } if (pageout) @@ -675,7 +675,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); @@ -724,7 +724,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -735,7 +735,7 @@ static int 
madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -775,7 +775,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); diff --git a/mm/memory.c b/mm/memory.c index da360a6eb8a4..e0bce673f053 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1256,7 +1256,7 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr = 1; @@ -1325,7 +1325,7 @@ again: } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); @@ -1846,7 +1846,7 @@ retry: return addr; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { bool any_skipped = false; @@ -1878,7 +1878,7 @@ retry: direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { @@ -2816,7 +2816,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { @@ -2826,7 +2826,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(mapped_pte, ptl); return err; } @@ -3177,7 +3177,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, return -EINVAL; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (fn) { do { @@ -3190,7 +3190,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } *mask |= PGTBL_PTE_MODIFIED; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 23379663b1e1..0346c2d7819f 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -271,7 +271,7 @@ again: ptep = pte_offset_map_lock(mm, pmdp, start, &ptl); if (!ptep) goto again; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ptep += (addr - start) / PAGE_SIZE; for (; addr < end; addr += PAGE_SIZE, ptep++) { @@ -313,7 +313,7 @@ again: if (folio_test_large(folio)) { int ret; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep, ptl); ret = migrate_vma_split_folio(folio, migrate->fault_page); @@ -356,7 +356,7 @@ again: if (folio && folio_test_large(folio)) { int ret; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep, ptl); ret = migrate_vma_split_folio(folio, migrate->fault_page); @@ -485,7 +485,7 @@ next: if (unmapped) flush_tlb_range(walk->vma, start, end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep - 1, ptl); return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index 283889e4f1ce..c0571445bef7 100644 --- 
a/mm/mprotect.c +++ b/mm/mprotect.c @@ -233,7 +233,7 @@ static long change_pte_range(struct mmu_gather *tlb, is_private_single_threaded = vma_is_single_threaded_private(vma); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr_ptes = 1; oldpte = ptep_get(pte); @@ -379,7 +379,7 @@ static long change_pte_range(struct mmu_gather *tlb, } } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); return pages; diff --git a/mm/mremap.c b/mm/mremap.c index 672264807db6..8275b9772ec1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -260,7 +260,7 @@ static int move_ptes(struct pagetable_move_control *pmc, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE, new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) { @@ -305,7 +305,7 @@ static int move_ptes(struct pagetable_move_control *pmc, } } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (force_flush) flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e6dfd5f28acd..b11f81095fa5 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1103,7 +1103,7 @@ static long move_present_ptes(struct mm_struct *mm, /* It's safe to drop the reference now as the page-table is holding one. */ folio_put(*first_src_folio); *first_src_folio = NULL; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); while (true) { orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1140,7 +1140,7 @@ static long move_present_ptes(struct mm_struct *mm, break; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (src_addr > src_start) flush_tlb_range(src_vma, src_start, src_addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 429a893b0505..32d6ee92d4ff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { if (unlikely(!pte_none(ptep_get(pte)))) { @@ -134,7 +134,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -371,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { #ifdef CONFIG_HUGETLB_PAGE @@ -390,7 +390,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; } @@ -538,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { struct page *page = pages[*nr]; @@ -560,7 +560,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= 
PGTBL_PTE_MODIFIED; return err; diff --git a/mm/vmscan.c b/mm/vmscan.c index 614ccf39fe3f..6cf5ee94be7a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3516,7 +3516,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, return false; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -3557,7 +3557,7 @@ restart: if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); @@ -3598,7 +3598,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (!spin_trylock(ptl)) goto done; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { unsigned long pfn; @@ -3645,7 +3645,7 @@ next: walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); spin_unlock(ptl); done: *first = -1; @@ -4244,7 +4244,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; @@ -4278,7 +4278,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) -- cgit v1.2.3 From 9273dfaeaca8ea4d88c7e9fd081922a029984fd4 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:17 +0000 Subject: mm: bail out of lazy_mmu_mode_* in interrupt context The lazy MMU mode cannot be used in interrupt context. This is documented in , but isn't consistently handled across architectures. arm64 ensures that calls to lazy_mmu_mode_* have no effect in interrupt context, because such calls do occur in certain configurations - see commit b81c688426a9 ("arm64/mm: Disable barrier batching in interrupt contexts"). Other architectures do not check this situation, most likely because it hasn't occurred so far. Let's handle this in the new generic lazy_mmu layer, in the same fashion as arm64: bail out of lazy_mmu_mode_* if in_interrupt(). Also remove the arm64 handling that is now redundant. Both arm64 and x86/Xen also ensure that any lazy MMU optimisation is disabled while in interrupt (see queue_pte_barriers() and xen_get_lazy_mode() respectively). This will be handled in the generic layer in a subsequent patch. Link: https://lkml.kernel.org/r/20251215150323.2218608-9-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 9 --------- include/linux/pgtable.h | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f7d66c261347..bf9178902bdb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -94,26 +94,17 @@ static inline void arch_enter_lazy_mmu_mode(void) * keeps tracking simple. */ - if (in_interrupt()) - return; - set_thread_flag(TIF_LAZY_MMU); } static inline void arch_flush_lazy_mmu_mode(void) { - if (in_interrupt()) - return; - if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING)) emit_pte_barriers(); } static inline void arch_leave_lazy_mmu_mode(void) { - if (in_interrupt()) - return; - arch_flush_lazy_mmu_mode(); clear_thread_flag(TIF_LAZY_MMU); } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 116a18b7916c..dddde6873d1e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -233,26 +233,41 @@ static inline int pmd_dirty(pmd_t pmd) * preemption, as a consequence generic code may not sleep while the lazy MMU * mode is active. * - * Nesting is not permitted and the mode cannot be used in interrupt context. + * The mode is disabled in interrupt context and calls to the lazy_mmu API have + * no effect. + * + * Nesting is not permitted. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline void lazy_mmu_mode_enable(void) { + if (in_interrupt()) + return; + arch_enter_lazy_mmu_mode(); } static inline void lazy_mmu_mode_disable(void) { + if (in_interrupt()) + return; + arch_leave_lazy_mmu_mode(); } static inline void lazy_mmu_mode_pause(void) { + if (in_interrupt()) + return; + arch_leave_lazy_mmu_mode(); } static inline void lazy_mmu_mode_resume(void) { + if (in_interrupt()) + return; + arch_enter_lazy_mmu_mode(); } #else -- cgit v1.2.3 From 5ab246749569cff9f815618f02ba0d7cf20e5edd Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:18 +0000 Subject: mm: enable lazy_mmu sections to nest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite recent efforts to prevent lazy_mmu sections from nesting, it remains difficult to ensure that it never occurs - and in fact it does occur on arm64 in certain situations (CONFIG_DEBUG_PAGEALLOC). Commit 1ef3095b1405 ("arm64/mm: Permit lazy_mmu_mode to be nested") made nesting tolerable on arm64, but without truly supporting it: the inner call to leave() disables the batching optimisation before the outer section ends. This patch actually enables lazy_mmu sections to nest by tracking the nesting level in task_struct, in a similar fashion to e.g. pagefault_{enable,disable}(). This is fully handled by the generic lazy_mmu helpers that were recently introduced. lazy_mmu sections were not initially intended to nest, so we need to clarify the semantics w.r.t. the arch_*_lazy_mmu_mode() callbacks. This patch takes the following approach: * The outermost calls to lazy_mmu_mode_{enable,disable}() trigger calls to arch_{enter,leave}_lazy_mmu_mode() - this is unchanged. 
* Nested calls to lazy_mmu_mode_{enable,disable}() are not forwarded to the arch via arch_{enter,leave} - lazy MMU remains enabled so the assumption is that these callbacks are not relevant. However, existing code may rely on a call to disable() to flush any batched state, regardless of nesting. arch_flush_lazy_mmu_mode() is therefore called in that situation. A separate interface was recently introduced to temporarily pause the lazy MMU mode: lazy_mmu_mode_{pause,resume}(). pause() fully exits the mode *regardless of the nesting level*, and resume() restores the mode at the same nesting level. pause()/resume() are themselves allowed to nest, so we actually store two nesting levels in task_struct: enable_count and pause_count. A new helper is_lazy_mmu_mode_active() is introduced to determine whether we are currently in lazy MMU mode; this will be used in subsequent patches to replace the various ways arch's currently track whether the mode is enabled. In summary (enable/pause represent the values *after* the call): lazy_mmu_mode_enable() -> arch_enter() enable=1 pause=0 lazy_mmu_mode_enable() -> ΓΈ enable=2 pause=0 lazy_mmu_mode_pause() -> arch_leave() enable=2 pause=1 lazy_mmu_mode_resume() -> arch_enter() enable=2 pause=0 lazy_mmu_mode_disable() -> arch_flush() enable=1 pause=0 lazy_mmu_mode_disable() -> arch_leave() enable=0 pause=0 Note: is_lazy_mmu_mode_active() is added to to allow arch headers included by to use it. Link: https://lkml.kernel.org/r/20251215150323.2218608-10-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 12 ----- include/linux/mm_types_task.h | 5 ++ include/linux/pgtable.h | 114 ++++++++++++++++++++++++++++++++++++--- include/linux/sched.h | 45 ++++++++++++++++ 4 files changed, 157 insertions(+), 19 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bf9178902bdb..7f528c36d53c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -82,18 +82,6 @@ static inline void queue_pte_barriers(void) static inline void arch_enter_lazy_mmu_mode(void) { - /* - * lazy_mmu_mode is not supposed to permit nesting. But in practice this - * does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation - * inside a lazy_mmu_mode section (such as zap_pte_range()) will change - * permissions on the linear map with apply_to_page_range(), which - * re-enters lazy_mmu_mode. So we tolerate nesting in our - * implementation. The first call to arch_leave_lazy_mmu_mode() will - * flush and clear the flag such that the remainder of the work in the - * outer nest behaves as if outside of lazy mmu mode. This is safe and - * keeps tracking simple. 
- */ - set_thread_flag(TIF_LAZY_MMU); } diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index a82aa80c0ba4..11bf319d78ec 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -88,4 +88,9 @@ struct tlbflush_unmap_batch { #endif }; +struct lazy_mmu_state { + u8 enable_count; + u8 pause_count; +}; + #endif /* _LINUX_MM_TYPES_TASK_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index dddde6873d1e..2f0dd3a4ace1 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -236,39 +236,139 @@ static inline int pmd_dirty(pmd_t pmd) * The mode is disabled in interrupt context and calls to the lazy_mmu API have * no effect. * - * Nesting is not permitted. + * The lazy MMU mode is enabled for a given block of code using: + * + * lazy_mmu_mode_enable(); + * + * lazy_mmu_mode_disable(); + * + * Nesting is permitted: may itself use an enable()/disable() pair. + * A nested call to enable() has no functional effect; however disable() causes + * any batched architectural state to be flushed regardless of nesting. After a + * call to disable(), the caller can therefore rely on all previous page table + * modifications to have taken effect, but the lazy MMU mode may still be + * enabled. + * + * In certain cases, it may be desirable to temporarily pause the lazy MMU mode. + * This can be done using: + * + * lazy_mmu_mode_pause(); + * + * lazy_mmu_mode_resume(); + * + * pause() ensures that the mode is exited regardless of the nesting level; + * resume() re-enters the mode at the same nesting level. Any call to the + * lazy_mmu_mode_* API between those two calls has no effect. In particular, + * this means that pause()/resume() pairs may nest. + * + * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is + * currently enabled. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +/** + * lazy_mmu_mode_enable() - Enable the lazy MMU mode. + * + * Enters a new lazy MMU mode section; if the mode was not already enabled, + * enables it and calls arch_enter_lazy_mmu_mode(). + * + * Must be paired with a call to lazy_mmu_mode_disable(). + * + * Has no effect if called: + * - While paused - see lazy_mmu_mode_pause() + * - In interrupt context + */ static inline void lazy_mmu_mode_enable(void) { - if (in_interrupt()) + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + if (in_interrupt() || state->pause_count > 0) return; - arch_enter_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->enable_count == U8_MAX); + + if (state->enable_count++ == 0) + arch_enter_lazy_mmu_mode(); } +/** + * lazy_mmu_mode_disable() - Disable the lazy MMU mode. + * + * Exits the current lazy MMU mode section. If it is the outermost section, + * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested + * section), calls arch_flush_lazy_mmu_mode(). + * + * Must match a call to lazy_mmu_mode_enable(). + * + * Has no effect if called: + * - While paused - see lazy_mmu_mode_pause() + * - In interrupt context + */ static inline void lazy_mmu_mode_disable(void) { - if (in_interrupt()) + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + if (in_interrupt() || state->pause_count > 0) return; - arch_leave_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->enable_count == 0); + + if (--state->enable_count == 0) + arch_leave_lazy_mmu_mode(); + else /* Exiting a nested section */ + arch_flush_lazy_mmu_mode(); + } +/** + * lazy_mmu_mode_pause() - Pause the lazy MMU mode. 
+ * + * Pauses the lazy MMU mode; if it is currently active, disables it and calls + * arch_leave_lazy_mmu_mode(). + * + * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the + * lazy_mmu_mode_* API have no effect until the matching resume() call. + * + * Has no effect if called: + * - While paused (inside another pause()/resume() pair) + * - In interrupt context + */ static inline void lazy_mmu_mode_pause(void) { + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) return; - arch_leave_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->pause_count == U8_MAX); + + if (state->pause_count++ == 0 && state->enable_count > 0) + arch_leave_lazy_mmu_mode(); } +/** + * lazy_mmu_mode_resume() - Resume the lazy MMU mode. + * + * Resumes the lazy MMU mode; if it was active at the point where the matching + * call to lazy_mmu_mode_pause() was made, re-enables it and calls + * arch_enter_lazy_mmu_mode(). + * + * Must match a call to lazy_mmu_mode_pause(). + * + * Has no effect if called: + * - While paused (inside another pause()/resume() pair) + * - In interrupt context + */ static inline void lazy_mmu_mode_resume(void) { + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) return; - arch_enter_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->pause_count == 0); + + if (--state->pause_count == 0 && state->enable_count > 0) + arch_enter_lazy_mmu_mode(); } #else static inline void lazy_mmu_mode_enable(void) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index da0133524d08..6b563d4e68f6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1419,6 +1419,10 @@ struct task_struct { struct page_frag task_frag; +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE + struct lazy_mmu_state lazy_mmu_state; +#endif + #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif @@ -1702,6 +1706,47 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +/** + * __task_lazy_mmu_mode_active() - Test the lazy MMU mode state for a task. + * @tsk: The task to check. + * + * Test whether @tsk has its lazy MMU mode state set to active (i.e. enabled + * and not paused). + * + * This function only considers the state saved in task_struct; to test whether + * current actually is in lazy MMU mode, is_lazy_mmu_mode_active() should be + * used instead. + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. + */ +static inline bool __task_lazy_mmu_mode_active(struct task_struct *tsk) +{ + struct lazy_mmu_state *state = &tsk->lazy_mmu_state; + + return state->enable_count > 0 && state->pause_count == 0; +} + +/** + * is_lazy_mmu_mode_active() - Test whether we are currently in lazy MMU mode. + * + * Test whether the current context is in lazy MMU mode. This is true if both: + * 1. We are not in interrupt context + * 2. Lazy MMU mode is active for the current task + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. 
+ */ +static inline bool is_lazy_mmu_mode_active(void) +{ + if (in_interrupt()) + return false; + + return __task_lazy_mmu_mode_active(current); +} +#endif + extern struct pid *cad_pid; /* -- cgit v1.2.3 From 0a5ae4483177a621f5498c349d31f24b1ef10739 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:37 +1100 Subject: mm/page_table_check: provide addr parameter to page_table_check_ptes_set() To provide support for powerpc platforms, provide an addr parameter to the __page_table_check_ptes_set() and page_table_check_ptes_set() routines. This parameter is needed on some powerpc platforms which do not encode whether a mapping is for user or kernel in the pte. On such platforms, this can be inferred from the addr parameter. [ajd@linux.ibm.com: rebase on arm64 + riscv changes, update commit message] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-5-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Reviewed-by: Pasha Tatashin Acked-by: Alexandre Ghiti # riscv Signed-off-by: Andrew Donnellan Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 12 +++++++----- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 4 ++-- 5 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4b580d6246f5..d1dd0266bb0c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -681,7 +681,7 @@ static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr, switch (pgsize) { case PAGE_SIZE: - page_table_check_ptes_set(mm, ptep, pte, nr); + page_table_check_ptes_set(mm, addr, ptep, pte, nr); break; case PMD_SIZE: page_table_check_pmds_set(mm, addr, (pmd_t *)ptep, diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 82b1c79bc2dd..574a45a22454 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -627,7 +627,7 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval) static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pteval, nr); + page_table_check_ptes_set(mm, addr, ptep, pteval, nr); for (;;) { __set_pte_at(mm, ptep, pteval); diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index cf7c28d8d468..66e109238416 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,8 +17,8 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, - unsigned int nr); +void 
__page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr); void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, @@ -68,12 +68,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } static inline void page_table_check_ptes_set(struct mm_struct *mm, - pte_t *ptep, pte_t pte, unsigned int nr) + unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_ptes_set(mm, ptep, pte, nr); + __page_table_check_ptes_set(mm, addr, ptep, pte, nr); } static inline void page_table_check_pmds_set(struct mm_struct *mm, @@ -127,7 +128,8 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } static inline void page_table_check_ptes_set(struct mm_struct *mm, - pte_t *ptep, pte_t pte, unsigned int nr) + unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2f0dd3a4ace1..496873f44f67 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -429,7 +429,7 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pte, nr); + page_table_check_ptes_set(mm, addr, ptep, pte, nr); for (;;) { set_pte(ptep, pte); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 86dc4e4d1dad..2871d9c45368 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -196,8 +196,8 @@ static void page_table_check_pte_flags(pte_t pte) } } -void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, - unsigned int nr) +void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) { unsigned int i; -- cgit v1.2.3 From 2e6ac078ce5d6a9dc96cab861359faac508eb56d Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:38 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pud_clear() This reverts commit 931c38e16499 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pud_clear"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. 
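To make the motivation concrete: on such platforms the page_table_check machinery can only classify an entry as a user or kernel mapping by looking at the virtual address it covers, which is why the addr parameter is threaded through these helpers. Below is a minimal sketch of that idea; the helper name and the TASK_SIZE comparison are illustrative assumptions for this excerpt, not the actual powerpc implementation (which distinguishes user from kernel by effective-address region).

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Hypothetical illustration only: decide whether a page table entry maps
 * user memory purely from the virtual address it was installed at, for
 * architectures whose PTE format carries no user/kernel bit. The real
 * predicate is architecture-specific.
 */
static inline bool entry_maps_user_memory(unsigned long addr)
{
	return addr < TASK_SIZE;
}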
[ajd@linux.ibm.com: rebase on arm64 changes] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-6-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 +++++++---- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 +++-- 6 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d1dd0266bb0c..595405e6bfc7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1349,7 +1349,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - page_table_check_pud_clear(mm, pte_pud(pte)); + page_table_check_pud_clear(mm, address, pte_pud(pte)); break; #endif default: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 574a45a22454..e06727c975fe 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1101,7 +1101,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_clear(pudp); #endif - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, address, pud); return pud; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7fd876f8d828..3eb36a36058f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1330,7 +1330,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, addr, pud); return pud; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 66e109238416..808cc3a48c28 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -16,7 +16,8 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); -void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, @@ -59,12 +60,13 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) __page_table_check_pmd_clear(mm, pmd); } -static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned 
long addr, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_clear(mm, pud); + __page_table_check_pud_clear(mm, addr, pud); } static inline void page_table_check_ptes_set(struct mm_struct *mm, @@ -123,7 +125,8 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } -static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 496873f44f67..ed3c28ebeb35 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -801,7 +801,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, address, pud); return pud; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2871d9c45368..2295bc9368ab 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -167,7 +167,8 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) } EXPORT_SYMBOL(__page_table_check_pmd_clear); -void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) { if (&init_mm == mm) return; @@ -253,7 +254,7 @@ void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, return; for (i = 0; i < nr; i++) - __page_table_check_pud_clear(mm, *(pudp + i)); + __page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i)); if (pud_user_accessible_page(pud)) page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } -- cgit v1.2.3 From 649ec9e3d03c4908ef51731cd7b422c4a3e2ccff Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:39 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pmd_clear() This reverts commit 1831414cd729 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. 
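For orientation, after this group of patches the page_table_check entry points that set or clear entries all take the virtual address alongside the entry value. The prototypes below are collected from the hunks in this series (the pmd_clear change appears in the hunks that follow):

void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte, unsigned int nr);
void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
				  pmd_t pmd);
void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
				  pud_t pud);

Range helpers advance the address per entry (e.g. addr + PUD_SIZE * i in __page_table_check_puds_set() and addr + PMD_SIZE * i in __page_table_check_pmds_set()), so each entry is checked against the address it actually maps.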
From 649ec9e3d03c4908ef51731cd7b422c4a3e2ccff Mon Sep 17 00:00:00 2001
From: Rohan McLure
Date: Fri, 19 Dec 2025 04:09:39 +1100
Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pmd_clear()

This reverts commit 1831414cd729 ("mm/page_table_check: remove unused
parameter in [__]page_table_check_pmd_clear").

Reinstate previously unused parameters for the purpose of supporting
powerpc platforms, as many do not encode user/kernel ownership of the
page in the pte, but instead in the address of the access.

[ajd@linux.ibm.com: rebase on arm64 changes]
Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-7-755bc151a50b@linux.ibm.com
Signed-off-by: Rohan McLure
Signed-off-by: Andrew Donnellan
Reviewed-by: Pasha Tatashin
Acked-by: Ingo Molnar # x86
Acked-by: Alexandre Ghiti # riscv
Cc: Alexander Gordeev
Cc: Alexandre Ghiti
Cc: Alistair Popple
Cc: Christophe Leroy
Cc: "Christophe Leroy (CS GROUP)"
Cc: David Hildenbrand
Cc: Donet Tom
Cc: Guo Weikang
Cc: Jason Gunthorpe
Cc: Kevin Brodsky
Cc: Madhavan Srinivasan
Cc: Magnus Lindholm
Cc: "Matthew Wilcox (Oracle)"
Cc: Michael Ellerman
Cc: Nicholas Miehlbradt
Cc: Nicholas Piggin
Cc: Paul Mackerras
Cc: Qi Zheng
Cc: "Ritesh Harjani (IBM)"
Cc: Sweet Tea Dorminy
Cc: Thomas Huth
Cc: "Vishal Moola (Oracle)"
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 arch/arm64/include/asm/pgtable.h |  2 +-
 arch/riscv/include/asm/pgtable.h |  2 +-
 arch/x86/include/asm/pgtable.h   |  2 +-
 include/linux/page_table_check.h | 11 +++++++----
 include/linux/pgtable.h          |  2 +-
 mm/page_table_check.c            |  5 +++--
 6 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include/linux/pgtable.h')

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 595405e6bfc7..5abad90913eb 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1345,7 +1345,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm,
 		page_table_check_pte_clear(mm, pte);
 		break;
 	case PMD_SIZE:
-		page_table_check_pmd_clear(mm, pte_pmd(pte));
+		page_table_check_pmd_clear(mm, address, pte_pmd(pte));
 		break;
 #ifndef __PAGETABLE_PMD_FOLDED
 	case PUD_SIZE:
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index e06727c975fe..6464a2c18ebe 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -1007,7 +1007,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 	pmd_clear(pmdp);
 #endif
 
-	page_table_check_pmd_clear(mm, pmd);
+	page_table_check_pmd_clear(mm, address, pmd);
 
 	return pmd;
 }
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 3eb36a36058f..5a2b2d3a80d8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1319,7 +1319,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long
 {
 	pmd_t pmd = native_pmdp_get_and_clear(pmdp);
 
-	page_table_check_pmd_clear(mm, pmd);
+	page_table_check_pmd_clear(mm, addr, pmd);
 
 	return pmd;
 }
diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h
index 808cc3a48c28..3973b69ae294 100644
--- a/include/linux/page_table_check.h
+++ b/include/linux/page_table_check.h
@@ -15,7 +15,8 @@ extern struct page_ext_operations page_table_check_ops;
 
 void __page_table_check_zero(struct page *page, unsigned int order);
 void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte);
-void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd);
+void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
+				  pmd_t pmd);
 void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
 				  pud_t pud);
 void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr,
@@ -52,12 +53,13 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
 	__page_table_check_pte_clear(mm, pte);
 }
 
-static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
+static inline void page_table_check_pmd_clear(struct mm_struct *mm,
+					      unsigned long addr, pmd_t pmd)
 {
 	if (static_branch_likely(&page_table_check_disabled))
 		return;
 
-	__page_table_check_pmd_clear(mm, pmd);
+	__page_table_check_pmd_clear(mm, addr, pmd);
 }
 
 static inline void page_table_check_pud_clear(struct mm_struct *mm,
@@ -121,7 +123,8 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
 {
 }
 
-static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
+static inline void page_table_check_pmd_clear(struct mm_struct *mm,
+					      unsigned long addr, pmd_t pmd)
 {
 }
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index ed3c28ebeb35..2d1f7369624c 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -788,7 +788,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 	pmd_t pmd = *pmdp;
 
 	pmd_clear(pmdp);
-	page_table_check_pmd_clear(mm, pmd);
+	page_table_check_pmd_clear(mm, address, pmd);
 
 	return pmd;
 }
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 2295bc9368ab..e8280b0b6dda 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -156,7 +156,8 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
 }
 EXPORT_SYMBOL(__page_table_check_pte_clear);
 
-void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
+void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
+				  pmd_t pmd)
 {
 	if (&init_mm == mm)
 		return;
@@ -238,7 +239,7 @@ void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr,
 	page_table_check_pmd_flags(pmd);
 
 	for (i = 0; i < nr; i++)
-		__page_table_check_pmd_clear(mm, *(pmdp + i));
+		__page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i));
 	if (pmd_user_accessible_page(pmd))
 		page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
 }
-- cgit v1.2.3
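One detail worth spelling out from the hunk above: when
__page_table_check_pmds_set() overwrites nr contiguous entries, each old
entry is reported as cleared at its own virtual address, so the address
has to advance by PMD_SIZE per iteration. The same bookkeeping, written
out with an illustrative helper name (a sketch, not code from this
series):

/*
 * Sketch only.  Entry i of a contiguous batch maps the PMD_SIZE-sized
 * chunk starting at addr + i * PMD_SIZE, and its previous contents must
 * be reported as cleared at that address.
 */
static void example_report_cleared_pmds(struct mm_struct *mm,
					unsigned long addr,
					pmd_t *pmdp, unsigned int nr)
{
	unsigned int i;

	for (i = 0; i < nr; i++)
		__page_table_check_pmd_clear(mm, addr + PMD_SIZE * i,
					     *(pmdp + i));
}
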
From d7b4b67eb6b37aef1723a69add88c9a7add81308 Mon Sep 17 00:00:00 2001
From: Rohan McLure
Date: Fri, 19 Dec 2025 04:09:40 +1100
Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pte_clear()

This reverts commit aa232204c468 ("mm/page_table_check: remove unused
parameter in [__]page_table_check_pte_clear").

Reinstate previously unused parameters for the purpose of supporting
powerpc platforms, as many do not encode user/kernel ownership of the
page in the pte, but instead in the address of the access.

[ajd@linux.ibm.com: rebase, fix additional occurrence and loop handling]
Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-8-755bc151a50b@linux.ibm.com
Signed-off-by: Rohan McLure
Signed-off-by: Andrew Donnellan
Reviewed-by: Pasha Tatashin
Acked-by: Ingo Molnar # x86
Acked-by: Alexandre Ghiti # riscv
Cc: Alexander Gordeev
Cc: Alexandre Ghiti
Cc: Alistair Popple
Cc: Christophe Leroy
Cc: "Christophe Leroy (CS GROUP)"
Cc: David Hildenbrand
Cc: Donet Tom
Cc: Guo Weikang
Cc: Jason Gunthorpe
Cc: Kevin Brodsky
Cc: Madhavan Srinivasan
Cc: Magnus Lindholm
Cc: "Matthew Wilcox (Oracle)"
Cc: Michael Ellerman
Cc: Nicholas Miehlbradt
Cc: Nicholas Piggin
Cc: Paul Mackerras
Cc: Qi Zheng
Cc: "Ritesh Harjani (IBM)"
Cc: Sweet Tea Dorminy
Cc: Thomas Huth
Cc: "Vishal Moola (Oracle)"
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 arch/arm64/include/asm/pgtable.h |  2 +-
 arch/riscv/include/asm/pgtable.h |  2 +-
 arch/x86/include/asm/pgtable.h   |  4 ++--
 include/linux/page_table_check.h | 11 +++++++----
 include/linux/pgtable.h          |  4 ++--
 mm/page_table_check.c            |  7 ++++---
 6 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include/linux/pgtable.h')

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 5abad90913eb..ce64c560e284 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1342,7 +1342,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm,
 
 	switch (pgsize) {
 	case PAGE_SIZE:
-		page_table_check_pte_clear(mm, pte);
+		page_table_check_pte_clear(mm, address, pte);
 		break;
 	case PMD_SIZE:
 		page_table_check_pmd_clear(mm, address, pte_pmd(pte));
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 6464a2c18ebe..e3618d789aa4 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -664,7 +664,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	set_pte(ptep, __pte(0));
 #endif
 
-	page_table_check_pte_clear(mm, pte);
+	page_table_check_pte_clear(mm, address, pte);
 
 	return pte;
 }
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5a2b2d3a80d8..6ec6cf7ad2d4 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1252,7 +1252,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 				       pte_t *ptep)
 {
 	pte_t pte = native_ptep_get_and_clear(ptep);
-	page_table_check_pte_clear(mm, pte);
+	page_table_check_pte_clear(mm, addr, pte);
 	return pte;
 }
 
@@ -1268,7 +1268,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
 		 * care about updates and native needs no locking
 		 */
 		pte = native_local_ptep_get_and_clear(ptep);
-		page_table_check_pte_clear(mm, pte);
+		page_table_check_pte_clear(mm, addr, pte);
 	} else {
 		pte = ptep_get_and_clear(mm, addr, ptep);
 	}
diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h
index 3973b69ae294..12268a32e8be 100644
--- a/include/linux/page_table_check.h
+++ b/include/linux/page_table_check.h
@@ -14,7 +14,8 @@ extern struct static_key_true page_table_check_disabled;
 extern struct page_ext_operations page_table_check_ops;
 
 void __page_table_check_zero(struct page *page, unsigned int order);
-void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte);
+void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
+				  pte_t pte);
 void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
 				  pmd_t pmd);
 void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
@@ -45,12 +46,13 @@ static inline void page_table_check_free(struct page *page, unsigned int order)
 	__page_table_check_zero(page, order);
 }
 
-static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
+static inline void page_table_check_pte_clear(struct mm_struct *mm,
+					      unsigned long addr, pte_t pte)
 {
 	if (static_branch_likely(&page_table_check_disabled))
 		return;
 
-	__page_table_check_pte_clear(mm, pte);
+	__page_table_check_pte_clear(mm, addr, pte);
 }
 
 static inline void page_table_check_pmd_clear(struct mm_struct *mm,
@@ -119,7 +121,8 @@ static inline void page_table_check_free(struct page *page, unsigned int order)
 {
 }
 
-static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
+static inline void page_table_check_pte_clear(struct mm_struct *mm,
+					      unsigned long addr, pte_t pte)
 {
 }
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2d1f7369624c..827dca25c0bc 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -634,7 +634,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 {
 	pte_t pte = ptep_get(ptep);
 	pte_clear(mm, address, ptep);
-	page_table_check_pte_clear(mm, pte);
+	page_table_check_pte_clear(mm, address, pte);
 	return pte;
 }
 #endif
@@ -693,7 +693,7 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
 	 * No need for ptep_get_and_clear(): page table check doesn't care about
 	 * any bits that could have been set by HW concurrently.
 	 */
-	page_table_check_pte_clear(mm, pte);
+	page_table_check_pte_clear(mm, addr, pte);
 }
 
 #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index e8280b0b6dda..de9e54bd27e6 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -145,7 +145,8 @@ void __page_table_check_zero(struct page *page, unsigned int order)
 	rcu_read_unlock();
 }
 
-void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
+void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
+				  pte_t pte)
 {
 	if (&init_mm == mm)
 		return;
@@ -209,7 +210,7 @@ void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr,
 	page_table_check_pte_flags(pte);
 
 	for (i = 0; i < nr; i++)
-		__page_table_check_pte_clear(mm, ptep_get(ptep + i));
+		__page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i));
 	if (pte_user_accessible_page(pte))
 		page_table_check_set(pte_pfn(pte), nr, pte_write(pte));
 }
@@ -275,7 +276,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
 	if (WARN_ON(!ptep))
 		return;
 	for (i = 0; i < PTRS_PER_PTE; i++) {
-		__page_table_check_pte_clear(mm, ptep_get(ptep));
+		__page_table_check_pte_clear(mm, addr, ptep_get(ptep));
 		addr += PAGE_SIZE;
 		ptep++;
 	}
-- cgit v1.2.3
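Taken together, the three reverts change the calling convention for all
page_table_check_*_clear() helpers: any caller tearing down a mapping is
now expected to pass the virtual address being unmapped along with the
old entry. A minimal usage sketch for the PTE case, using a hypothetical
wrapper name (the generic ptep_get_and_clear() in the diff above has the
same shape):

/* Illustrative wrapper only; the name is not from this series. */
static inline pte_t example_clear_one_pte(struct mm_struct *mm,
					  unsigned long addr, pte_t *ptep)
{
	pte_t pte = ptep_get(ptep);

	pte_clear(mm, addr, ptep);
	/* The virtual address is now part of the reporting interface. */
	page_table_check_pte_clear(mm, addr, pte);

	return pte;
}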