From f2be745071ffd6793c032ca8443348c3ce0e3e18 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:14 +0000 Subject: mm: clarify lazy_mmu sleeping constraints The lazy MMU mode documentation makes clear that an implementation should not assume that preemption is disabled or any lock is held upon entry to the mode; however it says nothing about what code using the lazy MMU interface should expect. In practice sleeping is forbidden (for generic code) while the lazy MMU mode is active: say it explicitly. Link: https://lkml.kernel.org/r/20251215150323.2218608-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Yeoreum Yun Acked-by: David Hildenbrand (Red Hat) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 652f287c1ef6..1abc4a1c3d72 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -225,11 +225,15 @@ static inline int pmd_dirty(pmd_t pmd) * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit - * of the lazy mode. So the implementation must assume preemption may be enabled - * and cpu migration is possible; it must take steps to be robust against this. - * (In practice, for user PTE updates, the appropriate page table lock(s) are - * held, but for kernel PTE updates, no lock is held). Nesting is not permitted - * and the mode cannot be used in interrupt context. + * of the lazy mode. (In practice, for user PTE updates, the appropriate page + * table lock(s) are held, but for kernel PTE updates, no lock is held). + * The implementation must therefore assume preemption may be enabled upon + * entry to the mode and cpu migration is possible; it must take steps to be + * robust against this. An implementation may handle this by disabling + * preemption, as a consequence generic code may not sleep while the lazy MMU + * mode is active. + * + * Nesting is not permitted and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} -- cgit v1.2.3 From 7303ecbfe4f46c00191b9b66acaa918784bad210 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:15 +0000 Subject: mm: introduce CONFIG_ARCH_HAS_LAZY_MMU_MODE Architectures currently opt in for implementing lazy_mmu helpers by defining __HAVE_ARCH_ENTER_LAZY_MMU_MODE. In preparation for introducing a generic lazy_mmu layer that will require storage in task_struct, let's switch to a cleaner approach: instead of defining a macro, select a CONFIG option. This patch introduces CONFIG_ARCH_HAS_LAZY_MMU_MODE and has each arch select it when it implements lazy_mmu helpers. __HAVE_ARCH_ENTER_LAZY_MMU_MODE is removed and relies on the new CONFIG instead. On x86, lazy_mmu helpers are only implemented if PARAVIRT_XXL is selected. 
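The sleeping constraint documented above follows from how such an implementation typically works: PTE updates are queued in per-CPU state and only submitted when the mode is flushed or left, so the implementation may well disable preemption for the duration of the mode. A minimal sketch of that shape, with hypothetical names (this is not the actual Xen/x86 code):

  #include <linux/percpu.h>
  #include <linux/preempt.h>

  /*
   * Hypothetical batching implementation of the lazy MMU hooks.  The
   * batch lives in per-CPU data, so the task must stay on the same CPU
   * (and therefore must not sleep) while the mode is active.
   */
  struct example_pte_batch {
          unsigned int nr;        /* number of queued PTE updates */
  };

  static DEFINE_PER_CPU(struct example_pte_batch, example_pte_batch);

  static inline void example_enter_lazy_mmu_mode(void)
  {
          preempt_disable();      /* keep the batch CPU-local */
          this_cpu_write(example_pte_batch.nr, 0);
  }

  static inline void example_flush_lazy_mmu_mode(void)
  {
          /* submit all queued updates (e.g. in a single hypercall) */
          this_cpu_write(example_pte_batch.nr, 0);
  }

  static inline void example_leave_lazy_mmu_mode(void)
  {
          example_flush_lazy_mmu_mode();
          preempt_enable();
  }

As noted above, x86 only provides such hooks when PARAVIRT_XXL is selected.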
This creates some complications in arch/x86/boot/, because a few files manually undefine PARAVIRT* options. As a result does not define the lazy_mmu helpers, but this breaks the build as only defines them if !CONFIG_ARCH_HAS_LAZY_MMU_MODE. There does not seem to be a clean way out of this - let's just undefine that new CONFIG too. Link: https://lkml.kernel.org/r/20251215150323.2218608-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Ryan Roberts Reviewed-by: Yeoreum Yun Acked-by: Andreas Larsson [sparc] Cc: Alexander Gordeev Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 2 -- arch/powerpc/platforms/Kconfig.cputype | 1 + arch/sparc/Kconfig | 1 + arch/sparc/include/asm/tlbflush_64.h | 2 -- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/misc.h | 1 + arch/x86/boot/startup/sme.c | 1 + arch/x86/include/asm/paravirt.h | 1 - include/linux/pgtable.h | 2 +- mm/Kconfig | 7 +++++++ 12 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 93173f0a09c7..3fb4603c0e16 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -35,6 +35,7 @@ config ARM64 select ARCH_HAS_KCOV select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON select ARCH_HAS_KEEPINITRD + select ARCH_HAS_LAZY_MMU_MODE select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEM_ENCRYPT select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 64d5f1d9cce9..f7d66c261347 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -80,7 +80,6 @@ static inline void queue_pte_barriers(void) } } -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { /* diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 2d45f57df169..565c1b7c3eae 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -24,8 +24,6 @@ DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE - static inline void arch_enter_lazy_mmu_mode(void) { struct ppc64_tlb_batch *batch; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 4c321a8ea896..f399917c17bd 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -93,6 +93,7 @@ config PPC_BOOK3S_64 select IRQ_WORK select PPC_64S_HASH_MMU if !PPC_RADIX_MMU select KASAN_VMALLOC if KASAN + select ARCH_HAS_LAZY_MMU_MODE config PPC_BOOK3E_64 bool "Embedded processors" diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index a630d373e645..2bad14744ca4 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -112,6 +112,7 @@ 
config SPARC64 select NEED_PER_CPU_PAGE_FIRST_CHUNK select ARCH_SUPPORTS_SCHED_SMT if SMP select ARCH_SUPPORTS_SCHED_MC if SMP + select ARCH_HAS_LAZY_MMU_MODE config ARCH_PROC_KCORE_TEXT def_bool y diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index 925bb5d7a4e1..4e1036728e2f 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -39,8 +39,6 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, void flush_tlb_kernel_range(unsigned long start, unsigned long end); -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE - void flush_tlb_pending(void); void arch_enter_lazy_mmu_mode(void); void arch_flush_lazy_mmu_mode(void); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 80527299f859..2427a66cb0fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -808,6 +808,7 @@ config PARAVIRT config PARAVIRT_XXL bool depends on X86_64 + select ARCH_HAS_LAZY_MMU_MODE config PARAVIRT_DEBUG bool "paravirt-ops debugging" diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index fd855e32c9b9..4f86c5903e03 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -11,6 +11,7 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_ARCH_HAS_LAZY_MMU_MODE #undef CONFIG_KASAN #undef CONFIG_KASAN_GENERIC diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index e7ea65f3f1d6..b76a7c95dfe1 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -24,6 +24,7 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_ARCH_HAS_LAZY_MMU_MODE /* * This code runs before CPU feature bits are set. By default, the diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index b5e59a7ba0d0..13f9cd31c8f8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -526,7 +526,6 @@ static inline void arch_end_context_switch(struct task_struct *next) PVOP_VCALL1(cpu.end_context_switch, next); } -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 1abc4a1c3d72..d46d86959bd6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -235,7 +235,7 @@ static inline int pmd_dirty(pmd_t pmd) * * Nesting is not permitted and the mode cannot be used in interrupt context. */ -#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} static inline void arch_leave_lazy_mmu_mode(void) {} static inline void arch_flush_lazy_mmu_mode(void) {} diff --git a/mm/Kconfig b/mm/Kconfig index a992f2203eb9..7c2520e6a6b3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1468,6 +1468,13 @@ config PT_RECLAIM config FIND_NORMAL_PAGE def_bool n +config ARCH_HAS_LAZY_MMU_MODE + bool + help + The architecture uses the lazy MMU mode. This allows changes to + MMU-related architectural state to be deferred until the mode is + exited. See for details. + source "mm/damon/Kconfig" endmenu -- cgit v1.2.3 From 0a096ab7a3a6e2859c3c88988e548c5c213138bc Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:16 +0000 Subject: mm: introduce generic lazy_mmu helpers The implementation of the lazy MMU mode is currently entirely arch-specific; core code directly calls arch helpers: arch_{enter,leave}_lazy_mmu_mode(). 
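For reference, the call pattern being wrapped looks roughly like this in core code (condensed from the mm/memory.c-style sites updated below; the function name is made up and error handling is trimmed):

  static int example_remap_ptes(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                unsigned long pfn, pgprot_t prot)
  {
          spinlock_t *ptl;
          pte_t *pte;

          pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
          if (!pte)
                  return -ENOMEM;

          arch_enter_lazy_mmu_mode();     /* start batching PTE updates */
          do {
                  set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
                  pfn++;
          } while (pte++, addr += PAGE_SIZE, addr != end);
          arch_leave_lazy_mmu_mode();     /* flush the batched updates */

          pte_unmap_unlock(pte - 1, ptl);
          return 0;
  }

After this patch, the enter()/leave() pair above is replaced by lazy_mmu_mode_enable()/lazy_mmu_mode_disable(), and only the generic layer calls the arch_* hooks.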
We are about to introduce support for nested lazy MMU sections. As things stand we'd have to duplicate that logic in every arch implementing lazy_mmu - adding to a fair amount of logic already duplicated across lazy_mmu implementations. This patch therefore introduces a new generic layer that calls the existing arch_* helpers. Two pair of calls are introduced: * lazy_mmu_mode_enable() ... lazy_mmu_mode_disable() This is the standard case where the mode is enabled for a given block of code by surrounding it with enable() and disable() calls. * lazy_mmu_mode_pause() ... lazy_mmu_mode_resume() This is for situations where the mode is temporarily disabled by first calling pause() and then resume() (e.g. to prevent any batching from occurring in a critical section). The documentation in will be updated in a subsequent patch. No functional change should be introduced at this stage. The implementation of enable()/resume() and disable()/pause() is currently identical, but nesting support will change that. Most of the call sites have been updated using the following Coccinelle script: @@ @@ { ... - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ... - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); ... } @@ @@ { ... - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); ... - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); ... } A couple of notes regarding x86: * Xen is currently the only case where explicit handling is required for lazy MMU when context-switching. This is purely an implementation detail and using the generic lazy_mmu_mode_* functions would cause trouble when nesting support is introduced, because the generic functions must be called from the current task. For that reason we still use arch_leave() and arch_enter() there. * x86 calls arch_flush_lazy_mmu_mode() unconditionally in a few places, but only defines it if PARAVIRT_XXL is selected, and we are removing the fallback in . Add a new fallback definition to to keep things building. Link: https://lkml.kernel.org/r/20251215150323.2218608-8-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 8 ++++---- arch/arm64/mm/pageattr.c | 4 ++-- arch/powerpc/mm/book3s64/hash_tlb.c | 8 ++++---- arch/powerpc/mm/book3s64/subpage_prot.c | 4 ++-- arch/x86/include/asm/pgtable.h | 1 + fs/proc/task_mmu.c | 4 ++-- include/linux/pgtable.h | 29 +++++++++++++++++++++++++---- mm/kasan/shadow.c | 8 ++++---- mm/madvise.c | 18 +++++++++--------- mm/memory.c | 16 ++++++++-------- mm/migrate_device.c | 8 ++++---- mm/mprotect.c | 4 ++-- mm/mremap.c | 4 ++-- mm/userfaultfd.c | 4 ++-- mm/vmalloc.c | 12 ++++++------ mm/vmscan.c | 12 ++++++------ 16 files changed, 83 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8e1d80a7033e..a6a00accf4f9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -800,7 +800,7 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) return -EINVAL; mutex_lock(&pgtable_split_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); /* * The split_kernel_leaf_mapping_locked() may sleep, it is not a @@ -822,7 +822,7 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) ret = split_kernel_leaf_mapping_locked(end); } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); mutex_unlock(&pgtable_split_lock); return ret; } @@ -883,10 +883,10 @@ static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp { int ret; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ret = walk_kernel_page_table_range_lockless(start, end, &split_to_ptes_ops, NULL, &gfp); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); return ret; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 7176ff39cb87..358d1dc9a576 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -110,7 +110,7 @@ static int update_range_prot(unsigned long start, unsigned long size, if (WARN_ON_ONCE(ret)) return ret; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); /* * The caller must ensure that the range we are operating on does not @@ -119,7 +119,7 @@ static int update_range_prot(unsigned long start, unsigned long size, */ ret = walk_kernel_page_table_range_lockless(start, start + size, &pageattr_ops, NULL, &data); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); return ret; } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 21fcad97ae80..787f7a0e27f0 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -205,7 +205,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) * way to do things but is fine for our needs here. 
*/ local_irq_save(flags); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; start < end; start += PAGE_SIZE) { pte_t *ptep = find_init_mm_pte(start, &hugepage_shift); unsigned long pte; @@ -217,7 +217,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) continue; hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift); } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); local_irq_restore(flags); } @@ -237,7 +237,7 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long * way to do things but is fine for our needs here. */ local_irq_save(flags); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); start_pte = pte_offset_map(pmd, addr); if (!start_pte) goto out; @@ -249,6 +249,6 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long } pte_unmap(start_pte); out: - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index ec98e526167e..07c47673bba2 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -73,13 +73,13 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; npages > 0; --npages) { pte_update(mm, addr, pte, 0, 0, 0); addr += PAGE_SIZE; ++pte; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e33df3da6980..2842fa1f7a2c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -118,6 +118,7 @@ extern pmdval_t early_pmd_flags; #define __pte(x) native_make_pte(x) #define arch_end_context_switch(prev) do {} while(0) +static inline void arch_flush_lazy_mmu_mode(void) {} #endif /* CONFIG_PARAVIRT_XXL */ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 81dfc26bfae8..480db575553e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2739,7 +2739,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, return 0; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ @@ -2809,7 +2809,7 @@ flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d46d86959bd6..116a18b7916c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -235,10 +235,31 @@ static inline int pmd_dirty(pmd_t pmd) * * Nesting is not permitted and the mode cannot be used in interrupt context. 
*/ -#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE -static inline void arch_enter_lazy_mmu_mode(void) {} -static inline void arch_leave_lazy_mmu_mode(void) {} -static inline void arch_flush_lazy_mmu_mode(void) {} +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +static inline void lazy_mmu_mode_enable(void) +{ + arch_enter_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_disable(void) +{ + arch_leave_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_pause(void) +{ + arch_leave_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_resume(void) +{ + arch_enter_lazy_mmu_mode(); +} +#else +static inline void lazy_mmu_mode_enable(void) {} +static inline void lazy_mmu_mode_disable(void) {} +static inline void lazy_mmu_mode_pause(void) {} +static inline void lazy_mmu_mode_resume(void) {} #endif #ifndef pte_batch_hint diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 32fbdf759ea2..d286e0a04543 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -305,7 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); index = PFN_DOWN(addr - data->start); page = data->pages[index]; @@ -319,7 +319,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, } spin_unlock(&init_mm.page_table_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } @@ -471,7 +471,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int none; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); spin_lock(&init_mm.page_table_lock); pte = ptep_get(ptep); @@ -483,7 +483,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, if (likely(!none)) __free_page(pfn_to_page(pte_pfn(pte))); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index b617b1be0f53..6bf7009fa5ce 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -453,7 +453,7 @@ restart: if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); @@ -461,7 +461,7 @@ restart: if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; @@ -497,7 +497,7 @@ restart: if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -508,7 +508,7 @@ restart: if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -556,7 +556,7 @@ restart: } if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } if (pageout) @@ -675,7 +675,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); @@ -724,7 +724,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -735,7 +735,7 @@ static int 
madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -775,7 +775,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); diff --git a/mm/memory.c b/mm/memory.c index da360a6eb8a4..e0bce673f053 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1256,7 +1256,7 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr = 1; @@ -1325,7 +1325,7 @@ again: } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); @@ -1846,7 +1846,7 @@ retry: return addr; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { bool any_skipped = false; @@ -1878,7 +1878,7 @@ retry: direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { @@ -2816,7 +2816,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { @@ -2826,7 +2826,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(mapped_pte, ptl); return err; } @@ -3177,7 +3177,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, return -EINVAL; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (fn) { do { @@ -3190,7 +3190,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } *mask |= PGTBL_PTE_MODIFIED; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 23379663b1e1..0346c2d7819f 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -271,7 +271,7 @@ again: ptep = pte_offset_map_lock(mm, pmdp, start, &ptl); if (!ptep) goto again; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ptep += (addr - start) / PAGE_SIZE; for (; addr < end; addr += PAGE_SIZE, ptep++) { @@ -313,7 +313,7 @@ again: if (folio_test_large(folio)) { int ret; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep, ptl); ret = migrate_vma_split_folio(folio, migrate->fault_page); @@ -356,7 +356,7 @@ again: if (folio && folio_test_large(folio)) { int ret; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep, ptl); ret = migrate_vma_split_folio(folio, migrate->fault_page); @@ -485,7 +485,7 @@ next: if (unmapped) flush_tlb_range(walk->vma, start, end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep - 1, ptl); return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index 283889e4f1ce..c0571445bef7 100644 --- 
a/mm/mprotect.c +++ b/mm/mprotect.c @@ -233,7 +233,7 @@ static long change_pte_range(struct mmu_gather *tlb, is_private_single_threaded = vma_is_single_threaded_private(vma); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr_ptes = 1; oldpte = ptep_get(pte); @@ -379,7 +379,7 @@ static long change_pte_range(struct mmu_gather *tlb, } } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); return pages; diff --git a/mm/mremap.c b/mm/mremap.c index 672264807db6..8275b9772ec1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -260,7 +260,7 @@ static int move_ptes(struct pagetable_move_control *pmc, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE, new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) { @@ -305,7 +305,7 @@ static int move_ptes(struct pagetable_move_control *pmc, } } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (force_flush) flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e6dfd5f28acd..b11f81095fa5 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1103,7 +1103,7 @@ static long move_present_ptes(struct mm_struct *mm, /* It's safe to drop the reference now as the page-table is holding one. */ folio_put(*first_src_folio); *first_src_folio = NULL; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); while (true) { orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1140,7 +1140,7 @@ static long move_present_ptes(struct mm_struct *mm, break; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (src_addr > src_start) flush_tlb_range(src_vma, src_start, src_addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 429a893b0505..32d6ee92d4ff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { if (unlikely(!pte_none(ptep_get(pte)))) { @@ -134,7 +134,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -371,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { #ifdef CONFIG_HUGETLB_PAGE @@ -390,7 +390,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; } @@ -538,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { struct page *page = pages[*nr]; @@ -560,7 +560,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= 
PGTBL_PTE_MODIFIED; return err; diff --git a/mm/vmscan.c b/mm/vmscan.c index 614ccf39fe3f..6cf5ee94be7a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3516,7 +3516,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, return false; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -3557,7 +3557,7 @@ restart: if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); @@ -3598,7 +3598,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (!spin_trylock(ptl)) goto done; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { unsigned long pfn; @@ -3645,7 +3645,7 @@ next: walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); spin_unlock(ptl); done: *first = -1; @@ -4244,7 +4244,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; @@ -4278,7 +4278,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) -- cgit v1.2.3 From 9273dfaeaca8ea4d88c7e9fd081922a029984fd4 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:17 +0000 Subject: mm: bail out of lazy_mmu_mode_* in interrupt context The lazy MMU mode cannot be used in interrupt context. This is documented in , but isn't consistently handled across architectures. arm64 ensures that calls to lazy_mmu_mode_* have no effect in interrupt context, because such calls do occur in certain configurations - see commit b81c688426a9 ("arm64/mm: Disable barrier batching in interrupt contexts"). Other architectures do not check this situation, most likely because it hasn't occurred so far. Let's handle this in the new generic lazy_mmu layer, in the same fashion as arm64: bail out of lazy_mmu_mode_* if in_interrupt(). Also remove the arm64 handling that is now redundant. Both arm64 and x86/Xen also ensure that any lazy MMU optimisation is disabled while in interrupt (see queue_pte_barriers() and xen_get_lazy_mode() respectively). This will be handled in the generic layer in a subsequent patch. Link: https://lkml.kernel.org/r/20251215150323.2218608-9-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 9 --------- include/linux/pgtable.h | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f7d66c261347..bf9178902bdb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -94,26 +94,17 @@ static inline void arch_enter_lazy_mmu_mode(void) * keeps tracking simple. */ - if (in_interrupt()) - return; - set_thread_flag(TIF_LAZY_MMU); } static inline void arch_flush_lazy_mmu_mode(void) { - if (in_interrupt()) - return; - if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING)) emit_pte_barriers(); } static inline void arch_leave_lazy_mmu_mode(void) { - if (in_interrupt()) - return; - arch_flush_lazy_mmu_mode(); clear_thread_flag(TIF_LAZY_MMU); } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 116a18b7916c..dddde6873d1e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -233,26 +233,41 @@ static inline int pmd_dirty(pmd_t pmd) * preemption, as a consequence generic code may not sleep while the lazy MMU * mode is active. * - * Nesting is not permitted and the mode cannot be used in interrupt context. + * The mode is disabled in interrupt context and calls to the lazy_mmu API have + * no effect. + * + * Nesting is not permitted. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline void lazy_mmu_mode_enable(void) { + if (in_interrupt()) + return; + arch_enter_lazy_mmu_mode(); } static inline void lazy_mmu_mode_disable(void) { + if (in_interrupt()) + return; + arch_leave_lazy_mmu_mode(); } static inline void lazy_mmu_mode_pause(void) { + if (in_interrupt()) + return; + arch_leave_lazy_mmu_mode(); } static inline void lazy_mmu_mode_resume(void) { + if (in_interrupt()) + return; + arch_enter_lazy_mmu_mode(); } #else -- cgit v1.2.3 From 5ab246749569cff9f815618f02ba0d7cf20e5edd Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:18 +0000 Subject: mm: enable lazy_mmu sections to nest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite recent efforts to prevent lazy_mmu sections from nesting, it remains difficult to ensure that it never occurs - and in fact it does occur on arm64 in certain situations (CONFIG_DEBUG_PAGEALLOC). Commit 1ef3095b1405 ("arm64/mm: Permit lazy_mmu_mode to be nested") made nesting tolerable on arm64, but without truly supporting it: the inner call to leave() disables the batching optimisation before the outer section ends. This patch actually enables lazy_mmu sections to nest by tracking the nesting level in task_struct, in a similar fashion to e.g. pagefault_{enable,disable}(). This is fully handled by the generic lazy_mmu helpers that were recently introduced. lazy_mmu sections were not initially intended to nest, so we need to clarify the semantics w.r.t. the arch_*_lazy_mmu_mode() callbacks. This patch takes the following approach: * The outermost calls to lazy_mmu_mode_{enable,disable}() trigger calls to arch_{enter,leave}_lazy_mmu_mode() - this is unchanged. 
* Nested calls to lazy_mmu_mode_{enable,disable}() are not forwarded to the arch via arch_{enter,leave} - lazy MMU remains enabled so the assumption is that these callbacks are not relevant. However, existing code may rely on a call to disable() to flush any batched state, regardless of nesting. arch_flush_lazy_mmu_mode() is therefore called in that situation. A separate interface was recently introduced to temporarily pause the lazy MMU mode: lazy_mmu_mode_{pause,resume}(). pause() fully exits the mode *regardless of the nesting level*, and resume() restores the mode at the same nesting level. pause()/resume() are themselves allowed to nest, so we actually store two nesting levels in task_struct: enable_count and pause_count. A new helper is_lazy_mmu_mode_active() is introduced to determine whether we are currently in lazy MMU mode; this will be used in subsequent patches to replace the various ways arch's currently track whether the mode is enabled. In summary (enable/pause represent the values *after* the call): lazy_mmu_mode_enable() -> arch_enter() enable=1 pause=0 lazy_mmu_mode_enable() -> ø enable=2 pause=0 lazy_mmu_mode_pause() -> arch_leave() enable=2 pause=1 lazy_mmu_mode_resume() -> arch_enter() enable=2 pause=0 lazy_mmu_mode_disable() -> arch_flush() enable=1 pause=0 lazy_mmu_mode_disable() -> arch_leave() enable=0 pause=0 Note: is_lazy_mmu_mode_active() is added to to allow arch headers included by to use it. Link: https://lkml.kernel.org/r/20251215150323.2218608-10-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 12 ----- include/linux/mm_types_task.h | 5 ++ include/linux/pgtable.h | 114 ++++++++++++++++++++++++++++++++++++--- include/linux/sched.h | 45 ++++++++++++++++ 4 files changed, 157 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bf9178902bdb..7f528c36d53c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -82,18 +82,6 @@ static inline void queue_pte_barriers(void) static inline void arch_enter_lazy_mmu_mode(void) { - /* - * lazy_mmu_mode is not supposed to permit nesting. But in practice this - * does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation - * inside a lazy_mmu_mode section (such as zap_pte_range()) will change - * permissions on the linear map with apply_to_page_range(), which - * re-enters lazy_mmu_mode. So we tolerate nesting in our - * implementation. The first call to arch_leave_lazy_mmu_mode() will - * flush and clear the flag such that the remainder of the work in the - * outer nest behaves as if outside of lazy mmu mode. This is safe and - * keeps tracking simple. 
- */ - set_thread_flag(TIF_LAZY_MMU); } diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index a82aa80c0ba4..11bf319d78ec 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -88,4 +88,9 @@ struct tlbflush_unmap_batch { #endif }; +struct lazy_mmu_state { + u8 enable_count; + u8 pause_count; +}; + #endif /* _LINUX_MM_TYPES_TASK_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index dddde6873d1e..2f0dd3a4ace1 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -236,39 +236,139 @@ static inline int pmd_dirty(pmd_t pmd) * The mode is disabled in interrupt context and calls to the lazy_mmu API have * no effect. * - * Nesting is not permitted. + * The lazy MMU mode is enabled for a given block of code using: + * + * lazy_mmu_mode_enable(); + * + * lazy_mmu_mode_disable(); + * + * Nesting is permitted: may itself use an enable()/disable() pair. + * A nested call to enable() has no functional effect; however disable() causes + * any batched architectural state to be flushed regardless of nesting. After a + * call to disable(), the caller can therefore rely on all previous page table + * modifications to have taken effect, but the lazy MMU mode may still be + * enabled. + * + * In certain cases, it may be desirable to temporarily pause the lazy MMU mode. + * This can be done using: + * + * lazy_mmu_mode_pause(); + * + * lazy_mmu_mode_resume(); + * + * pause() ensures that the mode is exited regardless of the nesting level; + * resume() re-enters the mode at the same nesting level. Any call to the + * lazy_mmu_mode_* API between those two calls has no effect. In particular, + * this means that pause()/resume() pairs may nest. + * + * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is + * currently enabled. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +/** + * lazy_mmu_mode_enable() - Enable the lazy MMU mode. + * + * Enters a new lazy MMU mode section; if the mode was not already enabled, + * enables it and calls arch_enter_lazy_mmu_mode(). + * + * Must be paired with a call to lazy_mmu_mode_disable(). + * + * Has no effect if called: + * - While paused - see lazy_mmu_mode_pause() + * - In interrupt context + */ static inline void lazy_mmu_mode_enable(void) { - if (in_interrupt()) + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + if (in_interrupt() || state->pause_count > 0) return; - arch_enter_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->enable_count == U8_MAX); + + if (state->enable_count++ == 0) + arch_enter_lazy_mmu_mode(); } +/** + * lazy_mmu_mode_disable() - Disable the lazy MMU mode. + * + * Exits the current lazy MMU mode section. If it is the outermost section, + * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested + * section), calls arch_flush_lazy_mmu_mode(). + * + * Must match a call to lazy_mmu_mode_enable(). + * + * Has no effect if called: + * - While paused - see lazy_mmu_mode_pause() + * - In interrupt context + */ static inline void lazy_mmu_mode_disable(void) { - if (in_interrupt()) + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + if (in_interrupt() || state->pause_count > 0) return; - arch_leave_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->enable_count == 0); + + if (--state->enable_count == 0) + arch_leave_lazy_mmu_mode(); + else /* Exiting a nested section */ + arch_flush_lazy_mmu_mode(); + } +/** + * lazy_mmu_mode_pause() - Pause the lazy MMU mode. 
+ * + * Pauses the lazy MMU mode; if it is currently active, disables it and calls + * arch_leave_lazy_mmu_mode(). + * + * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the + * lazy_mmu_mode_* API have no effect until the matching resume() call. + * + * Has no effect if called: + * - While paused (inside another pause()/resume() pair) + * - In interrupt context + */ static inline void lazy_mmu_mode_pause(void) { + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) return; - arch_leave_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->pause_count == U8_MAX); + + if (state->pause_count++ == 0 && state->enable_count > 0) + arch_leave_lazy_mmu_mode(); } +/** + * lazy_mmu_mode_resume() - Resume the lazy MMU mode. + * + * Resumes the lazy MMU mode; if it was active at the point where the matching + * call to lazy_mmu_mode_pause() was made, re-enables it and calls + * arch_enter_lazy_mmu_mode(). + * + * Must match a call to lazy_mmu_mode_pause(). + * + * Has no effect if called: + * - While paused (inside another pause()/resume() pair) + * - In interrupt context + */ static inline void lazy_mmu_mode_resume(void) { + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) return; - arch_enter_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->pause_count == 0); + + if (--state->pause_count == 0 && state->enable_count > 0) + arch_enter_lazy_mmu_mode(); } #else static inline void lazy_mmu_mode_enable(void) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index da0133524d08..6b563d4e68f6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1419,6 +1419,10 @@ struct task_struct { struct page_frag task_frag; +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE + struct lazy_mmu_state lazy_mmu_state; +#endif + #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif @@ -1702,6 +1706,47 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +/** + * __task_lazy_mmu_mode_active() - Test the lazy MMU mode state for a task. + * @tsk: The task to check. + * + * Test whether @tsk has its lazy MMU mode state set to active (i.e. enabled + * and not paused). + * + * This function only considers the state saved in task_struct; to test whether + * current actually is in lazy MMU mode, is_lazy_mmu_mode_active() should be + * used instead. + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. + */ +static inline bool __task_lazy_mmu_mode_active(struct task_struct *tsk) +{ + struct lazy_mmu_state *state = &tsk->lazy_mmu_state; + + return state->enable_count > 0 && state->pause_count == 0; +} + +/** + * is_lazy_mmu_mode_active() - Test whether we are currently in lazy MMU mode. + * + * Test whether the current context is in lazy MMU mode. This is true if both: + * 1. We are not in interrupt context + * 2. Lazy MMU mode is active for the current task + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. 
+ */ +static inline bool is_lazy_mmu_mode_active(void) +{ + if (in_interrupt()) + return false; + + return __task_lazy_mmu_mode_active(current); +} +#endif + extern struct pid *cad_pid; /* -- cgit v1.2.3 From 8e38607aa4aa8ee7ad4058d183465d248d04dca4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 6 Jan 2026 23:20:02 -0800 Subject: treewide: provide a generic clear_user_page() variant Patch series "mm: folio_zero_user: clear page ranges", v11. This series adds clearing of contiguous page ranges for hugepages. The series improves on the current discontiguous clearing approach in two ways: - clear pages in a contiguous fashion. - use batched clearing via clear_pages() wherever exposed. The first is useful because it allows us to make much better use of hardware prefetchers. The second, enables advertising the real extent to the processor. Where specific instructions support it (ex. string instructions on x86; "mops" on arm64 etc), a processor can optimize based on this because, instead of seeing a sequence of 8-byte stores, or a sequence of 4KB pages, it sees a larger unit being operated on. For instance, AMD Zen uarchs (for extents larger than LLC-size) switch to a mode where they start eliding cacheline allocation. This is helpful not just because it results in higher bandwidth, but also because now the cache is not evicting useful cachelines and replacing them with zeroes. Demand faulting a 64GB region shows performance improvement: $ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5 baseline +series (GBps +- %stdev) (GBps +- %stdev) pg-sz=2MB 11.76 +- 1.10% 25.34 +- 1.18% [*] +115.47% preempt=* pg-sz=1GB 24.85 +- 2.41% 39.22 +- 2.32% + 57.82% preempt=none|voluntary pg-sz=1GB (similar) 52.73 +- 0.20% [#] +112.19% preempt=full|lazy [*] This improvement is because switching to sequential clearing allows the hardware prefetchers to do a much better job. [#] For pg-sz=1GB a large part of the improvement is because of the cacheline elision mentioned above. preempt=full|lazy improves upon that because, not needing explicit invocations of cond_resched() to ensure reasonable preemption latency, it can clear the full extent as a single unit. In comparison the maximum extent used for preempt=none|voluntary is PROCESS_PAGES_NON_PREEMPT_BATCH (32MB). When provided the full extent the processor forgoes allocating cachelines on this path almost entirely. (The hope is that eventually, in the fullness of time, the lazy preemption model will be able to do the same job that none or voluntary models are used for, allowing us to do away with cond_resched().) Raghavendra also tested previous version of the series on AMD Genoa and sees similar improvement [1] with preempt=lazy. $ perf bench mem map -p $page-size -f populate -s 64GB -l 10 base patched change pg-sz=2MB 12.731939 GB/sec 26.304263 GB/sec 106.6% pg-sz=1GB 26.232423 GB/sec 61.174836 GB/sec 133.2% This patch (of 8): Let's drop all variants that effectively map to clear_page() and provide it in a generic variant instead. We'll use the macro clear_user_page to indicate whether an architecture provides it's own variant. Also, clear_user_page() is only called from the generic variant of clear_user_highpage(), so define it only if the architecture does not provide a clear_user_highpage(). And, for simplicity define it in linux/highmem.h. Note that for parisc, clear_page() and clear_user_page() map to clear_page_asm(), so we can just get rid of the custom clear_user_page() implementation. 
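All the per-architecture hunks below follow the same opt-in convention: an architecture that keeps its own implementation defines the macro to its own name next to the prototype, and the generic fallback is compiled only when the macro is absent. In outline (paraphrasing the hunks below, not new code):

  /* arch/<arch>/include/asm/page.h: architecture keeps its own variant */
  #define clear_user_page clear_user_page
  void clear_user_page(void *addr, unsigned long vaddr, struct page *page);

  /* include/linux/highmem.h: generic fallback used by clear_user_highpage() */
  #ifndef clear_user_page
  static inline void clear_user_page(void *addr, unsigned long vaddr,
                                     struct page *page)
  {
          clear_page(addr);
  }
  #endif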
There is a clear_user_page_asm() function on parisc, that seems to be unused. Not sure what's up with that. Link: https://lkml.kernel.org/r/20260107072009.1615991-1-ankur.a.arora@oracle.com Link: https://lkml.kernel.org/r/20260107072009.1615991-2-ankur.a.arora@oracle.com Signed-off-by: David Hildenbrand Co-developed-by: Ankur Arora Signed-off-by: Ankur Arora Cc: Andy Lutomirski Cc: Ankur Arora Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 1 - arch/arc/include/asm/page.h | 2 ++ arch/arm/include/asm/page-nommu.h | 1 - arch/arm64/include/asm/page.h | 1 - arch/csky/abiv1/inc/abi/page.h | 1 + arch/csky/abiv2/inc/abi/page.h | 7 ------- arch/hexagon/include/asm/page.h | 1 - arch/loongarch/include/asm/page.h | 1 - arch/m68k/include/asm/page_no.h | 1 - arch/microblaze/include/asm/page.h | 1 - arch/mips/include/asm/page.h | 1 + arch/nios2/include/asm/page.h | 1 + arch/openrisc/include/asm/page.h | 1 - arch/parisc/include/asm/page.h | 1 - arch/powerpc/include/asm/page.h | 1 + arch/riscv/include/asm/page.h | 1 - arch/s390/include/asm/page.h | 1 - arch/sparc/include/asm/page_64.h | 1 + arch/um/include/asm/page.h | 1 - arch/x86/include/asm/page.h | 6 ------ arch/xtensa/include/asm/page.h | 1 - include/linux/highmem.h | 24 ++++++++++++++++++++++-- 22 files changed, 29 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index d2c6667d73e9..59d01f9b77f6 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -11,7 +11,6 @@ #define STRICT_MM_TYPECHECKS extern void clear_page(void *page); -#define clear_user_page(page, vaddr, pg) clear_page(page) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 9720fe6b2c24..38214e126c6d 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -32,6 +32,8 @@ struct page; void copy_user_highpage(struct page *to, struct page *from, unsigned long u_vaddr, struct vm_area_struct *vma); + +#define clear_user_page clear_user_page void clear_user_page(void *to, unsigned long u_vaddr, struct page *page); typedef struct { diff --git a/arch/arm/include/asm/page-nommu.h b/arch/arm/include/asm/page-nommu.h index 7c2c72323d17..e74415c959be 100644 --- a/arch/arm/include/asm/page-nommu.h +++ b/arch/arm/include/asm/page-nommu.h @@ -11,7 +11,6 @@ #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) /* diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 00f117ff4f7a..b39cc1127e1f 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -36,7 +36,6 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, bool tag_clear_highpages(struct page *to, int numpages); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -#define clear_user_page(page, vaddr, pg) clear_page(page) #define 
copy_user_page(to, from, vaddr, pg) copy_page(to, from) typedef struct page *pgtable_t; diff --git a/arch/csky/abiv1/inc/abi/page.h b/arch/csky/abiv1/inc/abi/page.h index 2d2159933b76..58307254e7e5 100644 --- a/arch/csky/abiv1/inc/abi/page.h +++ b/arch/csky/abiv1/inc/abi/page.h @@ -10,6 +10,7 @@ static inline unsigned long pages_do_alias(unsigned long addr1, return (addr1 ^ addr2) & (SHMLBA-1); } +#define clear_user_page clear_user_page static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page) { diff --git a/arch/csky/abiv2/inc/abi/page.h b/arch/csky/abiv2/inc/abi/page.h index cf005f13cd15..a5a255013308 100644 --- a/arch/csky/abiv2/inc/abi/page.h +++ b/arch/csky/abiv2/inc/abi/page.h @@ -1,11 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ - -static inline void clear_user_page(void *addr, unsigned long vaddr, - struct page *page) -{ - clear_page(addr); -} - static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *page) { diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index 137ba7c5de48..f0aed3ed812b 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -113,7 +113,6 @@ static inline void clear_page(void *page) /* * Under assumption that kernel always "sees" user map... */ -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) static inline unsigned long virt_to_pfn(const void *kaddr) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 256d1ff7a1e3..327bf0bc92bf 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -30,7 +30,6 @@ extern void clear_page(void *page); extern void copy_page(void *to, void *from); -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) extern unsigned long shm_align_mask; diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 39db2026a4b4..d2532bc407ef 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -10,7 +10,6 @@ extern unsigned long memory_end; #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 90ac9f34b4b4..e1e396367ba7 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -45,7 +45,6 @@ typedef unsigned long pte_basic_t; # define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) # define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) -# define clear_user_page(pgaddr, vaddr, page) memset((pgaddr), 0, PAGE_SIZE) # define copy_user_page(vto, vfrom, vaddr, topg) \ memcpy((vto), (vfrom), PAGE_SIZE) diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index bc3e3484c1bf..5ec428fcc887 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -90,6 +90,7 @@ static inline void clear_user_page(void *addr, unsigned long vaddr, if (pages_do_alias((unsigned long) addr, vaddr & PAGE_MASK)) flush_data_cache_page((unsigned long)addr); } +#define clear_user_page clear_user_page struct vm_area_struct; extern void copy_user_highpage(struct page *to, struct page *from, diff --git 
a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 00a51623d38a..722956ac0bf8 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -45,6 +45,7 @@ struct page; +#define clear_user_page clear_user_page extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page); extern void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, struct page *to); diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index 85797f94d1d7..d2cdbf3579bb 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -30,7 +30,6 @@ #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) /* diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 8f4e51071ea1..3630b36d07da 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -21,7 +21,6 @@ struct vm_area_struct; void clear_page_asm(void *page); void copy_page_asm(void *to, void *from); -#define clear_user_page(vto, vaddr, page) clear_page_asm(vto) void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma); #define __HAVE_ARCH_COPY_USER_HIGHPAGE diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index b28fbb1d57eb..f2bb1f98eebe 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -271,6 +271,7 @@ static inline const void *pfn_to_kaddr(unsigned long pfn) struct page; extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); +#define clear_user_page clear_user_page extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); extern int devmem_is_allowed(unsigned long pfn); diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index ffe213ad65a4..061b60b954ec 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -50,7 +50,6 @@ void clear_page(void *page); #endif #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(pgaddr, vaddr, page) clear_page(pgaddr) #define copy_user_page(vto, vfrom, vaddr, topg) \ memcpy((vto), (vfrom), PAGE_SIZE) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index c1d63b613bf9..9c8c5283258e 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -65,7 +65,6 @@ static inline void copy_page(void *to, void *from) : : "memory", "cc"); } -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index d764d8a8586b..fd4dc85fb38b 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -43,6 +43,7 @@ void _clear_page(void *page); #define clear_page(X) _clear_page((void *)(X)) struct page; void clear_user_page(void *addr, unsigned long vaddr, struct page *page); +#define clear_user_page clear_user_page #define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE) void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage); #define __HAVE_ARCH_COPY_USER_HIGHPAGE diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 2d363460d896..e348ff489b89 
100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -26,7 +26,6 @@ struct page; #define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) typedef struct { unsigned long pte; } pte_t; diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 9265f2fca99a..416dc88e35c1 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -22,12 +22,6 @@ struct page; extern struct range pfn_mapped[]; extern int nr_pfn_mapped; -static inline void clear_user_page(void *page, unsigned long vaddr, - struct page *pg) -{ - clear_page(page); -} - static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 20655174b111..059493256765 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -126,7 +126,6 @@ void clear_user_highpage(struct page *page, unsigned long vaddr); void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma); #else -# define clear_user_page(page, vaddr, pg) clear_page(page) # define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index abc20f9810fd..393bd51e5a1f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -197,15 +197,35 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) } #endif -/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage +#ifndef clear_user_page +/** + * clear_user_page() - clear a page to be mapped to user space + * @addr: the address of the page + * @vaddr: the address of the user mapping + * @page: the page + * + * We condition the definition of clear_user_page() on the architecture + * not having a custom clear_user_highpage(). That's because if there + * is some special flushing needed for clear_user_highpage() then it + * is likely that clear_user_page() also needs some magic. And, since + * our only caller is the generic clear_user_highpage(), not defining + * is not much of a loss. + */ +static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page) +{ + clear_page(addr); +} +#endif + +/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } -#endif +#endif /* clear_user_highpage */ #ifndef vma_alloc_zeroed_movable_folio /** -- cgit v1.2.3 From 62a9f5a85b98d6d2d9b5e0d67b2d4e5903bc53ec Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:03 -0800 Subject: mm: introduce clear_pages() and clear_user_pages() Introduce clear_pages(), to be overridden by architectures that support more efficient clearing of consecutive pages. Also introduce clear_user_pages(), however, we will not expect this function to be overridden anytime soon. As we do for clear_user_page(), define clear_user_pages() only if the architecture does not define clear_user_highpage(). That is because if the architecture does define clear_user_highpage(), then it likely needs some flushing magic when clearing user pages or highpages. 
This means we can get away without defining clear_user_pages(), since, much like its single page sibling, its only potential user is the generic clear_user_highpages() which should instead be using clear_user_highpage(). Link: https://lkml.kernel.org/r/20260107072009.1615991-3-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/highmem.h | 33 +++++++++++++++++++++++++++++++++ include/linux/mm.h | 20 ++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 393bd51e5a1f..019ab7d8c841 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -218,6 +218,39 @@ static inline void clear_user_page(void *addr, unsigned long vaddr, struct page } #endif +/** + * clear_user_pages() - clear a page range to be mapped to user space + * @addr: start address + * @vaddr: start address of the user mapping + * @page: start page + * @npages: number of pages + * + * Assumes that the region (@addr, +@npages) has been validated + * already so this does no exception handling. + * + * If the architecture provides a clear_user_page(), use that; + * otherwise, we can safely use clear_pages(). + */ +static inline void clear_user_pages(void *addr, unsigned long vaddr, + struct page *page, unsigned int npages) +{ + +#ifdef clear_user_page + do { + clear_user_page(addr, vaddr, page); + addr += PAGE_SIZE; + vaddr += PAGE_SIZE; + page++; + } while (--npages); +#else + /* + * Prefer clear_pages() to allow for architectural optimizations + * when operating on contiguous page ranges. + */ + clear_pages(addr, npages); +#endif +} + /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f0d5be9dc736..d78e294698b0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4198,6 +4198,26 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifndef clear_pages +/** + * clear_pages() - clear a page range for kernel-internal use. + * @addr: start address + * @npages: number of pages + * + * Use clear_user_pages() instead when clearing a page range to be + * mapped to user space. + * + * Does absolutely no exception handling. + */ +static inline void clear_pages(void *addr, unsigned int npages) +{ + do { + clear_page(addr); + addr += PAGE_SIZE; + } while (--npages); +} +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); -- cgit v1.2.3 From 8d846b723e5723d98d859df9feeab89c2c889fb2 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:04 -0800 Subject: highmem: introduce clear_user_highpages() Define clear_user_highpages() which uses the range clearing primitive, clear_user_pages(). We can safely use this when CONFIG_HIGHMEM is disabled and if the architecture does not have clear_user_highpage. 
The first is needed to ensure that contiguous page ranges stay contiguous, which precludes intermediate maps via HIGHMEM. The second is needed because if the architecture has clear_user_highpage(), it likely needs flushing magic when clearing the page, magic that we aren't privy to. For both of those cases, just fall back to a loop around clear_user_highpage(). Link: https://lkml.kernel.org/r/20260107072009.1615991-4-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzeszutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/highmem.h | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 019ab7d8c841..af03db851a1d 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -251,7 +251,14 @@ static inline void clear_user_pages(void *addr, unsigned long vaddr, #endif } -/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ +/** + * clear_user_highpage() - clear a page to be mapped to user space + * @page: start page + * @vaddr: start address of the user mapping + * + * With !CONFIG_HIGHMEM this (and the copy_user_highpage() below) will + * be plain clear_user_page() (and copy_user_page()). + */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); @@ -260,6 +267,42 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif /* clear_user_highpage */ +/** + * clear_user_highpages() - clear a page range to be mapped to user space + * @page: start page + * @vaddr: start address of the user mapping + * @npages: number of pages + * + * Assumes that all the pages in the region (@page, +@npages) are valid + * so this does no exception handling. + */ +static inline void clear_user_highpages(struct page *page, unsigned long vaddr, + unsigned int npages) +{ + +#if defined(clear_user_highpage) || defined(CONFIG_HIGHMEM) + /* + * An architecture defined clear_user_highpage() implies special + * handling is needed. + * + * So we use that or, the generic variant if CONFIG_HIGHMEM is + * enabled. + */ + do { + clear_user_highpage(page, vaddr); + vaddr += PAGE_SIZE; + page++; + } while (--npages); +#else + + /* + * Prefer clear_user_pages() to allow for architectural optimizations + * when operating on contiguous page ranges. + */ + clear_user_pages(page_address(page), vaddr, page, npages); +#endif +} + #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. -- cgit v1.2.3 From 94962b2628e6af2c48be6ebdf9f76add28d60ecc Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:08 -0800 Subject: mm: folio_zero_user: clear page ranges Use batch clearing in clear_contig_highpages() instead of clearing a single page at a time. Exposing larger ranges enables the processor to optimize based on extent. To do this we just switch to using clear_user_highpages() which would in turn use clear_user_pages() or clear_pages().
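As an aside, the hook for a batched override is the usual #ifndef/#define convention already used for clear_user_page(). A minimal sketch of what an architecture-side override could look like, purely for illustration (the header path is hypothetical and memset() merely stands in for an arch-optimized clearing routine):

    /* hypothetical: arch/foo/include/asm/page.h */
    static inline void clear_pages(void *addr, unsigned int npages)
    {
            /*
             * Clear the whole extent in one call so the hardware can
             * optimize across page boundaries.
             */
            memset(addr, 0, (size_t)npages << PAGE_SHIFT);
    }
    #define clear_pages clear_pages

With such a definition in place, the generic loop in include/linux/mm.h is compiled out and clear_user_pages()/clear_user_highpages() pick up the batched version automatically.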
Batched clearing, when running under non-preemptible models, however, has latency considerations. In particular, we need periodic invocations of cond_resched() to keep to reasonable preemption latencies. This is a problem because the clearing primitives do not, or might not be able to, call cond_resched() to check if preemption is needed. So, limit the worst case preemption latency by doing the clearing in units of no more than PROCESS_PAGES_NON_PREEMPT_BATCH pages. (Preemptible models already define away most of cond_resched(), so the batch size is ignored when running under those.) PROCESS_PAGES_NON_PREEMPT_BATCH: for architectures with "fast" clear-pages (ones that define clear_pages()), we define it as 32MB worth of pages. This is meant to be large enough to allow the processor to optimize the operation and yet small enough that we see reasonable preemption latency for when this optimization is not possible (ex. slow microarchitectures, memory bandwidth saturation.) This specific value also allows for a cacheline allocation elision optimization (which might help unrelated applications by not evicting potentially useful cache lines) that kicks in recent generations of AMD Zen processors at around LLC-size (32MB is a typical size). At the same time 32MB is small enough that even with poor clearing bandwidth (say ~10GBps), time to clear 32MB should be well below the scheduler's default warning threshold (sysctl_resched_latency_warn_ms=100). "Slow" architectures (don't have clear_pages()) will continue to use the base value (single page). Performance == Testing a demand fault workload shows a decent improvement in bandwidth with pg-sz=1GB. Bandwidth with pg-sz=2MB stays flat. $ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5 contiguous-pages batched-pages (GBps +- %stdev) (GBps +- %stdev) pg-sz=2MB 23.58 +- 1.95% 25.34 +- 1.18% + 7.50% preempt=* pg-sz=1GB 25.09 +- 0.79% 39.22 +- 2.32% + 56.31% preempt=none|voluntary pg-sz=1GB 25.71 +- 0.03% 52.73 +- 0.20% [#] +110.16% preempt=full|lazy [#] We perform much better with preempt=full|lazy because, not needing explicit invocations of cond_resched() we can clear the full extent (pg-sz=1GB) as a single unit which the processor can optimize for. (Unless otherwise noted, all numbers are on AMD Genoa (EPYC 9J13); region-size=64GB, local node; 2.56 GHz, boost=0.) Analysis == pg-sz=1GB: the improvement we see falls in two buckets depending on the batch size in use. For batch-size=32MB the number of cachelines allocated (L1-dcache-loads) -- which stay relatively flat for smaller batches, start to drop off because cacheline allocation elision kicks in. And as can be seen below, at batch-size=1GB, we stop allocating cachelines almost entirely. (Not visible here but from testing with intermediate sizes, the allocation change kicks in only at batch-size=32MB and ramps up from there.) 
contigous-pages 6,949,417,798 L1-dcache-loads # 883.599 M/sec ( +- 0.01% ) (35.75%) 3,226,709,573 L1-dcache-load-misses # 46.43% of all L1-dcache accesses ( +- 0.05% ) (35.75%) batched,32MB 2,290,365,772 L1-dcache-loads # 471.171 M/sec ( +- 0.36% ) (35.72%) 1,144,426,272 L1-dcache-load-misses # 49.97% of all L1-dcache accesses ( +- 0.58% ) (35.70%) batched,1GB 63,914,157 L1-dcache-loads # 17.464 M/sec ( +- 8.08% ) (35.73%) 22,074,367 L1-dcache-load-misses # 34.54% of all L1-dcache accesses ( +- 16.70% ) (35.70%) The dropoff is also visible in L2 prefetch hits (miss numbers are on similar lines): contiguous-pages 3,464,861,312 l2_pf_hit_l2.all # 437.722 M/sec ( +- 0.74% ) (15.69%) batched,32MB 883,750,087 l2_pf_hit_l2.all # 181.223 M/sec ( +- 1.18% ) (15.71%) batched,1GB 8,967,943 l2_pf_hit_l2.all # 2.450 M/sec ( +- 17.92% ) (15.77%) This largely decouples the frontend from the backend since the clearing operation does not need to wait on loads from memory (we still need cacheline ownership but that's a shorter path). This is most visible if we rerun the test above with (boost=1, 3.66 GHz). $ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5 contiguous-pages batched-pages (GBps +- %stdev) (GBps +- %stdev) pg-sz=2MB 26.08 +- 1.72% 26.13 +- 0.92% - preempt=* pg-sz=1GB 26.99 +- 0.62% 48.85 +- 2.19% + 80.99% preempt=none|voluntary pg-sz=1GB 27.69 +- 0.18% 75.18 +- 0.25% +171.50% preempt=full|lazy Comparing the batched-pages numbers from the boost=0 ones and these: for a clock-speed gain of 42% we gain 24.5% for batch-size=32MB and 42.5% for batch-size=1GB. In comparison the baseline contiguous-pages case and both the pg-sz=2MB ones are largely backend bound so gain no more than ~10%. Other platforms tested, Intel Icelakex (Oracle X9) and ARM64 Neoverse-N1 (Ampere Altra) both show an improvement of ~35% for pg-sz=2MB|1GB. The first goes from around 8GBps to 11GBps and the second from 32GBps to 44 GBPs. [ankur.a.arora@oracle.com: move the unit computation and make it a const Link: https://lkml.kernel.org/r/20260108060406.1693853-1-ankur.a.arora@oracle.com Link: https://lkml.kernel.org/r/20260107072009.1615991-8-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 35 +++++++++++++++++++++++++++++++++++ mm/memory.c | 18 +++++++++++++++--- 2 files changed, 50 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d78e294698b0..ab2e7e30aef9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4208,6 +4208,15 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, * mapped to user space. * * Does absolutely no exception handling. + * + * Note that even though the clearing operation is preemptible, clear_pages() + * does not (and on architectures where it reduces to a few long-running + * instructions, might not be able to) call cond_resched() to check if + * rescheduling is required. + * + * When running under preemptible models this is not a problem. 
Under + * cooperatively scheduled models, however, the caller is expected to + * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH. */ static inline void clear_pages(void *addr, unsigned int npages) { @@ -4218,6 +4227,32 @@ static inline void clear_pages(void *addr, unsigned int npages) } #endif +#ifndef PROCESS_PAGES_NON_PREEMPT_BATCH +#ifdef clear_pages +/* + * The architecture defines clear_pages(), and we assume that it is + * generally "fast". So choose a batch size large enough to allow the processor + * headroom for optimizing the operation and yet small enough that we see + * reasonable preemption latency for when this optimization is not possible + * (ex. slow microarchitectures, memory bandwidth saturation.) + * + * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should + * result in worst case preemption latency of around 3ms when clearing pages. + * + * (See comment above clear_pages() for why preemption latency is a concern + * here.) + */ +#define PROCESS_PAGES_NON_PREEMPT_BATCH (SZ_32M >> PAGE_SHIFT) +#else /* !clear_pages */ +/* + * The architecture does not provide a clear_pages() implementation. Assume + * that clear_page() -- which clear_pages() will fallback to -- is relatively + * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH. + */ +#define PROCESS_PAGES_NON_PREEMPT_BATCH 1 +#endif +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); diff --git a/mm/memory.c b/mm/memory.c index 74d663943ecb..3f6ec897c9a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7243,13 +7243,25 @@ static inline int process_huge_page( static void clear_contig_highpages(struct page *page, unsigned long addr, unsigned int nr_pages) { - unsigned int i; + unsigned int i, count; + /* + * When clearing we want to operate on the largest extent possible to + * allow for architecture specific extent based optimizations. + * + * However, since clear_user_highpages() (and primitives clear_user_pages(), + * clear_pages()), do not call cond_resched(), limit the unit size when + * running under non-preemptible scheduling models. + */ + const unsigned int unit = preempt_model_preemptible() ? + nr_pages : PROCESS_PAGES_NON_PREEMPT_BATCH; might_sleep(); - for (i = 0; i < nr_pages; i++) { + + for (i = 0; i < nr_pages; i += count) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + count = min(unit, nr_pages - i); + clear_user_highpages(page + i, addr + i * PAGE_SIZE, count); } } -- cgit v1.2.3 From 055059ed720ec7546d2bf7122d858814a9f84741 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 11 Dec 2025 01:30:19 +0000 Subject: memcg: remove mem_cgroup_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mem_cgroup_size helper is used only in apply_proportional_protection to read the current memory usage. Its semantics are unclear and inconsistent with other sites, which directly call page_counter_read for the same purpose. Remove this helper and get its usage via mem_cgroup_protection for clarity. Additionally, rename the local variable 'cgroup_size' to 'usage' to better reflect its meaning. No functional changes intended. 
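For clarity, the resulting shape of the protection scaling in apply_proportional_protection(), condensed from the hunks below (not a complete function):

    unsigned long min, low, usage;

    mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low, &usage);
    ...
    /* 'usage' takes over from the old mem_cgroup_size(memcg) read */
    usage = max(usage, protection);
    scan -= scan * protection / (usage + 1);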
Link: https://lkml.kernel.org/r/20251211013019.2080004-3-chenridong@huaweicloud.com Signed-off-by: Chen Ridong Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Michal Koutný Cc: Axel Rasmussen Cc: Lorenzo Stoakes Cc: Lu Jialin Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 18 +++++++----------- mm/memcontrol.c | 5 ----- mm/vmscan.c | 9 ++++----- 3 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0651865a4564..25908ba30700 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -557,13 +557,15 @@ static inline bool mem_cgroup_disabled(void) static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, - unsigned long *low) + unsigned long *low, + unsigned long *usage) { - *min = *low = 0; + *min = *low = *usage = 0; if (mem_cgroup_disabled()) return; + *usage = page_counter_read(&memcg->memory); /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because @@ -919,8 +921,6 @@ static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); -unsigned long mem_cgroup_size(struct mem_cgroup *memcg); - void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -1102,9 +1102,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, - unsigned long *low) + unsigned long *low, + unsigned long *usage) { - *min = *low = 0; + *min = *low = *usage = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, @@ -1328,11 +1329,6 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return 0; } -static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) -{ - return 0; -} - static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7d4b93d30eb0..15323d5dc69b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1621,11 +1621,6 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } -unsigned long mem_cgroup_size(struct mem_cgroup *memcg) -{ - return page_counter_read(&memcg->memory); -} - void __memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event, bool allow_spinning) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 67234613fbff..1c87945fa761 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2451,9 +2451,9 @@ static inline void calculate_pressure_balance(struct scan_control *sc, static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, struct scan_control *sc, unsigned long scan) { - unsigned long min, low; + unsigned long min, low, usage; - mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); + mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low, &usage); if (min || low) { /* @@ -2485,7 +2485,6 @@ static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, * again by how much of the total memory used is under * hard protection. 
*/ - unsigned long cgroup_size = mem_cgroup_size(memcg); unsigned long protection; /* memory.low scaling, make sure we retry before OOM */ @@ -2497,9 +2496,9 @@ static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, } /* Avoid TOCTOU with earlier protection check */ - cgroup_size = max(cgroup_size, protection); + usage = max(usage, protection); - scan -= scan * protection / (cgroup_size + 1); + scan -= scan * protection / (usage + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep -- cgit v1.2.3 From 16cc8b9396f6d63c1331059d67626cf907a7f23c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2025 10:43:01 -0500 Subject: mm: memcontrol: rename mem_cgroup_from_slab_obj() In addition to slab objects, this function is used for resolving non-slab kernel pointers. This has caused confusion in recent refactoring work. Rename it to mem_cgroup_from_virt(), sticking with terminology established by the virt_to_() converters. Link: https://lore.kernel.org/linux-mm/20251113161424.GB3465062@cmpxchg.org/ Link: https://lkml.kernel.org/r/20251210154301.720133-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Reviewed-by: Anshuman Khandual Acked-by: Vlastimil Babka Acked-by: Shakeel Butt Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/list_lru.c | 4 ++-- mm/memcontrol.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25908ba30700..fd400082313a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1723,7 +1723,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); +struct mem_cgroup *mem_cgroup_from_virt(void *p); static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, @@ -1795,7 +1795,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return -1; } -static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +static inline struct mem_cgroup *mem_cgroup_from_virt(void *p) { return NULL; } diff --git a/mm/list_lru.c b/mm/list_lru.c index ec48b5dadf51..37b642f6cbda 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -187,7 +187,7 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) if (list_lru_memcg_aware(lru)) { rcu_read_lock(); - ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); + ret = list_lru_add(lru, item, nid, mem_cgroup_from_virt(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); @@ -224,7 +224,7 @@ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) if (list_lru_memcg_aware(lru)) { rcu_read_lock(); - ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); + ret = list_lru_del(lru, item, nid, mem_cgroup_from_virt(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15323d5dc69b..a01d3e6c157d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -806,7 +806,7 @@ void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) struct lruvec *lruvec; rcu_read_lock(); - memcg = mem_cgroup_from_slab_obj(p); + memcg = mem_cgroup_from_virt(p); /* * Untracked pages have no memcg, no lruvec. 
Update only the @@ -2614,7 +2614,7 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. */ -struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +struct mem_cgroup *mem_cgroup_from_virt(void *p) { struct slab *slab; -- cgit v1.2.3 From 4a6ceb7c9744c69546d4ca43b7bd308f4db0927b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:14 -0800 Subject: mm/damon/core: introduce nr_snapshots damos stat Patch series "mm/damon: introduce {,max_}nr_snapshots and tracepoint for damos stats". Introduce three changes for improving DAMOS stat's provided information, deterministic control, and reading usability. DAMOS provides stats that are important for understanding its behavior. It lacks information about how many DAMON-generated monitoring output snapshots it has worked on. Add a new stat, nr_snapshots, to show the information. Users can control DAMOS schemes in multiple ways. Using the online parameters commit feature, they can install and uninstall DAMOS schemes whenever they want while keeping DAMON runs. DAMOS quotas and watermarks can be used for manually or automatically turning on/off or adjusting the aggressiveness of the scheme. DAMOS filters can be used for applying the scheme to specific memory entities based on their types and locations. Some users want their DAMOS scheme to be applied to only specific number of DAMON snapshots, for more deterministic control. One example use case is tracepoint based snapshot reading. Add a new knob, max_nr_snapshots, to support this. If the nr_snapshots parameter becomes same to or greater than the value of this parameter, the scheme is deactivated. Users can read DAMOS stats via DAMON's sysfs interface. For deep level investigations on environments having advanced tools like perf and bpftrace, exposing the stats via a tracepoint can be useful. Implement a new tracepoint, namely damon:damos_stat_after_apply_interval. First five patches (patches 1-5) of this series implement the new stat, nr_snapshots, on the core layer (patch 1), expose on DAMON sysfs user interface (patch 2), and update documents (patches 3-5). Following six patches (patches 6-11) are for the new stat based DAMOS deactivation (max_nr_snapshots). The first one (patch 6) of this group updates a kernel-doc comment before making further changes. Then an implementation of it on the core layer (patch 7), an introduction of a new DAMON sysfs interface file for users of the feature (patch 8), and three updates of the documents (patches 9-11) follow. The final one (patch 12) introduces the new tracepoint that exposes the DAMOS stat values for each scheme apply interval. This patch (of 12): DAMON generates monitoring results snapshots for every sampling interval. DAMOS applies given schemes on the regions of the snapshots, for every apply interval of the scheme. DAMOS stat informs a given scheme has tried to how many memory entities and applied, in the region and byte level. In some use cases including user-space oriented tuning and investigations, it is useful to know that in the DAMON-snapshot level. Introduce a new stat, namely nr_snapshots for DAMON core API callers. 
[sj@kernel.org: fix wrong list_is_last() call in damons_is_last_region()] Link: https://lkml.kernel.org/r/20260114152049.99727-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251216080128.42991-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251216080128.42991-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ mm/damon/core.c | 13 ++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3813373a9200..1d8a1515e75a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -330,6 +330,8 @@ struct damos_watermarks { * @sz_ops_filter_passed: * Total bytes that passed ops layer-handled DAMOS filters. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + * @nr_snapshots: + * Total number of DAMON snapshots that the scheme has tried. * * "Tried an action to a region" in this context means the DAMOS core logic * determined the region as eligible to apply the action. The access pattern @@ -355,6 +357,7 @@ struct damos_stat { unsigned long sz_applied; unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; + unsigned long nr_snapshots; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index 2379a07c2f87..9d5be7e9b8e0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -157,6 +157,12 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +static bool damon_is_last_region(struct damon_region *r, + struct damon_target *t) +{ + return list_is_last(&r->list, &t->regions_list); +} + /* * Check whether a region is intersecting an address range * @@ -1978,10 +1984,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; - if (!damos_valid_target(c, t, r, s)) - continue; + if (damos_valid_target(c, t, r, s)) + damos_apply_scheme(c, t, r, s); - damos_apply_scheme(c, t, r, s); + if (damon_is_last_region(r, t)) + s->stat.nr_snapshots++; } } -- cgit v1.2.3 From ccaa2d062a35add92832e8f082b8e00eed3f6efd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:19 -0800 Subject: mm/damon: update damos kerneldoc for stat field Commit 0e92c2ee9f45 ("mm/damon/schemes: account scheme actions that successfully applied") has replaced ->stat_count and ->stat_sz of 'struct damos' with ->stat. The commit mistakenly did not update the related kernel doc comment, though. Update the comment. Link: https://lkml.kernel.org/r/20251216080128.42991-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1d8a1515e75a..43dfbfe2292f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -532,9 +532,7 @@ struct damos_migrate_dests { * unsets @last_applied when each regions walking for applying the scheme is * finished. 
* - * After applying the &action to each region, &stat_count and &stat_sz is - * updated to reflect the number of regions and total size of regions that the - * &action is applied. + * After applying the &action to each region, &stat is updated. */ struct damos { struct damos_access_pattern pattern; -- cgit v1.2.3 From 84e425c68e6061751adecd2d328789e4f67eac1e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:20 -0800 Subject: mm/damon/core: implement max_nr_snapshots There are DAMOS use cases that require user-space centric control of its activation and deactivation. Having the control plane in user space, or using DAMOS as a way to collect monitoring results, are such examples. DAMON parameters online commit, DAMOS quotas and watermarks can be useful for this purpose. However, those features work only at the sub-DAMON-snapshot level. In some use cases, DAMON-snapshot level control is required. For example, in the DAMOS-based monitoring results collection use case, the user online-installs a DAMOS scheme with the DAMOS_STAT action, waits for it to be applied to all regions of a single DAMON snapshot, retrieves the stats and tried-regions information, and then online-uninstalls the scheme. It is efficient to ensure that the scheme's lifetime covers exactly one snapshot, no more and no less. To support such use cases, introduce a new DAMOS core API per-scheme parameter, namely max_nr_snapshots. As the name implies, it is the upper limit of nr_snapshots, which is a DAMOS stat that represents the number of DAMON-snapshots that the scheme has fully applied. If the limit is set with a non-zero value and nr_snapshots reaches or exceeds the limit, the scheme is deactivated.
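A minimal sketch of the intended usage, assuming 's' is a DAMOS_STAT scheme the caller is about to install (setup and synchronization elided):

    /* apply the scheme to exactly one full DAMON snapshot, then stop */
    s->max_nr_snapshots = 1;

    /* ... install 's', e.g. via the online parameters commit ... */

    /* once s->stat.nr_snapshots reaches 1, DAMOS stops applying 's' */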
*/ struct damos { struct damos_access_pattern pattern; @@ -567,6 +571,7 @@ struct damos { struct list_head ops_filters; void *last_applied; struct damos_stat stat; + unsigned long max_nr_snapshots; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 9d5be7e9b8e0..344773f53f64 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -401,6 +401,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, INIT_LIST_HEAD(&scheme->core_filters); INIT_LIST_HEAD(&scheme->ops_filters); scheme->stat = (struct damos_stat){}; + scheme->max_nr_snapshots = 0; INIT_LIST_HEAD(&scheme->list); scheme->quota = *(damos_quota_init(quota)); @@ -1078,7 +1079,11 @@ static int damos_commit(struct damos *dst, struct damos *src) return err; err = damos_commit_filters(dst, src); - return err; + if (err) + return err; + + dst->max_nr_snapshots = src->max_nr_snapshots; + return 0; } static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src) @@ -1984,6 +1989,10 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; + if (s->max_nr_snapshots && + s->max_nr_snapshots <= s->stat.nr_snapshots) + continue; + if (damos_valid_target(c, t, r, s)) damos_apply_scheme(c, t, r, s); -- cgit v1.2.3 From 804c26b961da295bd70c86a3c9dc4bea0b09de88 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:25 -0800 Subject: mm/damon/core: add trace point for damos stat per apply interval DAMON users can read DAMOS stats via DAMON sysfs interface. It enables efficient, simple and flexible usages of the stats. Especially for systems not having advanced tools like perf or bpftrace, that can be useful. But if the advanced tools are available, exposing the stats via tracepoint can reduce unnecessary reimplementation of the wheels. Add a new tracepoint for DAMOS stats, namely damos_stat_after_apply_interval. The tracepoint is triggered for each scheme's apply interval and exposes the whole stat values. If the user needs sub-apply interval information for any chance, damos_before_apply tracepoint could be used. 
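For in-kernel consumers, the event can also be hooked with the regular tracepoint probe helpers. A hedged sketch for a built-in consumer (the probe and its pr_info() body are made up; a module would additionally need the tracepoint exported):

    #include <linux/init.h>
    #include <trace/events/damon.h>

    static void probe_damos_stat(void *data, unsigned int context_idx,
                                 unsigned int scheme_idx,
                                 struct damos_stat *stat)
    {
            pr_info("ctx=%u scheme=%u nr_snapshots=%lu\n",
                    context_idx, scheme_idx, stat->nr_snapshots);
    }

    static int __init example_init(void)
    {
            return register_trace_damos_stat_after_apply_interval(
                            probe_damos_stat, NULL);
    }

Most users would instead just enable the event from user space with perf or bpftrace, as noted above.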
Link: https://lkml.kernel.org/r/20251216080128.42991-13-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Steven Rostedt (Google) Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 41 +++++++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 17 +++++++++++++++++ 2 files changed, 58 insertions(+) (limited to 'include') diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 852d725afea2..24fc402ab3c8 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -9,6 +9,47 @@ #include #include +TRACE_EVENT(damos_stat_after_apply_interval, + + TP_PROTO(unsigned int context_idx, unsigned int scheme_idx, + struct damos_stat *stat), + + TP_ARGS(context_idx, scheme_idx, stat), + + TP_STRUCT__entry( + __field(unsigned int, context_idx) + __field(unsigned int, scheme_idx) + __field(unsigned long, nr_tried) + __field(unsigned long, sz_tried) + __field(unsigned long, nr_applied) + __field(unsigned long, sz_applied) + __field(unsigned long, sz_ops_filter_passed) + __field(unsigned long, qt_exceeds) + __field(unsigned long, nr_snapshots) + ), + + TP_fast_assign( + __entry->context_idx = context_idx; + __entry->scheme_idx = scheme_idx; + __entry->nr_tried = stat->nr_tried; + __entry->sz_tried = stat->sz_tried; + __entry->nr_applied = stat->nr_applied; + __entry->sz_applied = stat->sz_applied; + __entry->sz_ops_filter_passed = stat->sz_ops_filter_passed; + __entry->qt_exceeds = stat->qt_exceeds; + __entry->nr_snapshots = stat->nr_snapshots; + ), + + TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu " "nr_applied=%lu sz_applied=%lu sz_ops_filter_passed=%lu " "qt_exceeds=%lu nr_snapshots=%lu", + __entry->context_idx, __entry->scheme_idx, + __entry->nr_tried, __entry->sz_tried, + __entry->nr_applied, __entry->sz_applied, + __entry->sz_ops_filter_passed, __entry->qt_exceeds, + __entry->nr_snapshots) +); + TRACE_EVENT(damos_esz, TP_PROTO(unsigned int context_idx, unsigned int scheme_idx, diff --git a/mm/damon/core.c b/mm/damon/core.c index 344773f53f64..f4d83e12ba0e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2289,6 +2289,22 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) quota->min_score = score; } +static void damos_trace_stat(struct damon_ctx *c, struct damos *s) +{ + unsigned int cidx = 0, sidx = 0; + struct damos *siter; + + if (!trace_damos_stat_after_apply_interval_enabled()) + return; + + damon_for_each_scheme(siter, c) { + if (siter == s) + break; + sidx++; + } + trace_damos_stat_after_apply_interval(cidx, sidx, &s->stat); +} + static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; @@ -2330,6 +2346,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; s->last_applied = NULL; + damos_trace_stat(c, s); } mutex_unlock(&c->walk_control_lock); } -- cgit v1.2.3 From 64dd89ae01f2708a508e028c28b7906e4702a9a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 15 Dec 2025 12:57:53 -0500 Subject: mm/block/fs: remove laptop_mode Laptop mode was introduced to save battery, by delaying and consolidating writes and thereby maximizing the time rotating hard drives wouldn't have to spin.
Luckily, rotating hard drives, with their high spin-up times and power draw, are a thing of the past for battery-powered devices. Reclaim has also since changed to not write single filesystem pages anymore, and regular filesystem writeback is lumpy by design. The juice doesn't appear worth the squeeze anymore. The footprint of the feature is small, but nevertheless it's a complicating factor in mm, block, filesystems. Developers don't think about it, and it likely hasn't been tested with new reclaim and writeback changes in years. Let's sunset it. Keep the sysctl with a deprecation warning around for a few more cycles, but remove all functionality behind it. [akpm@linux-foundation.org: fix Documentation/admin-guide/laptops/index.rst] Link: https://lkml.kernel.org/r/20251216185201.GH905277@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Deepanshu Kartikey Signed-off-by: Andrew Morton --- Documentation/admin-guide/laptops/index.rst | 1 - Documentation/admin-guide/laptops/laptop-mode.rst | 770 ---------------------- Documentation/admin-guide/sysctl/vm.rst | 8 - block/blk-mq.c | 3 - fs/ext4/inode.c | 3 +- fs/sync.c | 2 - fs/xfs/xfs_super.c | 9 - include/linux/backing-dev-defs.h | 3 - include/linux/writeback.h | 4 - include/trace/events/writeback.h | 1 - include/uapi/linux/sysctl.h | 2 +- mm/backing-dev.c | 3 - mm/page-writeback.c | 74 +-- mm/vmscan.c | 30 +- 14 files changed, 25 insertions(+), 888 deletions(-) delete mode 100644 Documentation/admin-guide/laptops/laptop-mode.rst (limited to 'include') diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst index 6432c251dc95..c0b911d05c59 100644 --- a/Documentation/admin-guide/laptops/index.rst +++ b/Documentation/admin-guide/laptops/index.rst @@ -10,7 +10,6 @@ Laptop Drivers alienware-wmi asus-laptop disk-shock-protection - laptop-mode lg-laptop samsung-galaxybook sony-laptop diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst deleted file mode 100644 index 66eb9cd918b5..000000000000 --- a/Documentation/admin-guide/laptops/laptop-mode.rst +++ /dev/null @@ -1,770 +0,0 @@ -=============================================== -How to conserve battery power using laptop-mode -=============================================== - -Document Author: Bart Samwel (bart@samwel.tk) - -Date created: January 2, 2004 - -Last modified: December 06, 2004 - -Introduction ------------- - -Laptop mode is used to minimize the time that the hard disk needs to be spun up, -to conserve battery power on laptops. It has been reported to cause significant -power savings. - -.. Contents - - * Introduction - * Installation - * Caveats - * The Details - * Tips & Tricks - * Control script - * ACPI integration - * Monitoring tool - - -Installation ------------- - -To use laptop mode, you don't need to set any kernel configuration options -or anything. Simply install all the files included in this document, and -laptop mode will automatically be started when you're on battery. For -your convenience, a tarball containing an installer can be downloaded at: - - http://www.samwel.tk/laptop_mode/laptop_mode/ - -To configure laptop mode, you need to edit the configuration file, which is -located in /etc/default/laptop-mode on Debian-based systems, or in -/etc/sysconfig/laptop-mode on other systems. 
- -Unfortunately, automatic enabling of laptop mode does not work for -laptops that don't have ACPI. On those laptops, you need to start laptop -mode manually. To start laptop mode, run "laptop_mode start", and to -stop it, run "laptop_mode stop". (Note: The laptop mode tools package now -has experimental support for APM, you might want to try that first.) - - -Caveats -------- - -* The downside of laptop mode is that you have a chance of losing up to 10 - minutes of work. If you cannot afford this, don't use it! The supplied ACPI - scripts automatically turn off laptop mode when the battery almost runs out, - so that you won't lose any data at the end of your battery life. - -* Most desktop hard drives have a very limited lifetime measured in spindown - cycles, typically about 50.000 times (it's usually listed on the spec sheet). - Check your drive's rating, and don't wear down your drive's lifetime if you - don't need to. - -* If you mount some of your ext3 filesystems with the -n option, then - the control script will not be able to remount them correctly. You must set - DO_REMOUNTS=0 in the control script, otherwise it will remount them with the - wrong options -- or it will fail because it cannot write to /etc/mtab. - -* If you have your filesystems listed as type "auto" in fstab, like I did, then - the control script will not recognize them as filesystems that need remounting. - You must list the filesystems with their true type instead. - -* It has been reported that some versions of the mutt mail client use file access - times to determine whether a folder contains new mail. If you use mutt and - experience this, you must disable the noatime remounting by setting the option - DO_REMOUNT_NOATIME to 0 in the configuration file. - - -The Details ------------ - -Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is -present for all kernels that have the laptop mode patch, regardless of any -configuration options. When the knob is set, any physical disk I/O (that might -have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The -result of this is that after a disk has spun down, it will not be spun up -anymore to write dirty blocks, because those blocks had already been written -immediately after the most recent read operation. The value of the laptop_mode -knob determines the time between the occurrence of disk I/O and when the flush -is triggered. A sensible value for the knob is 5 seconds. Setting the knob to -0 disables laptop mode. - -To increase the effectiveness of the laptop_mode strategy, the laptop_mode -control script increases dirty_expire_centisecs and dirty_writeback_centisecs in -/proc/sys/vm to about 10 minutes (by default), which means that pages that are -dirtied are not forced to be written to disk as often. The control script also -changes the dirty background ratio, so that background writeback of dirty pages -is not done anymore. Combined with a higher commit value (also 10 minutes) for -ext3 filesystem (also done automatically by the control script), -this results in concentration of disk activity in a small time interval which -occurs only once every 10 minutes, or whenever the disk is forced to spin up by -a cache miss. The disk can then be spun down in the periods of inactivity. - - -Configuration -------------- - -The laptop mode configuration file is located in /etc/default/laptop-mode on -Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. 
It -contains the following options: - -MAX_AGE: - -Maximum time, in seconds, of hard drive spindown time that you are -comfortable with. Worst case, it's possible that you could lose this -amount of work if your battery fails while you're in laptop mode. - -MINIMUM_BATTERY_MINUTES: - -Automatically disable laptop mode if the remaining number of minutes of -battery power is less than this value. Default is 10 minutes. - -AC_HD/BATT_HD: - -The idle timeout that should be set on your hard drive when laptop mode -is active (BATT_HD) and when it is not active (AC_HD). The defaults are -20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. The -possible values are those listed in the manual page for "hdparm" for the -"-S" option. - -HD: - -The devices for which the spindown timeout should be adjusted by laptop mode. -Default is /dev/hda. If you specify multiple devices, separate them by a space. - -READAHEAD: - -Disk readahead, in 512-byte sectors, while laptop mode is active. A large -readahead can prevent disk accesses for things like executable pages (which are -loaded on demand while the application executes) and sequentially accessed data -(MP3s). - -DO_REMOUNTS: - -The control script automatically remounts any mounted journaled filesystems -with appropriate commit interval options. When this option is set to 0, this -feature is disabled. - -DO_REMOUNT_NOATIME: - -When remounting, should the filesystems be remounted with the noatime option? -Normally, this is set to "1" (enabled), but there may be programs that require -access time recording. - -DIRTY_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -before a writeback is forced, while laptop mode is active. Corresponds to -the /proc/sys/vm/dirty_ratio sysctl. - -DIRTY_BACKGROUND_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set -this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio -sysctl. - -Note that the behaviour of dirty_background_ratio is quite different -when laptop mode is active and when it isn't. When laptop mode is inactive, -dirty_background_ratio is the threshold percentage at which background writeouts -start taking place. When laptop mode is active, however, background writeouts -are disabled, and the dirty_background_ratio only determines how much writeback -is done when dirty_ratio is reached. - -DO_CPU: - -Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup. -See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.) - -CPU_MAXFREQ: - -When on battery, what is the maximum CPU speed that the system should use? Legal -values are "slowest" for the slowest speed that your CPU is able to operate at, -or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies. - - -Tips & Tricks -------------- - -* Bartek Kania reports getting up to 50 minutes of extra battery life (on top - of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1). - -* You can spin down the disk while playing MP3, by setting disk readahead - to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at - once, and will then spin down while the MP3 is playing. (Thanks to Bartek - Kania.) - -* Drew Scott Daniels observed: "I don't know why, but when I decrease the number - of colours that my display uses it consumes less battery power. 
I've seen - this on powerbooks too. I hope that this is a piece of information that - might be useful to the Laptop Mode patch or its users." - -* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the - file after every logging. When you're using laptop-mode and your disk doesn't - spin down, this is a likely culprit. - -* Richard Atterer observed that laptop mode does not work well with noflushd - (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode - from doing its thing. - -* If you're worried about your data, you might want to consider using a USB - memory stick or something like that as a "working area". (Be aware though - that flash memory can only handle a limited number of writes, and overuse - may wear out your memory stick pretty quickly. Do _not_ use journalling - filesystems on flash memory sticks.) - - -Configuration file for control and ACPI battery scripts -------------------------------------------------------- - -This allows the tunables to be changed for the scripts via an external -configuration file - -It should be installed as /etc/default/laptop-mode on Debian, and as -/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes. - -Config file:: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - #MAX_AGE=600 - - # Automatically disable laptop mode when the number of minutes of battery - # that you have left goes below this threshold. - MINIMUM_BATTERY_MINUTES=10 - - # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG - # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk - # will read a complete MP3 at once, and will then spin down while the MP3/OGG is - # playing. - #READAHEAD=4096 - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - #DO_REMOUNTS=1 - - # And shall we add the "noatime" option to that as well? (1=yes) - #DO_REMOUNT_NOATIME=1 - - # Dirty synchronous ratio. At this percentage of dirty pages the process - # which - # calls write() does its own writeback - #DIRTY_RATIO=40 - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - #DIRTY_BACKGROUND_RATIO=5 - - # kernel default dirty buffer age - #DEF_AGE=30 - #DEF_UPDATE=5 - #DEF_DIRTY_BACKGROUND_RATIO=10 - #DEF_DIRTY_RATIO=40 - #DEF_XFS_AGE_BUFFER=15 - #DEF_XFS_SYNC_INTERVAL=30 - #DEF_XFS_BUFD_INTERVAL=1 - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still - # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for - # external interfaces, and that is currently always set to 100. So you don't - # need to change this on 2.6. - #XFS_HZ=100 - - # Should the maximum CPU frequency be adjusted down while on battery? - # Requires CPUFreq to be setup. - # See Documentation/admin-guide/pm/cpufreq.rst for more info - #DO_CPU=0 - - # When on battery what is the maximum CPU speed that the system should - # use? 
Legal values are "slowest" for the slowest speed that your - # CPU is able to operate at, or a value listed in: - # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies - # Only applicable if DO_CPU=1. - #CPU_MAXFREQ=slowest - - # Idle timeout for your hard drive (man hdparm for valid values, -S option) - # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4). - #AC_HD=244 - #BATT_HD=4 - - # The drives for which to adjust the idle timeout. Separate them by a space, - # e.g. HD="/dev/hda /dev/hdb". - #HD="/dev/hda" - - # Set the spindown timeout on a hard drive? - #DO_HD=1 - - -Control script --------------- - -Please note that this control script works for the Linux 2.4 and 2.6 series (thanks -to Kiko Piris). - -Control script:: - - #!/bin/bash - - # start or stop laptop_mode, best run by a power management daemon when - # ac gets connected/disconnected from a laptop - # - # install as /sbin/laptop_mode - # - # Contributors to this script: Kiko Piris - # Bart Samwel - # Micha Feigin - # Andrew Morton - # Herve Eychenne - # Dax Kelson - # - # Original Linux 2.4 version by: Jens Axboe - - ############################################################################# - - # Source config - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - - # Don't raise an error if the config file is incomplete - # set defaults instead: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - MAX_AGE=${MAX_AGE:-'600'} - - # Read-ahead, in kilobytes - READAHEAD=${READAHEAD:-'4096'} - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - DO_REMOUNTS=${DO_REMOUNTS:-'1'} - - # And shall we add the "noatime" option to that as well? (1=yes) - DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'} - - # Shall we adjust the idle timeout on a hard drive? - DO_HD=${DO_HD:-'1'} - - # Adjust idle timeout on which hard drive? - HD="${HD:-'/dev/hda'}" - - # spindown time for HD (hdparm -S values) - AC_HD=${AC_HD:-'244'} - BATT_HD=${BATT_HD:-'4'} - - # Dirty synchronous ratio. At this percentage of dirty pages the process which - # calls write() does its own writeback - DIRTY_RATIO=${DIRTY_RATIO:-'40'} - - # cpu frequency scaling - # See Documentation/admin-guide/pm/cpufreq.rst for more info - DO_CPU=${CPU_MANAGE:-'0'} - CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'} - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'} - - # kernel default dirty buffer age - DEF_AGE=${DEF_AGE:-'30'} - DEF_UPDATE=${DEF_UPDATE:-'5'} - DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'} - DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'} - DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'} - DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'} - DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'} - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. 
This can be automated, but it's a work in progress that still needs - # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external - # interfaces, and that is currently always set to 100. So you don't need to - # change this on 2.6. - XFS_HZ=${XFS_HZ:-'100'} - - ############################################################################# - - KLEVEL="$(uname -r | - { - IFS='.' read a b c - echo $a.$b - } - )" - case "$KLEVEL" in - "2.4"|"2.6") - ;; - *) - echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2 - exit 1 - ;; - esac - - if [ ! -e /proc/sys/vm/laptop_mode ] ; then - echo "Kernel is not patched with laptop_mode patch." >&2 - exit 1 - fi - - if [ ! -w /proc/sys/vm/laptop_mode ] ; then - echo "You do not have enough privileges to enable laptop_mode." >&2 - exit 1 - fi - - # Remove an option (the first parameter) of the form option= from - # a mount options string (the rest of the parameters). - parse_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"'=[0-9]*,/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Remove an option (the first parameter) without any arguments from - # a mount option string (the rest of the parameters). - parse_nonumber_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"',/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Find out the state of a yes/no option (e.g. "atime"/"noatime") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, the option name the second, and the default - # value the third. The remainder is the mount options string. - # - # Example: - # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime - # - # If fstab contains, say, "rw" for this filesystem, then the result - # will be "defaults,atime". - parse_yesno_opts_wfstab () { - L_DEV="$1" - OPT="$2" - DEF_OPT="$3" - shift 3 - L_OPTS="$*" - PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)" - PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)" - # Watch for a default atime in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then - # option specified in fstab: extract the value and use it - if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then - echo "$PARSEDOPTS1,no$OPT" - else - # no$OPT not found -- so we must have $OPT. - echo "$PARSEDOPTS1,$OPT" - fi - else - # option not specified in fstab -- choose the default. - echo "$PARSEDOPTS1,$DEF_OPT" - fi - } - - # Find out the state of a numbered option (e.g. "commit=NNN") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, and the option name the second. The - # remainder is the mount options string in which the replacement - # must be done. - # - # Example: - # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7 - # - # If fstab contains, say, "commit=3,rw" for this filesystem, then the - # result will be "rw,commit=3". 
- parse_mount_opts_wfstab () { - L_DEV="$1" - OPT="$2" - shift 2 - L_OPTS="$*" - PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)" - # Watch for a default commit in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then - # option specified in fstab: extract the value, and use it - echo -n "$PARSEDOPTS1,$OPT=" - echo ",$FSTAB_OPTS," | sed \ - -e 's/.*,'"$OPT"'=//' \ - -e 's/,.*//' - else - # option not specified in fstab: set it to 0 - echo "$PARSEDOPTS1,$OPT=0" - fi - } - - deduce_fstype () { - MP="$1" - # My root filesystem unfortunately has - # type "unknown" in /etc/mtab. If we encounter - # "unknown", we try to get the type from fstab. - cat /etc/fstab | - grep -v '^#' | - while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do - if [ "$FSTAB_MP" = "$MP" ]; then - echo $FSTAB_FST - exit 0 - fi - done - } - - if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then - NOATIME_OPT=",noatime" - fi - - case "$1" in - start) - AGE=$((100*$MAX_AGE)) - XFS_AGE=$(($XFS_HZ*$MAX_AGE)) - echo -n "Starting laptop_mode" - - if [ -d /proc/sys/vm/pagebuf ] ; then - # (For 2.4 and early 2.6.) - # This only needs to be set, not reset -- it is only used when - # laptop mode is enabled. - echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # (A couple of early 2.6 laptop mode patches had these.) - # The same goes for these. - echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then - # (2.6.6) - # But not for these -- they are also used in normal - # operation. - echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # (2.6.7 upwards) - # And not for these either. These are in centisecs, - # not USER_HZ, so we have to use $AGE, not $XFS_AGE. 
- echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs - echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs - echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - - case "$KLEVEL" in - "2.4") - echo 1 > /proc/sys/vm/laptop_mode - echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo 5 > /proc/sys/vm/laptop_mode - echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ]; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - PARSEDOPTS="$(parse_mount_opts "$OPTS")" - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3") - PARSEDOPTS="$(parse_mount_opts commit "$OPTS")" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT - ;; - "xfs") - mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra $(($READAHEAD * 2)) $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - if [ $CPU_MAXFREQ = 'slowest' ]; then - CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq` - fi - echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - stop) - U_AGE=$((100*$DEF_UPDATE)) - B_AGE=$((100*$DEF_AGE)) - echo -n "Stopping laptop_mode" - echo 0 > /proc/sys/vm/laptop_mode - if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # These need to be restored, if there are no lm_*. - echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer - echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # These need to be restored as well. - echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs - echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs - echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - case "$KLEVEL" in - "2.4") - echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ] ; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - # Reset commit and atime options to defaults. 
- if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3") - PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)" - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - "xfs") - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra 256 $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - *) - echo "Usage: $0 {start|stop}" 2>&1 - exit 1 - ;; - - esac - - exit 0 - - -ACPI integration ----------------- - -Dax Kelson submitted this so that the ACPI acpid daemon will -kick off the laptop_mode script and run hdparm. The part that -automatically disables laptop mode when the battery is low was -written by Jan Topinski. - -/etc/acpi/events/ac_adapter:: - - event=ac_adapter - action=/etc/acpi/actions/ac.sh %e - -/etc/acpi/events/battery:: - - event=battery.* - action=/etc/acpi/actions/battery.sh %e - -/etc/acpi/actions/ac.sh:: - - #!/bin/bash - - # ac on/offline event handler - - status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state` - - case $status in - "on-line") - /sbin/laptop_mode stop - exit 0 - ;; - "off-line") - /sbin/laptop_mode start - exit 0 - ;; - esac - - -/etc/acpi/actions/battery.sh:: - - #! /bin/bash - - # Automatically disable laptop mode when the battery almost runs out. - - BATT_INFO=/proc/acpi/battery/$2/state - - if [[ -f /proc/sys/vm/laptop_mode ]] - then - LM=`cat /proc/sys/vm/laptop_mode` - if [[ $LM -gt 0 ]] - then - if [[ -f $BATT_INFO ]] - then - # Source the config file only now that we know we need - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'} - - ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`" - if [[ ACTION -eq "discharging" ]] - then - PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - fi - if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES)) - then - /sbin/laptop_mode stop - fi - else - logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path." - fi - fi - fi - - -Monitoring tool ---------------- - -Bartek Kania submitted this, it can be used to measure how much time your disk -spends spun up/down. 
See tools/laptop/dslm/dslm.c diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 245bf6394935..ca6ebeb5171c 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -41,7 +41,6 @@ Currently, these files are in /proc/sys/vm: - extfrag_threshold - highmem_is_dirtyable - hugetlb_shm_group -- laptop_mode - legacy_va_layout - lowmem_reserve_ratio - max_map_count @@ -363,13 +362,6 @@ hugetlb_shm_group contains group id that is allowed to create SysV shared memory segment using hugetlb page. -laptop_mode -=========== - -laptop_mode is a knob that controls "laptop mode". All the things that are -controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst. - - legacy_va_layout ================ diff --git a/block/blk-mq.c b/block/blk-mq.c index a29d8ac9d3e3..4bae7c4c664e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -811,9 +811,6 @@ void blk_mq_free_request(struct request *rq) blk_mq_finish_request(rq); - if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->disk->bdi); - rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c466ccbed69..15eb463d5a9b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3305,8 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode) /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is - * not strictly speaking necessary (and for users of - * laptop_mode, not even desirable). However, to do otherwise + * not strictly speaking necessary. However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> diff --git a/fs/sync.c b/fs/sync.c index 431fc5f5be06..6330150792f6 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -104,8 +104,6 @@ void ksys_sync(void) iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); sync_bdevs(true); - if (unlikely(laptop_mode)) - laptop_sync_completion(); } SYSCALL_DEFINE0(sync) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bc71aa9dcee8..a2014fb1bc66 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -845,15 +845,6 @@ xfs_fs_sync_fs( if (error) return error; - if (laptop_mode) { - /* - * The disk must be active because we're syncing. - * We schedule log work now (now that the disk is - * active) instead of later (when it might not be). - */ - flush_delayed_work(&mp->m_log->l_work); - } - /* * If we are called with page faults frozen out, it means we are about * to freeze the transaction subsystem. 
Take the opportunity to shut diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 0217c1073735..c88fd4d37d1f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -46,7 +46,6 @@ enum wb_reason { WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, WB_REASON_FS_FREE_SPACE, /* * There is no bdi forker thread any more and works are done @@ -204,8 +203,6 @@ struct backing_dev_info { char dev_name[64]; struct device *owner; - struct timer_list laptop_mode_wb_timer; - #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f48e8ccffe81..e530112c4b3a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -328,9 +328,6 @@ struct dirty_throttle_control { bool dirty_exceeded; }; -void laptop_io_completion(struct backing_dev_info *info); -void laptop_sync_completion(void); -void laptop_mode_timer_fn(struct timer_list *t); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK @@ -342,7 +339,6 @@ extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; -extern int laptop_mode; void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 311a341e6fe4..b6f94e97788a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -42,7 +42,6 @@ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ - EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 63d1464cb71c..6ea9ea8413fa 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -183,7 +183,7 @@ enum VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ - VM_LAPTOP_MODE=23, /* vm laptop mode */ + VM_BLOCK_DUMP=24, /* block dump mode */ VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c5740c6d37a2..a0e26d1b717f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1034,7 +1034,6 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; - timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -1156,8 +1155,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - timer_delete_sync(&bdi->laptop_mode_wb_timer); - /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ccdeb0e84d39..601a5e048d12 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -109,14 +109,6 @@ 
EXPORT_SYMBOL_GPL(dirty_writeback_interval); */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ -/* - * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: - * a full sync is triggered after this time elapses without any disk activity. - */ -int laptop_mode; - -EXPORT_SYMBOL(laptop_mode); - /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; @@ -1843,17 +1835,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, balance_domain_limits(mdtc, strictlimit); } - /* - * In laptop mode, we wait until hitting the higher threshold - * before starting background writeout, and then write out all - * the way down to the lower threshold. So slow writers cause - * minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. - */ - if (!laptop_mode && nr_dirty > gdtc->bg_thresh && - !writeback_in_progress(wb)) + if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* @@ -1876,10 +1858,6 @@ free_running: break; } - /* Start writeback even when in laptop mode */ - if (unlikely(!writeback_in_progress(wb))) - wb_start_background_writeback(wb); - mem_cgroup_flush_foreign(wb); /* @@ -2198,41 +2176,6 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int } #endif -void laptop_mode_timer_fn(struct timer_list *t) -{ - struct backing_dev_info *backing_dev_info = - timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); - - wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); -} - -/* - * We've spun up the disk and we're in laptop mode: schedule writeback - * of all dirty data a few seconds from now. If the flush is already scheduled - * then push it back - the user is still using the disk. - */ -void laptop_io_completion(struct backing_dev_info *info) -{ - mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); -} - -/* - * We're in laptop mode and we've just synced. The sync's writes will have - * caused another writeback to be scheduled by laptop_io_completion. - * Nothing needs to be written back anymore, so we unschedule the writeback. - */ -void laptop_sync_completion(void) -{ - struct backing_dev_info *bdi; - - rcu_read_lock(); - - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - timer_delete(&bdi->laptop_mode_wb_timer); - - rcu_read_unlock(); -} - /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. @@ -2263,6 +2206,19 @@ static int page_writeback_cpu_online(unsigned int cpu) #ifdef CONFIG_SYSCTL +static int laptop_mode; +static int laptop_mode_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos); + + if (!ret && write) + pr_warn("%s: vm.laptop_mode is deprecated. 
Ignoring setting.\n", + current->comm); + + return ret; +} + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -2332,7 +2288,7 @@ static const struct ctl_table vm_page_writeback_sysctls[] = { .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = laptop_mode_handler, }, }; #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c87945fa761..fc5691afb998 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -104,13 +104,13 @@ struct scan_control { unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; - /* Writepage batching in laptop mode; RECLAIM_WRITE */ + /* zone_reclaim_mode, boost reclaim */ unsigned int may_writepage:1; - /* Can mapped folios be reclaimed? */ + /* zone_reclaim_mode */ unsigned int may_unmap:1; - /* Can folios be swapped as part of reclaim? */ + /* zome_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ @@ -6365,13 +6365,6 @@ retry: if (sc->compaction_ready) break; - - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc->priority < DEF_PRIORITY - 2) - sc->may_writepage = 1; } while (--sc->priority >= 0); last_pgdat = NULL; @@ -6580,7 +6573,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = 1, }; @@ -6624,7 +6617,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, @@ -6670,7 +6663,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), @@ -7051,7 +7044,7 @@ restart: * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ - sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_writepage = !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* @@ -7061,13 +7054,6 @@ restart: */ kswapd_age_node(pgdat, &sc); - /* - * If we're getting trouble reclaiming, start doing writepage - * even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; @@ -7799,7 +7785,7 @@ int user_proactive_reclaim(char *buf, .reclaim_idx = gfp_zone(gfp_mask), .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), .may_unmap = 1, .may_swap = 1, -- cgit v1.2.3 From bd4526e64bcff4cbeaefbbd91c40d3e38b9920a9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 3 Dec 2025 22:45:11 +0000 Subject: maple_tree: remove struct maple_alloc struct maple_alloc is deprecated after the maple tree conversion to sheaves, remove the references from the header file. 
Link: https://lkml.kernel.org/r/20251203224511.469978-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Jinjie Ruan Reviewed-by: Liam R. Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 66f98a3da8d8..1323c28a7470 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -129,13 +129,6 @@ struct maple_arange_64 { struct maple_metadata meta; }; -struct maple_alloc { - unsigned long total; - unsigned char node_count; - unsigned int request_count; - struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; -}; - struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ @@ -306,7 +299,6 @@ struct maple_node { }; struct maple_range_64 mr64; struct maple_arange_64 ma64; - struct maple_alloc alloc; }; }; -- cgit v1.2.3 From 84355caa271a0eab2d1b55ff73aa8aa3e4627661 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 17 Dec 2025 12:02:13 +0100 Subject: mm/mm_init: replace simple_strtoul with kstrtobool in set_hashdist Use bool for 'hashdist' and replace simple_strtoul() with kstrtobool() for parsing the 'hashdist=' boot parameter. Unlike simple_strtoul(), which returns an unsigned long, kstrtobool() converts the string directly to bool and avoids implicit casting. Check the return value of kstrtobool() and reject invalid values. This adds error handling while preserving behavior for existing values, and removes use of the deprecated simple_strtoul() helper. The current code silently sets 'hashdist = 0' if parsing fails, instead of leaving the default value (HASHDIST_DEFAULT) unchanged. Additionally, kstrtobool() accepts common boolean strings such as "on" and "off". Link: https://lkml.kernel.org/r/20251217110214.50807-1-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- include/linux/memblock.h | 4 ++-- mm/mm_init.c | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 221118b5a16e..6ec5e9ac0699 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -598,9 +598,9 @@ extern void *alloc_large_system_hash(const char *tablename, */ #ifdef CONFIG_NUMA #define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) -extern int hashdist; /* Distribute hashes across NUMA nodes? */ +extern bool hashdist; /* Distribute hashes across NUMA nodes? 
*/ #else -#define hashdist (0) +#define hashdist (false) #endif #ifdef CONFIG_MEMTEST diff --git a/mm/mm_init.c b/mm/mm_init.c index fc2a6f1e518f..d86248566a56 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -646,21 +646,18 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -int hashdist = HASHDIST_DEFAULT; +bool hashdist = HASHDIST_DEFAULT; static int __init set_hashdist(char *str) { - if (!str) - return 0; - hashdist = simple_strtoul(str, &str, 0); - return 1; + return kstrtobool(str, &hashdist) == 0; } __setup("hashdist=", set_hashdist); static inline void fixup_hashdist(void) { if (num_node_state(N_MEMORY) == 1) - hashdist = 0; + hashdist = false; } #else static inline void fixup_hashdist(void) {} -- cgit v1.2.3 From 241b3a09639c317bdcaeea6721b7d1aabef341f9 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 19 Dec 2025 11:32:18 +0000 Subject: mm: clarify GFP_ATOMIC/GFP_NOWAIT doc-comment The current description of contexts where it's invalid to make GFP_ATOMIC and GFP_NOWAIT calls is rather vague. Replace this with a direct description of the actual contexts of concern and refer to the RT docs where this is explained more discursively. While rejigging this prose, also move the documentation of GFP_NOWAIT to the GFP_NOWAIT section. Link: https://lore.kernel.org/all/d912480a-5229-4efe-9336-b31acded30f5@suse.cz/ Link: https://lkml.kernel.org/r/20251219-b4-gfp_atomic-comment-v2-1-4c4ce274c2b6@google.com Signed-off-by: Brendan Jackman Acked-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 3de43b12209e..814bb2892f99 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -309,8 +309,10 @@ enum { * * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower * watermark is applied to allow access to "atomic reserves". - * The current implementation doesn't support NMI and few other strict - * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT. + * The current implementation doesn't support NMI, nor contexts that disable + * preemption under PREEMPT_RT. This includes raw_spin_lock() and plain + * preempt_disable() - see "Memory allocation" in + * Documentation/core-api/real-time/differences.rst for more info. * * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. @@ -321,6 +323,7 @@ enum { * %GFP_NOWAIT is for kernel allocations that should not stall for direct * reclaim, start physical IO or use any filesystem callback. It is very * likely to fail to allocate memory, even for very small allocations. + * The same restrictions on calling contexts apply as for %GFP_ATOMIC. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. 
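To make the distinction the updated comment draws more concrete, here is a minimal sketch (not part of this patch; the lock name and allocation sizes are made up) of the two kinds of call site it describes:

    /* Illustrative sketch only, not part of this patch. */
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_lock);

    /* Cannot sleep (e.g. under a spinlock) and needs the allocation to
     * succeed: GFP_ATOMIC, which may dip into the atomic reserves. */
    static void *demo_alloc_atomic(void)
    {
            void *p;

            spin_lock(&demo_lock);
            p = kmalloc(64, GFP_ATOMIC);
            spin_unlock(&demo_lock);
            return p;
    }

    /* Opportunistic allocation: no reclaim, no IO, no fs callbacks, and
     * the caller must tolerate NULL: GFP_NOWAIT. The calling-context
     * restrictions (no NMI, no preemption-disabled sections on
     * PREEMPT_RT) are the same as for GFP_ATOMIC. */
    static void *demo_alloc_nowait(void)
    {
            return kmalloc(64, GFP_NOWAIT);
    }

Either flag can still fail, GFP_NOWAIT especially so, which is why callers of the second variant need a fallback path.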
-- cgit v1.2.3 From 7db0787000d44d52710e5cdd67113458fa28f3cd Mon Sep 17 00:00:00 2001 From: Wentao Guan Date: Thu, 6 Nov 2025 19:09:29 +0800 Subject: mm: cleanup vma_iter_bulk_alloc commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()"), removed the only user and mas_expected_entries has been removed, since commit e3852a1213ffc ("maple_tree: Drop bulk insert support"). Also cleanup the mas_expected_entries in maple_tree.h. No functional change. Link: https://lkml.kernel.org/r/20251106110929.3522073-1-guanwentao@uniontech.com Signed-off-by: Wentao Guan Reviewed-by: Liam R. Howlett Cc: Anshuman Khandual Cc: Cheng Nie Cc: Guan Wentao Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Jann Horn Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 - mm/vma.h | 6 ------ 2 files changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 1323c28a7470..7b8aad47121e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -528,7 +528,6 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); -int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); diff --git a/mm/vma.h b/mm/vma.h index 8526f22c9f5a..d51efd9da113 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -561,12 +561,6 @@ static inline unsigned long vma_iter_end(struct vma_iterator *vmi) return vmi->mas.last + 1; } -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, - unsigned long count) -{ - return mas_expected_entries(&vmi->mas, count); -} - static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { -- cgit v1.2.3 From 9e80e66ddaf736e5ca80cba8adf8d497bd53092f Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Sun, 21 Dec 2025 07:56:03 -0500 Subject: mm, hugetlb: implement movable_gigantic_pages sysctl This reintroduces a concept removed by: commit d6cb41cc44c6 ("mm, hugetlb: remove hugepages_treat_as_movable sysctl") This sysctl provides flexibility between ZONE_MOVABLE use cases: 1) onlining memory in ZONE_MOVABLE to maintain hotplug compatibility 2) onlining memory in ZONE_MOVABLE to make hugepage allocate reliable When ZONE_MOVABLE is used to make huge page allocation more reliable, disallowing gigantic pages memory in this region is pointless. If hotplug is not a requirement, we can loosen the restrictions to allow 1GB gigantic pages in ZONE_MOVABLE. Since 1GB can be difficult to migrate / has impacts on compaction / defragmentation, we don't enable this by default. Notably, 1GB pages can only be migrated if another 1GB page is available - so hot-unplug will fail if such a page cannot be found. However, since there are scenarios where gigantic pages are migratable, we should allow use of these on movable regions. When not valid 1GB is available for migration, hot-unplug will retry indefinitely (or until interrupted). 
For example: echo 0 > node0/hugepages/..-1GB/nr_hugepages # clear node0 1GB pages echo 1 > node1/hugepages/..-1GB/nr_hugepages # reserve node1 1GB page ./alloc_huge_node1 & # Allocate a 1GB page on node1 ./node1_offline & # attempt to offline all node1 memory echo 1 > node0/hugepages/..-1GB/nr_hugepages # reserve node0 1GB page In this example, node1_offline will block indefinitely until the final step, when a node0 1GB page is made available. Note: Boot-time CMA is not possible for driver-managed hotplug memory, as CMA requires the memory to be registered as SystemRAM at boot time. Additionally, 1GB huge pages are not supported by THP. Link: https://lkml.kernel.org/r/20251221125603.2364174-1-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: David Rientjes Link: https://lore.kernel.org/all/20180201193132.Hk7vI_xaU%25akpm@linux-foundation.org/ Acked-by: David Hildenbrand (Red Hat) Acked-by: David Rientjes Cc: Mel Gorman Cc: Michal Hocko Cc: "David Hildenbrand (Red Hat)" Cc: Gregory Price Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/memory-hotplug.rst | 14 +++++++++++-- Documentation/admin-guide/sysctl/vm.rst | 28 +++++++++++++++++++++++++ include/linux/hugetlb.h | 3 ++- mm/hugetlb_sysctl.c | 11 ++++++++++ 4 files changed, 53 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 33c886f3d198..6581558fd0d7 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -612,8 +612,9 @@ ZONE_MOVABLE, especially when fine-tuning zone ratios: allocations and silently create a zone imbalance, usually triggered by inflation requests from the hypervisor. -- Gigantic pages are unmovable, resulting in user space consuming a - lot of unmovable memory. +- Gigantic pages are unmovable when an architecture does not support + huge page migration and/or the ``movable_gigantic_pages`` sysctl is false. + See Documentation/admin-guide/sysctl/vm.rst for more info on this sysctl. - Huge pages are unmovable when an architectures does not support huge page migration, resulting in a similar issue as with gigantic pages. @@ -672,6 +673,15 @@ block might fail: - Concurrent activity that operates on the same physical memory area, such as allocating gigantic pages, can result in temporary offlining failures. +- When an admin sets the ``movable_gigantic_pages`` sysctl to true, gigantic + pages are allowed in ZONE_MOVABLE. This only allows migratable gigantic + pages to be allocated; however, if there are no eligible destination gigantic + pages at offline, the offlining operation will fail. + + Users leveraging ``movable_gigantic_pages`` should weigh the value of + ZONE_MOVABLE for increasing the reliability of gigantic page allocation + against the potential loss of hot-unplug reliability. + - Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap Optimization (HVO) is enabled. 
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index ca6ebeb5171c..b98ccb5cb210 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -53,6 +53,7 @@ Currently, these files are in /proc/sys/vm: - mmap_min_addr - mmap_rnd_bits - mmap_rnd_compat_bits +- movable_gigantic_pages - nr_hugepages - nr_hugepages_mempolicy - nr_overcommit_hugepages @@ -620,6 +621,33 @@ This value can be changed after boot using the /proc/sys/vm/mmap_rnd_compat_bits tunable +movable_gigantic_pages +====================== + +This parameter controls whether gigantic pages may be allocated from +ZONE_MOVABLE. If set to non-zero, gigantic pages can be allocated +from ZONE_MOVABLE. ZONE_MOVABLE memory may be created via the kernel +boot parameter `kernelcore` or via memory hotplug as discussed in +Documentation/admin-guide/mm/memory-hotplug.rst. + +Support may depend on specific architecture. + +Note that using ZONE_MOVABLE gigantic pages make memory hotremove unreliable. + +Memory hot-remove operations will block indefinitely until the admin reserves +sufficient gigantic pages to service migration requests associated with the +memory offlining process. As HugeTLB gigantic page reservation is a manual +process (via `nodeN/hugepages/.../nr_hugepages` interfaces) this may not be +obvious when just attempting to offline a block of memory. + +Additionally, as multiple gigantic pages may be reserved on a single block, +it may appear that gigantic pages are available for migration when in reality +they are in the process of being removed. For example if `memoryN` contains +two gigantic pages, one reserved and one allocated, and an admin attempts to +offline that block, this operations may hang indefinitely unless another +reserved gigantic page is available on another block `memoryM`. 
+ + nr_hugepages ============ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e51b8ef0cebd..694f6e83c637 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,6 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); +extern int movable_gigantic_pages __read_mostly; extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; @@ -929,7 +930,7 @@ static inline bool hugepage_movable_supported(struct hstate *h) if (!hugepage_migration_supported(h)) return false; - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic(h) && !movable_gigantic_pages) return false; return true; } diff --git a/mm/hugetlb_sysctl.c b/mm/hugetlb_sysctl.c index bd3077150542..e74cf18ad431 100644 --- a/mm/hugetlb_sysctl.c +++ b/mm/hugetlb_sysctl.c @@ -8,6 +8,8 @@ #include "hugetlb_internal.h" +int movable_gigantic_pages; + #ifdef CONFIG_SYSCTL static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *length, @@ -125,6 +127,15 @@ static const struct ctl_table hugetlb_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION + { + .procname = "movable_gigantic_pages", + .data = &movable_gigantic_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif }; void __init hugetlb_sysctl_init(void) -- cgit v1.2.3 From a8d933dc3354bfb9db1fc0e09c289ec1778ee271 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 25 Dec 2025 21:02:13 +0000 Subject: mm/vmstat: remove unused node and zone state helpers Several helper functions for managing node and zone states have become obsolete and no longer have any callers within the kernel. inc_node_state() inc_zone_state() dec_zone_state() This commit removes the dead code. 
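As an illustrative aside (not part of the patch), callers keep using the page-based wrappers or the __-prefixed variants that survive this cleanup, for example:

    /* Sketch only: counter helpers that remain after this cleanup. */
    #include <linux/mmzone.h>
    #include <linux/vmstat.h>

    static void demo_vmstat(struct page *page, struct pglist_data *pgdat)
    {
            /* Interrupt-safe, page-based wrappers ... */
            inc_node_page_state(page, NR_FILE_PAGES);
            dec_node_page_state(page, NR_FILE_PAGES);

            /* ... and the __-prefixed variants for callers that already
             * run with interrupts/preemption suitably disabled. */
            __inc_node_state(pgdat, NR_FILE_PAGES);
            __dec_node_state(pgdat, NR_FILE_PAGES);
    }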
Link: https://lkml.kernel.org/r/20251225210213.2553-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Joshua Hahn Acked-by: David Hildenbrand (Red Hat) Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 6 ------ mm/vmstat.c | 15 --------------- 2 files changed, 21 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3398a345bda8..cf559e2ce1d4 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -286,10 +286,8 @@ void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); -extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); -extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); @@ -394,10 +392,6 @@ static inline void __dec_node_page_state(struct page *page, #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state -#define inc_zone_state __inc_zone_state -#define inc_node_state __inc_node_state -#define dec_zone_state __dec_zone_state - #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } diff --git a/mm/vmstat.c b/mm/vmstat.c index bd2af431ff86..6ae8891c9693 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -672,11 +672,6 @@ void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } EXPORT_SYMBOL(mod_node_page_state); -void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -{ - mod_node_state(pgdat, item, 1, 1); -} - void inc_node_page_state(struct page *page, enum node_stat_item item) { mod_node_state(page_pgdat(page), item, 1, 1); @@ -725,16 +720,6 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(dec_zone_page_state); -void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -{ - unsigned long flags; - - local_irq_save(flags); - __inc_node_state(pgdat, item); - local_irq_restore(flags); -} -EXPORT_SYMBOL(inc_node_state); - void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long delta) { -- cgit v1.2.3 From f9b74c13b773b7c7e4920d7bc214ea3d5f37b422 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 31 Dec 2025 03:00:26 +0000 Subject: mm/mmu_gather: remove @delay_remap of __tlb_remove_page_size() __tlb_remove_page_size() is only used in tlb_remove_page_size() with @delay_remap set to false and it is passed directly to __tlb_remove_folio_pages_size(). Remove @delay_remap of __tlb_remove_page_size() and call __tlb_remove_folio_pages_size() with false @delay_remap. 
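Gathered in one place for readability, the resulting call chain looks roughly as follows (a sketch condensed from the diff below, not an independent implementation):

    /* Sketch of the call chain after this patch. */
    static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                                            struct page *page, int page_size)
    {
            /* Callers no longer pass a delay_rmap argument ... */
            if (__tlb_remove_page_size(tlb, page, page_size))
                    tlb_flush_mmu(tlb);
    }

    bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                                int page_size)
    {
            /* ... it is hard-coded to false at the single remaining user. */
            return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size);
    }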
Link: https://lkml.kernel.org/r/20251231030026.15938-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: SeongJae Park Acked-by: David Hildenbrand (Red Hat) Acked-by: Will Deacon Acked-by: Heiko Carstens # s390 Cc: Alexander Gordeev Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Christian Borntraeger Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 6 ++---- include/asm-generic/tlb.h | 5 ++--- mm/mmu_gather.c | 5 ++--- 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 1e50f6f1ad9d..0b7b4df94b24 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -24,7 +24,7 @@ static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, bool delay_rmap, int page_size); + struct page *page, int page_size); static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); @@ -46,10 +46,8 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, * s390 doesn't delay rmap removal. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, bool delay_rmap, int page_size) + struct page *page, int page_size) { - VM_WARN_ON_ONCE(delay_rmap); - free_folio_and_swap_cache(page_folio(page)); return false; } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 4d679d2a206b..3975f7d11553 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -287,8 +287,7 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size); +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); @@ -510,7 +509,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, false, page_size)) + if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); } diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 7468ec388455..2faa23d7f8d4 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -210,10 +210,9 @@ bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, PAGE_SIZE); } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); + return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size); } #endif /* MMU_GATHER_NO_GATHER */ -- cgit v1.2.3 From 5173ae0a068d64643ccf4915b7cbedf82810a592 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 18 Jan 2026 19:09:40 +0000 Subject: mm/khugepaged: map dirty/writeback pages failures to EAGAIN Patch series "mm/khugepaged: fix dirty page handling for MADV_COLLAPSE", v5. MADV_COLLAPSE on file-backed mappings fails with -EINVAL when TEXT pages are dirty. This affects scenarios like package/container updates or executing binaries immediately after writing them, etc. 
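With the -EAGAIN mapping this series introduces, such callers can simply retry once writeback has had a chance to complete; a hedged userspace sketch (the retry count, delay and the fallback MADV_COLLAPSE definition are illustrative only):

    /* Userspace sketch: retry MADV_COLLAPSE while async writeback finishes. */
    #include <errno.h>
    #include <stddef.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_COLLAPSE
    #define MADV_COLLAPSE 25        /* uapi value on kernels that support it */
    #endif

    static int collapse_with_retry(void *addr, size_t len)
    {
            for (int tries = 0; tries < 10; tries++) {
                    if (madvise(addr, len, MADV_COLLAPSE) == 0)
                            return 0;               /* collapsed */
                    if (errno != EAGAIN)
                            return -1;              /* hard failure */
                    usleep(100 * 1000);             /* let writeback complete */
            }
            return -1;
    }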
The issue is that collapse_file() triggers async writeback and returns SCAN_FAIL (maps to -EINVAL), expecting khugepaged to revisit later. But MADV_COLLAPSE is synchronous and userspace expects immediate success or a clear retry signal. Reproduction: - Compile or copy 2MB-aligned executable to XFS/ext4 FS - Call MADV_COLLAPSE on .text section - First call fails with -EINVAL (text pages dirty from copy) - Second call succeeds (async writeback completed) Issue Report: https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com This patch (of 2): When collapse_file encounters dirty or writeback pages in file-backed mappings, it currently returns SCAN_FAIL which maps to -EINVAL. This is misleading as EINVAL suggests invalid arguments, whereas dirty/writeback pages represent transient conditions that may resolve on retry. Introduce SCAN_PAGE_DIRTY_OR_WRITEBACK to cover both dirty and writeback states, mapping it to -EAGAIN. For MADV_COLLAPSE, this provides userspace with a clear signal that retry may succeed after writeback completes. For khugepaged, this is harmless as it will naturally revisit the range during periodic scans after async writeback completes. Link: https://lkml.kernel.org/r/20260118190939.8986-2-shivankg@amd.com Link: https://lkml.kernel.org/r/20260118190939.8986-4-shivankg@amd.com Fixes: 34488399fa08 ("mm/madvise: add file and shmem support to MADV_COLLAPSE") Signed-off-by: Shivank Garg Reported-by: Branden Moore Closes: https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com Reviewed-by: Dev Jain Reviewed-by: Lance Yang Reviewed-by: Baolin Wang Reviewed-by: wang lian Acked-by: David Hildenbrand (Red Hat) Cc: Barry Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Zach O'Keefe Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 3 ++- mm/khugepaged.c | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4cde53b45a85..4e41bff31888 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -37,7 +37,8 @@ EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ EM( SCAN_STORE_FAILED, "store_failed") \ EM( SCAN_COPY_MC, "copy_poisoned_page") \ - EMe(SCAN_PAGE_FILLED, "page_filled") + EM( SCAN_PAGE_FILLED, "page_filled") \ + EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback") #undef EM #undef EMe diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 97d1b2824386..219dfa2e523c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -58,6 +58,7 @@ enum scan_result { SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, + SCAN_PAGE_DIRTY_OR_WRITEBACK, }; #define CREATE_TRACE_POINTS @@ -1967,11 +1968,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, */ xas_unlock_irq(&xas); filemap_flush(mapping); - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_trylock(folio)) { folio_get(folio); @@ -2018,7 +2019,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, * folio is dirty because it hasn't been flushed * since first write. 
*/ - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto out_unlock; } @@ -2747,6 +2748,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: + case SCAN_PAGE_DIRTY_OR_WRITEBACK: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to -- cgit v1.2.3 From ba1c86874e25e95de9b253570bb50cc3b5df542e Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:35 +0200 Subject: alpha: introduce arch_zone_limits_init() Patch series "arch, mm: consolidate hugetlb early reservation", v3. Order in which early memory reservation for hugetlb happens depends on architecture, on configuration options and on command line parameters. Some architectures rely on the core MM to call hugetlb_bootmem_alloc() while others call it very early to allow pre-allocation of HVO-style vmemmap. When hugetlb_cma is supported by an architecture it is initialized during setup_arch() and then later hugetlb_init code needs to understand did it happen or not. To make everything consistent and unified, both reservation of hugetlb memory from bootmem and creation of CMA areas for hugetlb must be called from core MM initialization and it would have been a simple change. However, HVO-style pre-initialization ordering requirements slightly complicate things and for HVO pre-init to work sparse and memory map should be initialized after hugetlb reservations. This required pulling out the call to free_area_init() out of setup_arch() path and moving it MM initialization and this is what the first 23 patches do. These changes are deliberately split into per-arch patches that change how the zone limits are calculated for each architecture and the patches 22 and 23 just remove the calls to free_area_init() and sprase_init() from arch/*. Patch 24 is a simple cleanup for MIPS. Patches 25 and 26 actually consolidate hugetlb reservations and patches 27 and 28 perform some aftermath cleanups. This patch (of 29): Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20260111082105.290734-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Magnus Lindholm Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 15 ++++++++++----- include/linux/mm.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 4c5ab9cd8a0a..cd0cb1abde5f 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -208,12 +208,8 @@ callback_init(void * kernel_end) return kernel_end; } -/* - * paging_init() sets up the memory map. - */ -void __init paging_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfn) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; unsigned long dma_pfn; dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; @@ -221,8 +217,17 @@ void __init paging_init(void) max_zone_pfn[ZONE_DMA] = dma_pfn; max_zone_pfn[ZONE_NORMAL] = max_pfn; +} + +/* + * paging_init() sets up the memory map. + */ +void __init paging_init(void) +{ + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; /* Initialize mem_map[]. */ + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); /* Initialize the kernel's ZERO_PGE. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index ab2e7e30aef9..477339b7a032 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3556,6 +3556,7 @@ static inline unsigned long get_num_physpages(void) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); +void arch_zone_limits_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); -- cgit v1.2.3 From d49004c5f0c140bb83c87fab46dcf449cf00eb24 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:57 +0200 Subject: arch, mm: consolidate initialization of nodes, zones and memory map To initialize node, zone and memory map data structures every architecture calls free_area_init() during setup_arch() and passes it an array of zone limits. Beside code duplication it creates "interesting" ordering cases between allocation and initialization of hugetlb and the memory map. Some architectures allocate hugetlb pages very early in setup_arch() in certain cases, some only create hugetlb CMA areas in setup_arch() and sometimes hugetlb allocations happen mm_core_init(). With arch_zone_limits_init() helper available now on all architectures it is no longer necessary to call free_area_init() from architecture setup code. Rather core MM initialization can call arch_zone_limits_init() in a single place. This allows to unify ordering of hugetlb vs memory map allocation and initialization. Remove the call to free_area_init() from architecture specific code and place it in a new mm_core_init_early() function that is called immediately after setup_arch(). 
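In rough outline the consolidated path looks like the sketch below; the function body is illustrative, and only the names already mentioned in this changelog (arch_zone_limits_init(), free_area_init(), hugetlb_bootmem_alloc(), mm_core_init_early()) are taken from the series:

    /* Illustrative sketch of the consolidated early-init path. */
    #include <linux/hugetlb.h>
    #include <linux/init.h>
    #include <linux/mm.h>
    #include <linux/mmzone.h>

    void __init mm_core_init_early(void)
    {
            unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };

            /* Architectures now only report their zone limits ... */
            arch_zone_limits_init(max_zone_pfn);

            /* ... and node/zone/memmap setup happens in one place,
             * right after setup_arch() has run in start_kernel(). */
            free_area_init(max_zone_pfn);

            /* Hugetlb bootmem reservation is moved here as well. */
            hugetlb_bootmem_alloc();
    }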
After this refactoring it is possible to consolidate hugetlb allocations and eliminate differences in ordering of hugetlb and memory map initialization among different architectures. As the first step of this consolidation move hugetlb_bootmem_alloc() to mm_core_early_init(). Link: https://lkml.kernel.org/r/20260111082105.290734-24-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 9 +-------- arch/arc/mm/init.c | 5 ----- arch/arm/mm/init.c | 16 ---------------- arch/arm64/mm/init.c | 4 ---- arch/csky/kernel/setup.c | 4 ---- arch/hexagon/mm/init.c | 12 ------------ arch/loongarch/include/asm/pgtable.h | 2 -- arch/loongarch/kernel/setup.c | 2 -- arch/loongarch/mm/init.c | 8 -------- arch/m68k/mm/init.c | 3 --- arch/m68k/mm/mcfmmu.c | 3 --- arch/m68k/mm/motorola.c | 6 +----- arch/m68k/mm/sun3mmu.c | 9 --------- arch/microblaze/mm/init.c | 7 ------- arch/mips/loongson64/numa.c | 4 ---- arch/mips/mm/init.c | 5 ----- arch/mips/sgi-ip27/ip27-memory.c | 4 ---- arch/nios2/mm/init.c | 6 ------ arch/openrisc/mm/init.c | 10 ---------- arch/parisc/mm/init.c | 9 --------- arch/powerpc/mm/mem.c | 4 ---- arch/riscv/mm/init.c | 9 --------- arch/s390/mm/init.c | 5 ----- arch/sh/mm/init.c | 5 ----- arch/sparc/mm/init_64.c | 11 ----------- arch/sparc/mm/srmmu.c | 7 ------- arch/um/kernel/mem.c | 5 ----- arch/x86/mm/init.c | 10 ---------- arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 -- arch/x86/mm/mm_internal.h | 1 - arch/xtensa/mm/init.c | 4 ---- include/linux/mm.h | 4 ++-- init/main.c | 1 + mm/mm_init.c | 18 ++++++++++-------- 35 files changed, 15 insertions(+), 200 deletions(-) (limited to 'include') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index cd0cb1abde5f..9531cbc761c0 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -220,17 +220,10 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) } /* - * paging_init() sets up the memory map. + * paging_init() initializes the kernel's ZERO_PGE. */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - - /* Initialize mem_map[]. */ - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); - - /* Initialize the kernel's ZERO_PGE. 
*/ memset(absolute_pointer(ZERO_PGE), 0, PAGE_SIZE); } diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index ff7974d38011..a5e92f46e5d1 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -102,8 +102,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) */ void __init setup_arch_memory(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - setup_initial_init_mm(_text, _etext, _edata, _end); /* first page of system - kernel .vector starts here */ @@ -158,9 +156,6 @@ void __init setup_arch_memory(void) arch_pfn_offset = min(min_low_pfn, min_high_pfn); kmap_init(); #endif /* CONFIG_HIGHMEM */ - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } void __init arch_mm_preinit(void) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index bdcc3639681f..a8f7b4084715 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -118,15 +118,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) #endif } -static void __init zone_sizes_init(unsigned long min, unsigned long max_low, - unsigned long max_high) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - #ifdef CONFIG_HAVE_ARCH_PFN_VALID int pfn_valid(unsigned long pfn) { @@ -222,13 +213,6 @@ void __init bootmem_init(void) * done after the fixed reservations */ sparse_init(); - - /* - * Now free the memory - free_area_init needs - * the sparse mem_map arrays initialized by sparse_init() - * for memmap_init_zone(), otherwise all PFNs are invalid. - */ - zone_sizes_init(min_low_pfn, max_low_pfn, max_pfn); } /* diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 06815d34cc11..3641e88ea871 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -134,7 +134,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) static void __init dma_limits_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; phys_addr_t __maybe_unused acpi_zone_dma_limit; phys_addr_t __maybe_unused dt_zone_dma_limit; phys_addr_t __maybe_unused dma32_phys_limit = @@ -160,9 +159,6 @@ static void __init dma_limits_init(void) #endif if (!arm64_dma_phys_limit) arm64_dma_phys_limit = PHYS_MASK + 1; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } int pfn_is_map_memory(unsigned long pfn) diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 8968815d93e6..4bf3c01ead3a 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -63,7 +63,6 @@ static void __init csky_memblock_init(void) { unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET); unsigned long sseg_size = PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET); - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; signed long size; memblock_reserve(__pa(_start), _end - _start); @@ -101,9 +100,6 @@ static void __init csky_memblock_init(void) memblock_set_current_limit(PFN_PHYS(max_low_pfn)); dma_contiguous_reserve(0); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } void __init setup_arch(char **cmdline_p) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index e2c9487d8d34..07086dbd33fd 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -66,20 +66,8 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -/* - * In order to set up page allocator "nodes", - * somebody has to call free_area_init() for UMA. 
- * - * In this mode, we only have one pg_data_t - * structure: contig_mem_data. - */ static void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ - /* * Set the init_mm descriptors "context" value to point to the * initial kernel segment table's physical address. diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index f41a648a3d9e..c33b3bcb733e 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -353,8 +353,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } -extern void paging_init(void); - #define pte_none(pte) (!(pte_val(pte) & ~_PAGE_GLOBAL)) #define pte_present(pte) (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_no_exec(pte) (pte_val(pte) & _PAGE_NO_EXEC) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 20cb6f306456..708ac025db71 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -621,8 +621,6 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); #endif - paging_init(); - #ifdef CONFIG_KASAN kasan_init(); #endif diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 17235f87eafb..c331bf69d2ec 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -68,14 +68,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - void __ref free_initmem(void) { free_initmem_default(POISON_FREE_INITMEM); diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 6b1d9d2434b5..53b71f786c27 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -69,13 +69,10 @@ void __init paging_init(void) * page_alloc get different views of the world. 
*/ unsigned long end_mem = memory_end & PAGE_MASK; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; high_memory = (void *) end_mem; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } #endif /* CONFIG_MMU */ diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 24a6f7bbd1ce..3418fd864237 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -39,7 +39,6 @@ void __init paging_init(void) pte_t *pg_table; unsigned long address, size; unsigned long next_pgtable; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; int i; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); @@ -73,8 +72,6 @@ void __init paging_init(void) } current->mm = NULL; - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word) diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index d6ccd23caf61..127a3fa69f4c 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -429,7 +429,6 @@ DECLARE_VM_GET_PAGE_PROT */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long min_addr, max_addr; unsigned long addr; int i; @@ -511,12 +510,9 @@ void __init paging_init(void) set_fc(USER_DATA); #ifdef DEBUG - printk ("before free_area_init\n"); + printk ("before node_set_state\n"); #endif for (i = 0; i < m68k_num_memory; i++) if (node_present_pages(i)) node_set_state(i, N_NORMAL_MEMORY); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index fdd69cc4240c..c801677f7df8 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -41,7 +41,6 @@ void __init paging_init(void) unsigned long address; unsigned long next_pgtable; unsigned long bootmem_end; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long size; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); @@ -80,14 +79,6 @@ void __init paging_init(void) mmu_emu_init(bootmem_end); current->mm = NULL; - - /* memory sizing is a hack stolen from motorola.c.. hope it works for us */ - arch_zone_limits_init(max_zone_pfn); - - /* I really wish I knew why the following change made things better... 
-- Sam */ - free_area_init(max_zone_pfn); - - } static const pgprot_t protection_map[16] = { diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 54da60b81094..848cdee1380c 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -69,22 +69,15 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ static void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; int idx; /* Setup fixmaps */ for (idx = 0; idx < __end_of_fixed_addresses; idx++) clear_fixmap(idx); - /* Clean every zones */ - memset(zones_size, 0, sizeof(zones_size)); - #ifdef CONFIG_HIGHMEM highmem_init(); #endif - arch_zone_limits_init(zones_size); - /* We don't have holes in memory map */ - free_area_init(zones_size); } void __init setup_memory(void) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index f72a58f87878..2cd95020df08 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -162,11 +162,7 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - pagetable_init(); - arch_zone_limits_init(zones_size); - free_area_init(zones_size); } /* All PCI device belongs to logical Node-0 */ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 269bf6335ac4..2575cba856d3 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -417,12 +417,7 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - pagetable_init(); - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } #ifdef CONFIG_64BIT diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index babeb0e07687..082651facf4f 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -413,9 +413,5 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - pagetable_init(); - arch_zone_limits_init(zones_size); - free_area_init(zones_size); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 2cb666a65d9e..6b22f1995c16 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -51,15 +51,9 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - pagetable_init(); pgd_current = swapper_pg_dir; - arch_zone_limits_init(max_zone_pfn); - /* pass the memory from the bootmem allocator to the main allocator */ - free_area_init(max_zone_pfn); - flush_dcache_range((unsigned long)empty_zero_page, (unsigned long)empty_zero_page + PAGE_SIZE); } diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 67de93e7a685..78fb0734cdbc 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -47,14 +47,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - extern const char _s_kernel_ro[], _e_kernel_ro[]; /* @@ -145,8 +137,6 @@ void __init paging_init(void) map_ram(); - zone_sizes_init(); - /* self modifying code ;) */ /* Since the old TLB miss handler has been running up until now, * the kernel pages are still all RW, so we can still modify the diff --git 
a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index dc5bd3efe738..ce6f09ab7a90 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -698,14 +698,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = PFN_DOWN(memblock_end_of_DRAM()); } -static void __init parisc_bootmem_free(void) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - void __init paging_init(void) { setup_bootmem(); @@ -716,7 +708,6 @@ void __init paging_init(void) flush_tlb_all_local(NULL); sparse_init(); - parisc_bootmem_free(); } static void alloc_btlb(unsigned long start, unsigned long end, int *slot, diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 03c05ec56041..b716c9cd141c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -237,7 +237,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 }; unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); int zone_dma_bits; @@ -269,9 +268,6 @@ void __init paging_init(void) zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); - mark_nonram_nosave(); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 97e8661fbcff..79b4792578c4 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -87,14 +87,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - #if defined(CONFIG_MMU) && defined(CONFIG_DEBUG_VM) #define LOG2_SZ_1K ilog2(SZ_1K) @@ -1443,7 +1435,6 @@ void __init misc_mem_init(void) /* The entire VMEMMAP region has been populated. 
Flush TLB for this region */ local_flush_tlb_kernel_range(VMEMMAP_START, VMEMMAP_END); #endif - zone_sizes_init(); arch_reserve_crashkernel(); memblock_dump_all(); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 1c11ad84dddb..9ec608b5cbb1 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -97,14 +97,9 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - vmem_map_init(); sparse_init(); zone_dma_limit = DMA_BIT_MASK(31); - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } void mark_rodata_ro(void) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 5e7e63642611..3edee854b755 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -271,7 +271,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; unsigned long vaddr, end; sh_mv.mv_mem_init(); @@ -325,10 +324,6 @@ void __init paging_init(void) page_table_range_init(vaddr, end, swapper_pg_dir); kmap_coherent_init(); - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } unsigned int mem_init_done = 0; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index fbaad449dfc9..931f872ce84a 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2459,17 +2459,6 @@ void __init paging_init(void) kernel_physical_mapping_init(); - { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - - max_zone_pfns[ZONE_NORMAL] = end_pfn; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); - } - printk("Booting Linux...\n"); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 81e90151db90..1b24c5e8d73d 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -970,13 +970,6 @@ void __init srmmu_paging_init(void) flush_tlb_all(); sparc_context_init(num_contexts); - - { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); - } } void mmu_info(struct seq_file *m) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 2ac4e9debedd..89c8c8b94a79 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -91,16 +91,11 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!empty_zero_page) panic("%s: Failed to allocate %lu bytes align=%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e7ef605a18d6..e52a262d3207 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1011,16 +1011,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) #endif } -void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a34fff6ab2b..b55172118c91 100644 --- a/arch/x86/mm/init_32.c +++ 
b/arch/x86/mm/init_32.c @@ -655,7 +655,6 @@ void __init paging_init(void) */ olpc_dt_build_devicetree(); sparse_init(); - zone_sizes_init(); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9983017ecbe0..4daa40071c9f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -843,8 +843,6 @@ void __init paging_init(void) */ node_clear_state(0, N_MEMORY); node_clear_state(0, N_NORMAL_MEMORY); - - zone_sizes_init(); } #define PAGE_UNUSED 0xFD diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 097aadc250f7..7c4a41235323 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -17,7 +17,6 @@ unsigned long kernel_physical_mapping_init(unsigned long start, unsigned long kernel_physical_mapping_change(unsigned long start, unsigned long end, unsigned long page_size_mask); -void zone_sizes_init(void); extern int after_bootmem; diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 60299f359a3c..fe83a68335da 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -126,10 +126,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init zones_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); print_vm_layout(); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 477339b7a032..aacabf8a0b58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -45,6 +45,7 @@ struct pt_regs; struct folio_batch; void arch_mm_preinit(void); +void mm_core_init_early(void); void mm_core_init(void); void init_mm_internals(void); @@ -3540,7 +3541,7 @@ static inline unsigned long get_num_physpages(void) } /* - * Using memblock node mappings, an architecture may initialise its + * FIXME: Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * @@ -3555,7 +3556,6 @@ static inline unsigned long get_num_physpages(void) * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ -void free_area_init(unsigned long *max_zone_pfn); void arch_zone_limits_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, diff --git a/init/main.c b/init/main.c index b84818ad9685..445b5643ecec 100644 --- a/init/main.c +++ b/init/main.c @@ -1025,6 +1025,7 @@ void start_kernel(void) page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); + mm_core_init_early(); /* Static keys and static calls are needed by LSMs */ jump_label_init(); static_call_init(); diff --git a/mm/mm_init.c b/mm/mm_init.c index 0927bedb1254..6fb4415c0d1c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1807,7 +1807,6 @@ static void __init set_high_memory(void) /** * free_area_init - Initialise all pg_data_t and zone data - * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. * Using the page ranges provided by memblock_set_node(), the size of each @@ -1818,17 +1817,14 @@ static void __init set_high_memory(void) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. 
*/ -void __init free_area_init(unsigned long *max_zone_pfn) +static void __init free_area_init(void) { + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; unsigned long start_pfn, end_pfn; int i, nid, zone; bool descending; - /* Record where the zone boundaries are */ - memset(arch_zone_lowest_possible_pfn, 0, - sizeof(arch_zone_lowest_possible_pfn)); - memset(arch_zone_highest_possible_pfn, 0, - sizeof(arch_zone_highest_possible_pfn)); + arch_zone_limits_init(max_zone_pfn); start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); @@ -2678,13 +2674,19 @@ void __init __weak mem_init(void) { } +void __init mm_core_init_early(void) +{ + hugetlb_bootmem_alloc(); + + free_area_init(); +} + /* * Set up kernel memory allocators */ void __init mm_core_init(void) { arch_mm_preinit(); - hugetlb_bootmem_alloc(); /* Initializations relying on SMP setup */ BUILD_BUG_ON(MAX_ZONELISTS > 2); -- cgit v1.2.3 From 4267739cabb82da75780c4699fe8208821929944 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:58 +0200 Subject: arch, mm: consolidate initialization of SPARSE memory model Every architecture calls sparse_init() during setup_arch() although the data structures created by sparse_init() are not used until the initialization of the core MM. Beside the code duplication, calling sparse_init() from architecture specific code causes ordering differences of vmemmap and HVO initialization on different architectures. Move the call to sparse_init() from architecture specific code to free_area_init() to ensure that vmemmap and HVO initialization order is always the same. Link: https://lkml.kernel.org/r/20260111082105.290734-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/mm/memory-model.rst | 3 --- Documentation/translations/zh_CN/mm/memory-model.rst | 2 -- arch/alpha/kernel/setup.c | 1 - arch/arm/mm/init.c | 6 ------ arch/arm64/mm/init.c | 6 ------ arch/csky/kernel/setup.c | 2 -- arch/loongarch/kernel/setup.c | 8 -------- arch/mips/kernel/setup.c | 11 ----------- arch/parisc/mm/init.c | 2 -- arch/powerpc/include/asm/setup.h | 4 ++++ arch/powerpc/mm/mem.c | 5 ----- arch/powerpc/mm/numa.c | 2 -- arch/riscv/mm/init.c | 1 - arch/s390/mm/init.c | 1 - arch/sh/mm/init.c | 2 -- arch/sparc/mm/init_64.c | 2 -- arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 -- include/linux/mmzone.h | 2 -- mm/internal.h | 6 ++++++ mm/mm_init.c | 1 + 21 files changed, 11 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst index 7957122039e8..199b11328f4f 100644 --- a/Documentation/mm/memory-model.rst +++ b/Documentation/mm/memory-model.rst @@ -97,9 +97,6 @@ sections: `mem_section` objects and the number of rows is calculated to fit all the memory sections. -The architecture setup code should call sparse_init() to -initialize the memory sections and the memory maps. - With SPARSEMEM there are two possible ways to convert a PFN to the corresponding `struct page` - a "classic sparse" and "sparse vmemmap". The selection is made at build time and it is determined by diff --git a/Documentation/translations/zh_CN/mm/memory-model.rst b/Documentation/translations/zh_CN/mm/memory-model.rst index 77ec149a970c..c0c5d8ecd880 100644 --- a/Documentation/translations/zh_CN/mm/memory-model.rst +++ b/Documentation/translations/zh_CN/mm/memory-model.rst @@ -83,8 +83,6 @@ SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用me 每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象,行数的计算是为了适应所有的 内存区。 -架构设置代码应该调用sparse_init()来初始化内存区和内存映射。 - 通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和 "sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的 值决定。 diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index bebdffafaee8..f0af444a69a4 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -607,7 +607,6 @@ setup_arch(char **cmdline_p) /* Find our memory. */ setup_memory(kernel_end); memblock_set_bottom_up(true); - sparse_init(); /* First guess at cpu cache sizes. Do this before init_arch. 
*/ determine_cpu_caches(cpu->type); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index a8f7b4084715..0cc1bf04686d 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -207,12 +207,6 @@ void __init bootmem_init(void) early_memtest((phys_addr_t)min_low_pfn << PAGE_SHIFT, (phys_addr_t)max_low_pfn << PAGE_SHIFT); - - /* - * sparse_init() tries to allocate memory from memblock, so must be - * done after the fixed reservations - */ - sparse_init(); } /* diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 3641e88ea871..9d271aff7652 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -321,12 +321,6 @@ void __init bootmem_init(void) #endif kvm_hyp_reserve(); - - /* - * sparse_init() tries to allocate memory from memblock, so must be - * done after the fixed reservations - */ - sparse_init(); dma_limits_init(); /* diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 4bf3c01ead3a..45c98dcf7f50 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -123,8 +123,6 @@ void __init setup_arch(char **cmdline_p) setup_smp(); #endif - sparse_init(); - fixaddr_init(); #ifdef CONFIG_HIGHMEM diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 708ac025db71..d6a1ff0e16f1 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -402,14 +402,6 @@ static void __init arch_mem_init(char **cmdline_p) check_kernel_sections_mem(); - /* - * In order to reduce the possibility of kernel panic when failed to - * get IO TLB memory under CONFIG_SWIOTLB, it is better to allocate - * low memory as small as possible before swiotlb_init(), so make - * sparse_init() using top-down allocation. - */ - memblock_set_bottom_up(false); - sparse_init(); memblock_set_bottom_up(true); swiotlb_init(true, SWIOTLB_VERBOSE); diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 11b9b6b63e19..d36d89d01fa4 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -614,7 +614,6 @@ static void __init bootcmdline_init(void) * kernel but generic memory management system is still entirely uninitialized. * * o bootmem_init() - * o sparse_init() * o paging_init() * o dma_contiguous_reserve() * @@ -665,16 +664,6 @@ static void __init arch_mem_init(char **cmdline_p) mips_parse_crashkernel(); device_tree_init(); - /* - * In order to reduce the possibility of kernel panic when failed to - * get IO TLB memory under CONFIG_SWIOTLB, it is better to allocate - * low memory as small as possible before plat_swiotlb_setup(), so - * make sparse_init() using top-down allocation. 
- */ - memblock_set_bottom_up(false); - sparse_init(); - memblock_set_bottom_up(true); - plat_swiotlb_setup(); dma_contiguous_reserve(PFN_PHYS(max_low_pfn)); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index ce6f09ab7a90..6a39e031e5ff 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -706,8 +706,6 @@ void __init paging_init(void) fixmap_init(); flush_cache_all_local(); /* start with known state */ flush_tlb_all_local(NULL); - - sparse_init(); } static void alloc_btlb(unsigned long start, unsigned long end, int *slot, diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 50a92b24628d..6d60ea4868ab 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -20,7 +20,11 @@ extern void reloc_got2(unsigned long); void check_for_initrd(void); void mem_topology_setup(void); +#ifdef CONFIG_NUMA void initmem_init(void); +#else +static inline void initmem_init(void) {} +#endif void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index b716c9cd141c..3789a51bdaae 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -182,11 +182,6 @@ void __init mem_topology_setup(void) memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); } -void __init initmem_init(void) -{ - sparse_init(); -} - /* mark pages that don't exist as nosave */ static int __init mark_nonram_nosave(void) { diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 603a0f652ba6..f4cf3ae036de 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1213,8 +1213,6 @@ void __init initmem_init(void) setup_node_data(nid, start_pfn, end_pfn); } - sparse_init(); - /* * We need the numa_cpu_lookup_table to be accurate for all CPUs, * even before we online them, so that we can use cpu_to_{node,mem} diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 79b4792578c4..11ac4041afc0 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1430,7 +1430,6 @@ void __init misc_mem_init(void) { early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); arch_numa_init(); - sparse_init(); #ifdef CONFIG_SPARSEMEM_VMEMMAP /* The entire VMEMMAP region has been populated. Flush TLB for this region */ local_flush_tlb_kernel_range(VMEMMAP_START, VMEMMAP_END); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 9ec608b5cbb1..3c20475cbee2 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -98,7 +98,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { vmem_map_init(); - sparse_init(); zone_dma_limit = DMA_BIT_MASK(31); } diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3edee854b755..464a3a63e2fa 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -227,8 +227,6 @@ static void __init do_init_bootmem(void) node_set_online(0); plat_mem_setup(); - - sparse_init(); } static void __init early_reserve_mem(void) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 931f872ce84a..4f7bdb18774b 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1615,8 +1615,6 @@ static unsigned long __init bootmem_init(unsigned long phys_base) /* XXX cpu notifier XXX */ - sparse_init(); - return end_pfn; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b55172118c91..0908c44d51e6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -654,7 +654,6 @@ void __init paging_init(void) * NOTE: at this point the bootmem allocator is fully available. 
*/ olpc_dt_build_devicetree(); - sparse_init(); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4daa40071c9f..df2261fa4f98 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -833,8 +833,6 @@ void __init initmem_init(void) void __init paging_init(void) { - sparse_init(); - /* * clear the default setting with node 0 * note: don't use nodes_clear here, that is really clearing when diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc5d6c88d2f0..eb3815fc94ad 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2286,9 +2286,7 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define pfn_to_nid(pfn) (0) #endif -void sparse_init(void); #else -#define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) diff --git a/mm/internal.h b/mm/internal.h index 9ee336aa0365..ecb6020cf313 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -852,6 +852,12 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int, bool); +#ifdef CONFIG_SPARSEMEM +void sparse_init(void); +#else +static inline void sparse_init(void) {} +#endif /* CONFIG_SPARSEMEM */ + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 6fb4415c0d1c..31246fe5c361 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1825,6 +1825,7 @@ static void __init free_area_init(void) bool descending; arch_zone_limits_init(max_zone_pfn); + sparse_init(); start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); -- cgit v1.2.3 From 9fac145b6d3fe570277438f8d860eabf229dc545 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:21:01 +0200 Subject: mm, arch: consolidate hugetlb CMA reservation Every architecture that supports hugetlb_cma command line parameter reserves CMA areas for hugetlb during setup_arch(). This obfuscates the ordering of hugetlb CMA initialization with respect to the rest initialization of the core MM. Introduce arch_hugetlb_cma_order() callback to allow architectures report the desired order-per-bit of CMA areas and provide a week implementation of arch_hugetlb_cma_order() for architectures that don't support hugetlb with CMA. Use this callback in hugetlb_cma_reserve() instead if passing the order as parameter and call hugetlb_cma_reserve() from mm_core_init_early() rather than have it spread over architecture specific code. Link: https://lkml.kernel.org/r/20260111082105.290734-28-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/driver-api/cxl/linux/early-boot.rst | 2 +- arch/arm64/include/asm/hugetlb.h | 2 -- arch/arm64/mm/hugetlbpage.c | 10 +++------- arch/arm64/mm/init.c | 9 --------- arch/powerpc/include/asm/hugetlb.h | 5 ----- arch/powerpc/kernel/setup-common.c | 1 - arch/powerpc/mm/hugetlbpage.c | 11 ++++------- arch/riscv/mm/hugetlbpage.c | 8 ++++++++ arch/riscv/mm/init.c | 2 -- arch/s390/kernel/setup.c | 2 -- arch/s390/mm/hugetlbpage.c | 8 ++++++++ arch/x86/kernel/setup.c | 4 ---- arch/x86/mm/hugetlbpage.c | 8 ++++++++ include/linux/hugetlb.h | 6 ++++-- mm/hugetlb_cma.c | 19 ++++++++++++++----- mm/mm_init.c | 1 + 16 files changed, 51 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/cxl/linux/early-boot.rst b/Documentation/driver-api/cxl/linux/early-boot.rst index a7fc6fc85fbe..414481f33819 100644 --- a/Documentation/driver-api/cxl/linux/early-boot.rst +++ b/Documentation/driver-api/cxl/linux/early-boot.rst @@ -125,7 +125,7 @@ The contiguous memory allocator (CMA) enables reservation of contiguous memory regions on NUMA nodes during early boot. However, CMA cannot reserve memory on NUMA nodes that are not online during early boot. :: - void __init hugetlb_cma_reserve(int order) { + void __init hugetlb_cma_reserve(void) { if (!node_online(nid)) /* do not allow reservations */ } diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 44c1f757bfcf..e6f8ff3cc630 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -56,8 +56,6 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void __init arm64_hugetlb_cma_reserve(void); - #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 1d90a7e75333..f8dd58ab67a8 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -36,16 +36,12 @@ * huge pages could still be served from those areas. 
*/ #ifdef CONFIG_CMA -void __init arm64_hugetlb_cma_reserve(void) +unsigned int arch_hugetlb_cma_order(void) { - int order; - if (pud_sect_supported()) - order = PUD_SHIFT - PAGE_SHIFT; - else - order = CONT_PMD_SHIFT - PAGE_SHIFT; + return PUD_SHIFT - PAGE_SHIFT; - hugetlb_cma_reserve(order); + return CONT_PMD_SHIFT - PAGE_SHIFT; } #endif /* CONFIG_CMA */ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9d271aff7652..96711b8578fd 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -311,15 +311,6 @@ void __init bootmem_init(void) arch_numa_init(); - /* - * must be done after arch_numa_init() which calls numa_init() to - * initialize node_online_map that gets used in hugetlb_cma_reserve() - * while allocating required CMA size across online nodes. - */ -#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) - arm64_hugetlb_cma_reserve(); -#endif - kvm_hyp_reserve(); dma_limits_init(); diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 86326587e58d..6d32a4299445 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -68,7 +68,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -void gigantic_hugetlb_cma_reserve(void) __init; #include #else /* ! CONFIG_HUGETLB_PAGE */ @@ -77,10 +76,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, { } -static inline void __init gigantic_hugetlb_cma_reserve(void) -{ -} - static inline void __init hugetlbpage_init_defaultsize(void) { } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index c8c42b419742..cb5b73adc250 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -1003,7 +1003,6 @@ void __init setup_arch(char **cmdline_p) fadump_cma_init(); kdump_cma_reserve(); kvm_cma_reserve(); - gigantic_hugetlb_cma_reserve(); early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d3c1b749dcfc..558fafb82b8a 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -200,18 +200,15 @@ static int __init hugetlbpage_init(void) arch_initcall(hugetlbpage_init); -void __init gigantic_hugetlb_cma_reserve(void) +unsigned int __init arch_hugetlb_cma_order(void) { - unsigned long order = 0; - if (radix_enabled()) - order = PUD_SHIFT - PAGE_SHIFT; + return PUD_SHIFT - PAGE_SHIFT; else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift) /* * For pseries we do use ibm,expected#pages for reserving 16G pages. 
*/ - order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; + return mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; - if (order) - hugetlb_cma_reserve(order); + return 0; } diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 375dd96bb4a0..a6d217112cf4 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -447,3 +447,11 @@ static __init int gigantic_pages_init(void) } arch_initcall(gigantic_pages_init); #endif + +unsigned int __init arch_hugetlb_cma_order(void) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return PUD_SHIFT - PAGE_SHIFT; + + return 0; +} diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 11ac4041afc0..848efeb9e163 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -311,8 +311,6 @@ static void __init setup_bootmem(void) memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); dma_contiguous_reserve(dma32_phys_limit); - if (IS_ENABLED(CONFIG_64BIT)) - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); } #ifdef CONFIG_RELOCATABLE diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index c1fe0b53c5ac..b60284328fe3 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -963,8 +963,6 @@ void __init setup_arch(char **cmdline_p) setup_uv(); dma_contiguous_reserve(ident_map_size); vmcp_cma_reserve(); - if (cpu_has_edat2()) - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d42e61c7594e..d93417d1e53c 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -255,3 +255,11 @@ bool __init arch_hugetlb_valid_size(unsigned long size) else return false; } + +unsigned int __init arch_hugetlb_cma_order(void) +{ + if (cpu_has_edat2()) + return PUD_SHIFT - PAGE_SHIFT; + + return 0; +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e2318fa9b1bb..e1efe3975aa0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1189,10 +1189,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(); dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); - if (boot_cpu_has(X86_FEATURE_GBPAGES)) { - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - } - /* * Reserve memory for crash kernel after SRAT is parsed so that it * won't consume hotpluggable memory. 
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 58f7f2bd535d..3b26621c9128 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -42,3 +42,11 @@ static __init int gigantic_pages_init(void) arch_initcall(gigantic_pages_init); #endif #endif + +unsigned int __init arch_hugetlb_cma_order(void) +{ + if (boot_cpu_has(X86_FEATURE_GBPAGES)) + return PUD_SHIFT - PAGE_SHIFT; + + return 0; +} diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 694f6e83c637..00e6a73e7bba 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -281,6 +281,8 @@ void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); +unsigned int arch_hugetlb_cma_order(void); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) @@ -1322,9 +1324,9 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h, } #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) -extern void __init hugetlb_cma_reserve(int order); +extern void __init hugetlb_cma_reserve(void); #else -static inline __init void hugetlb_cma_reserve(int order) +static inline __init void hugetlb_cma_reserve(void) { } #endif diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index e8e4dc7182d5..b1eb5998282c 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -134,12 +134,24 @@ static int __init cmdline_parse_hugetlb_cma_only(char *p) early_param("hugetlb_cma_only", cmdline_parse_hugetlb_cma_only); -void __init hugetlb_cma_reserve(int order) +unsigned int __weak arch_hugetlb_cma_order(void) { - unsigned long size, reserved, per_node; + return 0; +} + +void __init hugetlb_cma_reserve(void) +{ + unsigned long size, reserved, per_node, order; bool node_specific_cma_alloc = false; int nid; + if (!hugetlb_cma_size) + return; + + order = arch_hugetlb_cma_order(); + if (!order) + return; + /* * HugeTLB CMA reservation is required for gigantic * huge pages which could not be allocated via the @@ -149,9 +161,6 @@ void __init hugetlb_cma_reserve(int order) VM_WARN_ON(order <= MAX_PAGE_ORDER); cma_reserve_called = true; - if (!hugetlb_cma_size) - return; - hugetlb_bootmem_set_nodes(); for (nid = 0; nid < MAX_NUMNODES; nid++) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 31246fe5c361..0cfbdef91d72 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2677,6 +2677,7 @@ void __init __weak mem_init(void) void __init mm_core_init_early(void) { + hugetlb_cma_reserve(); hugetlb_bootmem_alloc(); free_area_init(); -- cgit v1.2.3 From 743758ccf8bede3e7c38f3f7d3f5131aa0a7b4a6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:21:03 +0200 Subject: Revert "mm/hugetlb: deal with multiple calls to hugetlb_bootmem_alloc" This reverts commit d58b2498200724e4f8c12d71a5953da03c8c8bdf. hugetlb_bootmem_alloc() is called only once, no need to check if it was called already at its entry. Other checks performed during HVO initialization are also no longer necessary because sparse_init() that calls hugetlb_vmemmap_init_early() and hugetlb_vmemmap_init_late() is always called after hugetlb_bootmem_alloc(). Link: https://lkml.kernel.org/r/20260111082105.290734-30-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Muchun Song Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ------ mm/hugetlb.c | 12 ------------ mm/hugetlb_vmemmap.c | 11 ----------- 3 files changed, 29 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 00e6a73e7bba..94a03591990c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -176,7 +176,6 @@ extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); -bool hugetlb_bootmem_allocated(void); extern nodemask_t hugetlb_bootmem_nodes; void hugetlb_bootmem_set_nodes(void); @@ -1306,11 +1305,6 @@ static inline bool hugetlbfs_pagecache_present( static inline void hugetlb_bootmem_alloc(void) { } - -static inline bool hugetlb_bootmem_allocated(void) -{ - return false; -} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fe4b9f2ebdb6..04385a0122de 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4486,21 +4486,11 @@ void __init hugetlb_bootmem_set_nodes(void) } } -static bool __hugetlb_bootmem_allocated __initdata; - -bool __init hugetlb_bootmem_allocated(void) -{ - return __hugetlb_bootmem_allocated; -} - void __init hugetlb_bootmem_alloc(void) { struct hstate *h; int i; - if (__hugetlb_bootmem_allocated) - return; - hugetlb_bootmem_set_nodes(); for (i = 0; i < MAX_NUMNODES; i++) @@ -4514,8 +4504,6 @@ void __init hugetlb_bootmem_alloc(void) if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } - - __hugetlb_bootmem_allocated = true; } /* diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9d01f883fd71..a9280259e12a 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -794,14 +794,6 @@ void __init hugetlb_vmemmap_init_early(int nid) struct huge_bootmem_page *m = NULL; void *map; - /* - * Noting to do if bootmem pages were not allocated - * early in boot, or if HVO wasn't enabled in the - * first place. - */ - if (!hugetlb_bootmem_allocated()) - return; - if (!READ_ONCE(vmemmap_optimize_enabled)) return; @@ -847,9 +839,6 @@ void __init hugetlb_vmemmap_init_late(int nid) struct hstate *h; void *map; - if (!hugetlb_bootmem_allocated()) - return; - if (!READ_ONCE(vmemmap_optimize_enabled)) return; -- cgit v1.2.3 From 53eb797ffc3abe30418b19777922b55fb339fc1f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:41 +0000 Subject: mm/rmap: remove anon_vma_merge() function This function is confusing, we already have the concept of anon_vma merge to adjacent VMA's anon_vma's to increase probability of anon_vma compatibility and therefore VMA merge (see is_mergeable_anon_vma() etc.), as well as anon_vma reuse, along side the usual VMA merge logic. 
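The helper being removed is small enough to reproduce here; the copy below (simplified from the diff further down) together with its only caller in mm/vma.c makes the redundancy argument in the next paragraph easier to follow:

	/* include/linux/rmap.h -- removed by this patch */
	static inline void anon_vma_merge(struct vm_area_struct *vma,
					  struct vm_area_struct *next)
	{
		VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
		unlink_anon_vmas(next);
	}

	/* mm/vma.c -- the call site, after this patch */
	if (vp->remove->anon_vma)
		unlink_anon_vmas(vp->remove);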
We can remove the anon_vma check as it is redundant - a merge would not have been permitted with removal if the anon_vma's were not the same (and in the case of an unfaulted/faulted merge, we would have already set the unfaulted VMA's anon_vma to vp->remove->anon_vma in dup_anon_vma()). Avoid overloading this term when we're very simply unlinking anon_vma state from a removed VMA upon merge. Link: https://lkml.kernel.org/r/56bbe45e309f7af197b1c4f94a9a0c8931ff2d29.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ------- mm/vma.c | 2 +- tools/testing/vma/vma_internal.h | 5 ----- 3 files changed, 1 insertion(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index daa92a58585d..832bfc0ccfc6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -165,13 +165,6 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma) return __anon_vma_prepare(vma); } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ - VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); - unlink_anon_vmas(next); -} - struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID diff --git a/mm/vma.c b/mm/vma.c index f81a5cfcd7cc..6c458c8656b8 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -381,7 +381,7 @@ again: fput(vp->file); } if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); + unlink_anon_vmas(vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 9f0a9f5ed0fe..93e5792306d9 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1265,11 +1265,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) { } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ -} - static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From 7549e3d20f1aa9a0b8c77f83144dde54ed6ab4fe Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:42 +0000 Subject: mm/rmap: make anon_vma functions internal The bulk of the anon_vma operations are only used by mm, so formalise this by putting the function prototypes and inlines in mm/internal.h. This allows us to make changes without having to worry about the rest of the kernel. Link: https://lkml.kernel.org/r/79ec933c3a9c8bf1f64dab253bbfdae8a01cb921.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 60 ---------------------------------------------------- mm/internal.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 832bfc0ccfc6..dd764951b03d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -104,68 +104,8 @@ enum ttu_flags { }; #ifdef CONFIG_MMU -static inline void get_anon_vma(struct anon_vma *anon_vma) -{ - atomic_inc(&anon_vma->refcount); -} - -void __put_anon_vma(struct anon_vma *anon_vma); - -static inline void put_anon_vma(struct anon_vma *anon_vma) -{ - if (atomic_dec_and_test(&anon_vma->refcount)) - __put_anon_vma(anon_vma); -} - -static inline void anon_vma_lock_write(struct anon_vma *anon_vma) -{ - down_write(&anon_vma->root->rwsem); -} -static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) -{ - return down_write_trylock(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) -{ - up_write(&anon_vma->root->rwsem); -} - -static inline void anon_vma_lock_read(struct anon_vma *anon_vma) -{ - down_read(&anon_vma->root->rwsem); -} - -static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) -{ - return down_read_trylock(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) -{ - up_read(&anon_vma->root->rwsem); -} - - -/* - * anon_vma helper functions. - */ void anon_vma_init(void); /* create anon_vma_cachep */ -int __anon_vma_prepare(struct vm_area_struct *); -void unlink_anon_vmas(struct vm_area_struct *); -int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); -int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); - -static inline int anon_vma_prepare(struct vm_area_struct *vma) -{ - if (likely(vma->anon_vma)) - return 0; - - return __anon_vma_prepare(vma); -} - -struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID static __always_inline void folio_lock_large_mapcount(struct folio *folio) diff --git a/mm/internal.h b/mm/internal.h index ecb6020cf313..aac4ec53fe15 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -199,6 +199,64 @@ static inline void vma_close(struct vm_area_struct *vma) #ifdef CONFIG_MMU +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->refcount); +} + +void __put_anon_vma(struct anon_vma *anon_vma); + +static inline void put_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_test(&anon_vma->refcount)) + __put_anon_vma(anon_vma); +} + +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) +{ + down_write(&anon_vma->root->rwsem); +} + +static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) +{ + return down_write_trylock(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) +{ + up_write(&anon_vma->root->rwsem); +} + +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ + down_read(&anon_vma->root->rwsem); +} + +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ + up_read(&anon_vma->root->rwsem); +} + +struct anon_vma 
*folio_get_anon_vma(const struct folio *folio); + +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src); +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); +int __anon_vma_prepare(struct vm_area_struct *vma); +void unlink_anon_vmas(struct vm_area_struct *vma); + +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; -- cgit v1.2.3 From 85f03a86318c4172bfda4484cdf588ebab5fa410 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:43 +0000 Subject: mm/mmap_lock: add vma_is_attached() helper This makes it easy to explicitly check for VMA detachment, which is useful for things like asserts. Note that we intentionally do not allow this function to be available should CONFIG_PER_VMA_LOCK be set - this is because vma_assert_attached() and vma_assert_detached() are no-ops if !CONFIG_PER_VMA_LOCK, so there is no correct state for vma_is_attached() to be in if this configuration option is not specified. Therefore users elsewhere must invoke this function only after checking for CONFIG_PER_VMA_LOCK. We rework the assert functions to utilise this. Link: https://lkml.kernel.org/r/0172d3bf527ca54ba27d8bce8f8476095b241ac7.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index d53f72dba7fe..b50416fbba20 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -251,6 +251,11 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) !__is_vma_write_locked(vma, &mm_lock_seq), vma); } +static inline bool vma_is_attached(struct vm_area_struct *vma) +{ + return refcount_read(&vma->vm_refcnt); +} + /* * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these * assertions should be made either under mmap_write_lock or when the object @@ -258,12 +263,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); + WARN_ON_ONCE(!vma_is_attached(vma)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); + WARN_ON_ONCE(vma_is_attached(vma)); } static inline void vma_mark_attached(struct vm_area_struct *vma) -- cgit v1.2.3 From 53a9b4646f67c95df1775aa5f381cb7f42cae957 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 6 Jan 2026 12:52:37 +0100 Subject: mm/page_alloc: refactor the initial compaction handling The initial direct compaction done in some cases in __alloc_pages_slowpath() stands out from the main retry loop of reclaim + compaction. We can simplify this by instead skipping the initial reclaim attempt via a new local variable compact_first, and handle the compact_prority as necessary to match the original behavior. No functional change intended. 
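In outline, the reshaped slowpath behaves as below (heavily condensed from the mm/page_alloc.c hunks that follow; the "if (page) goto got_pg" checks and the rest of the retry machinery are collapsed into null checks for brevity):

	bool compact_first = false;

	/* costly or non-movable high-order requests try compaction before reclaim */
	if (can_compact && (costly_order ||
	    (order > 0 && ac->migratetype != MIGRATE_MOVABLE))) {
		compact_first = true;
		compact_priority = INIT_COMPACT_PRIORITY;
	}
retry:
	if (!compact_first)
		page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags,
						    ac, &did_some_progress);
	if (!page)
		page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
						    ac, compact_priority, &compact_result);
	if (!page && compact_first) {
		/* THP __GFP_THISNODE faults may compact but must not reclaim */
		if (gfp_has_flags(gfp_mask, __GFP_NORETRY | __GFP_THISNODE))
			goto nopage;
		/* restore priority for the regular reclaim+compaction retries */
		if (!(gfp_mask & __GFP_NORETRY))
			compact_priority = DEF_COMPACT_PRIORITY;
		compact_first = false;
		goto retry;
	}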
Link: https://lkml.kernel.org/r/20260106-thp-thisnode-tweak-v3-2-f5d67c21a193@suse.cz Signed-off-by: Vlastimil Babka Suggested-by: Johannes Weiner Reviewed-by: Joshua Hahn Acked-by: Michal Hocko Cc: Brendan Jackman Cc: David Hildenbrand (Red Hat) Cc: David Rientjes Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++++- mm/page_alloc.c | 100 +++++++++++++++++++++++++--------------------------- 2 files changed, 55 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b155929af5b1..f9fdc99ae594 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -407,9 +407,15 @@ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); +/* A helper for checking if gfp includes all the specified flags */ +static inline bool gfp_has_flags(gfp_t gfp, gfp_t flags) +{ + return (gfp & flags) == flags; +} + static inline bool gfp_has_io_fs(gfp_t gfp) { - return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); + return gfp_has_flags(gfp, __GFP_IO | __GFP_FS); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e6d2e61374a..848c5c93ccb5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4694,7 +4694,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; - bool can_compact = gfp_compaction_allowed(gfp_mask); + bool can_compact = can_direct_reclaim && gfp_compaction_allowed(gfp_mask); bool nofail = gfp_mask & __GFP_NOFAIL; const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; @@ -4707,6 +4707,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool compact_first = false; if (unlikely(nofail)) { /* @@ -4730,6 +4731,19 @@ restart: cpuset_mems_cookie = read_mems_allowed_begin(); zonelist_iter_cookie = zonelist_iter_begin(); + /* + * For costly allocations, try direct compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + */ + if (can_compact && (costly_order || (order > 0 && + ac->migratetype != MIGRATE_MOVABLE))) { + compact_first = true; + compact_priority = INIT_COMPACT_PRIORITY; + } + /* * The fast path uses conservative alloc_flags to succeed only until * kswapd needs to be woken up, and to avoid the cost of setting up @@ -4772,53 +4786,6 @@ restart: if (page) goto got_pg; - /* - * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. For non- - * movable high-order allocations, do that as well, as compaction will - * try prevent permanent fragmentation by migrating from blocks of the - * same migratetype. - * Don't try this for allocations that are allowed to ignore - * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 
- */ - if (can_direct_reclaim && can_compact && - (costly_order || - (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) - && !gfp_pfmemalloc_allowed(gfp_mask)) { - page = __alloc_pages_direct_compact(gfp_mask, order, - alloc_flags, ac, - INIT_COMPACT_PRIORITY, - &compact_result); - if (page) - goto got_pg; - - /* - * Checks for costly allocations with __GFP_NORETRY, which - * includes some THP page fault allocations - */ - if (costly_order && (gfp_mask & __GFP_NORETRY)) { - /* - * THP page faults may attempt local node only first, - * but are then allowed to only compact, not reclaim, - * see alloc_pages_mpol(). - * - * Compaction has failed above and we don't want such - * THP allocations to put reclaim pressure on a single - * node in a situation where other nodes might have - * plenty of available memory. - */ - if (gfp_mask & __GFP_THISNODE) - goto nopage; - - /* - * Proceed with single round of reclaim/compaction, but - * since sync compaction could be very expensive, keep - * using async compaction. - */ - compact_priority = INIT_COMPACT_PRIORITY; - } - } - retry: /* * Deal with possible cpuset update races or zonelist updates to avoid @@ -4862,10 +4829,12 @@ retry: goto nopage; /* Try direct reclaim and then allocating */ - page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, - &did_some_progress); - if (page) - goto got_pg; + if (!compact_first) { + page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, + ac, &did_some_progress); + if (page) + goto got_pg; + } /* Try direct compaction and then allocating */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, @@ -4873,6 +4842,33 @@ retry: if (page) goto got_pg; + if (compact_first) { + /* + * THP page faults may attempt local node only first, but are + * then allowed to only compact, not reclaim, see + * alloc_pages_mpol(). + * + * Compaction has failed above and we don't want such THP + * allocations to put reclaim pressure on a single node in a + * situation where other nodes might have plenty of available + * memory. + */ + if (gfp_has_flags(gfp_mask, __GFP_NORETRY | __GFP_THISNODE)) + goto nopage; + + /* + * For the initial compaction attempt we have lowered its + * priority. Restore it for further retries, if those are + * allowed. With __GFP_NORETRY there will be a single round of + * reclaim and compaction with the lowered priority. + */ + if (!(gfp_mask & __GFP_NORETRY)) + compact_priority = DEF_COMPACT_PRIORITY; + + compact_first = false; + goto retry; + } + /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) goto nopage; -- cgit v1.2.3 From e77786b4682e69336e3de3eaeb12ec994027f611 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:09 -0800 Subject: memcg: introduce private id API for in-kernel users Patch series "memcg: separate private and public ID namespaces". The memory cgroup subsystem maintains a private ID infrastructure that is decoupled from the cgroup IDs. This private ID system exists because some kernel objects (like swap entries and shadow entries in the workingset code) can outlive the cgroup they were associated with. The motivation is best described in commit 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs"). Unfortunately, some in-kernel users (DAMON, LRU gen debugfs interface, shrinker debugfs) started exposing these private IDs to userspace. This is problematic because: 1. The private IDs are internal implementation details that could change 2. 
Userspace already has access to cgroup IDs through the cgroup filesystem 3. Using different ID namespaces in different interfaces is confusing This series cleans up the memcg ID infrastructure by: 1. Explicitly marking the private ID APIs with "private" in their names to make it clear they are for internal use only (swap/workingset) 2. Making the public cgroup ID APIs (mem_cgroup_id/mem_cgroup_get_from_id) unconditionally available 3. Converting DAMON, LRU gen, and shrinker debugfs interfaces to use the public cgroup IDs instead of the private IDs 4. Removing the now-unused wrapper functions and renaming the public APIs for clarity After this series: - mem_cgroup_private_id() / mem_cgroup_from_private_id() are used for internal kernel objects that outlive their cgroup (swap, workingset) - mem_cgroup_id() / mem_cgroup_get_from_id() return the public cgroup ID (from cgroup_id()) for use in userspace-facing interfaces This patch (of 8): The memory cgroup maintains a private ID infrastructure decoupled from the cgroup IDs for swapout records and shadow entries. The main motivation of this private ID infra is best described in the commit 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs"). Unfortunately some users have started exposing these private IDs to the userspace where they should have used the cgroup IDs which are already exposed to the userspace. Let's rename the memcg ID APIs to explicitly mark them private. No functional change is intended. Link: https://lkml.kernel.org/r/20251225232116.294540-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20251225232116.294540-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 24 +++++++++++++++++--- mm/list_lru.c | 2 +- mm/memcontrol-v1.c | 6 ++--- mm/memcontrol-v1.h | 4 ++-- mm/memcontrol.c | 55 +++++++++++++++++++++++++--------------------- mm/workingset.c | 8 +++---- 6 files changed, 61 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fd400082313a..1c4224bcfb23 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -65,7 +65,7 @@ struct mem_cgroup_reclaim_cookie { #define MEM_CGROUP_ID_SHIFT 16 -struct mem_cgroup_id { +struct mem_cgroup_private_id { int id; refcount_t ref; }; @@ -191,7 +191,7 @@ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. 
Used to ID objects that outlive the cgroup */ - struct mem_cgroup_id id; + struct mem_cgroup_private_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ @@ -821,13 +821,19 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } +struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id); + +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + return mem_cgroup_private_id(memcg); +} struct mem_cgroup *mem_cgroup_from_id(unsigned short id); #ifdef CONFIG_SHRINKER_DEBUG @@ -1290,6 +1296,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) +{ + return 0; +} + +static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) +{ + WARN_ON_ONCE(id); + /* XXX: This should always return root_mem_cgroup */ + return NULL; +} + #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { diff --git a/mm/list_lru.c b/mm/list_lru.c index 37b642f6cbda..13b9f66d950e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -369,7 +369,7 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); - memcg = mem_cgroup_from_id(index); + memcg = mem_cgroup_from_private_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 0b50cb122ff3..0e3d972fad33 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -635,14 +635,14 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) * have an ID allocated to it anymore, charge the closest online * ancestor for the swap instead and transfer the memory+swap charge. 
*/ - swap_memcg = mem_cgroup_id_get_online(memcg); + swap_memcg = mem_cgroup_private_id_get_online(memcg); nr_entries = folio_nr_pages(folio); /* Get references for the tail pages, too */ if (nr_entries > 1) - mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); + mem_cgroup_private_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); + swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index e92b21af92b1..49933925b4ba 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -28,8 +28,8 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); int memory_stat_show(struct seq_file *m, void *v); -void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); -struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg); +void mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, unsigned int n); +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg); /* Cgroup v1-specific declarations */ #ifdef CONFIG_MEMCG_V1 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 75fc22a33b28..25ad8433df2e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3554,38 +3554,38 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) */ #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) -static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids); +static DEFINE_XARRAY_ALLOC1(mem_cgroup_private_ids); -static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +static void mem_cgroup_private_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { - xa_erase(&mem_cgroup_ids, memcg->id.id); + xa_erase(&mem_cgroup_private_ids, memcg->id.id); memcg->id.id = 0; } } -void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, +void __maybe_unused mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, unsigned int n) { refcount_add(n, &memcg->id.ref); } -static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) +static void mem_cgroup_private_id_put_many(struct mem_cgroup *memcg, unsigned int n) { if (refcount_sub_and_test(n, &memcg->id.ref)) { - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); /* Memcg ID pins CSS */ css_put(&memcg->css); } } -static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +static inline void mem_cgroup_private_id_put(struct mem_cgroup *memcg) { - mem_cgroup_id_put_many(memcg, 1); + mem_cgroup_private_id_put_many(memcg, 1); } -struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { /* @@ -3604,15 +3604,20 @@ struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) } /** - * mem_cgroup_from_id - look up a memcg from a memcg id + * mem_cgroup_from_private_id - look up a memcg from a memcg id * @id: the memcg id to look up * * Caller must hold rcu_read_lock(). 
*/ -struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); - return xa_load(&mem_cgroup_ids, id); + return xa_load(&mem_cgroup_private_ids, id); +} + +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + return mem_cgroup_from_private_id(id); } #ifdef CONFIG_SHRINKER_DEBUG @@ -3711,7 +3716,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg) return ERR_PTR(-ENOMEM); - error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL, + error = xa_alloc(&mem_cgroup_private_ids, &memcg->id.id, NULL, XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL); if (error) goto fail; @@ -3771,7 +3776,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) lru_gen_init_memcg(memcg); return memcg; fail: - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); __mem_cgroup_free(memcg); return ERR_PTR(error); } @@ -3854,7 +3859,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) css_get(css); /* - * Ensure mem_cgroup_from_id() works once we're fully online. + * Ensure mem_cgroup_from_private_id() works once we're fully online. * * We could do this earlier and require callers to filter with * css_tryget_online(). But right now there are no users that @@ -3863,13 +3868,13 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) * publish it here at the end of onlining. This matches the * regular ID destruction during offlining. */ - xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); + xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; offline_kmem: memcg_offline_kmem(memcg); remove_id: - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); return -ENOMEM; } @@ -3892,7 +3897,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) drain_all_stock(memcg); - mem_cgroup_id_put(memcg); + mem_cgroup_private_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -4779,7 +4784,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, id = lookup_swap_cgroup_id(entry); rcu_read_lock(); - memcg = mem_cgroup_from_id(id); + memcg = mem_cgroup_from_private_id(id); if (!memcg || !css_tryget_online(&memcg->css)) memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); @@ -5174,22 +5179,22 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) return 0; } - memcg = mem_cgroup_id_get_online(memcg); + memcg = mem_cgroup_private_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); - mem_cgroup_id_put(memcg); + mem_cgroup_private_id_put(memcg); return -ENOMEM; } /* Get references for the tail pages, too */ if (nr_pages > 1) - mem_cgroup_id_get_many(memcg, nr_pages - 1); + mem_cgroup_private_id_get_many(memcg, nr_pages - 1); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); + swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry); return 0; } @@ -5206,7 +5211,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) id = swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); - memcg = mem_cgroup_from_id(id); + memcg = mem_cgroup_from_private_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { if (do_memsw_account()) @@ -5215,7 +5220,7 @@ void 
__mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); - mem_cgroup_id_put_many(memcg, nr_pages); + mem_cgroup_private_id_put_many(memcg, nr_pages); } rcu_read_unlock(); } diff --git a/mm/workingset.c b/mm/workingset.c index e9f05634747a..13422d304715 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -254,7 +254,7 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); + return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset); } /* @@ -271,7 +271,7 @@ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); - memcg = mem_cgroup_from_id(memcg_id); + memcg = mem_cgroup_from_private_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); @@ -395,7 +395,7 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); @@ -456,7 +456,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * would be better if the root_mem_cgroup existed in all * configurations instead. */ - eviction_memcg = mem_cgroup_from_id(memcgid); + eviction_memcg = mem_cgroup_from_private_id(memcgid); if (!mem_cgroup_tryget(eviction_memcg)) eviction_memcg = NULL; rcu_read_unlock(); -- cgit v1.2.3 From 1d89d7fd592e2490cadd13c253d7b1b9f6116be8 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:10 -0800 Subject: memcg: expose mem_cgroup_ino() and mem_cgroup_get_from_ino() unconditionally Remove the CONFIG_SHRINKER_DEBUG guards around mem_cgroup_ino() and mem_cgroup_get_from_ino(). These APIs provide a way to get a memcg's cgroup inode number and to look up a memcg from an inode number respectively. Making these functions unconditionally available allows other in-kernel users to leverage them without requiring CONFIG_SHRINKER_DEBUG to be enabled. No functional change for existing users. Link: https://lkml.kernel.org/r/20251225232116.294540-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ---- mm/memcontrol.c | 2 -- 2 files changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1c4224bcfb23..77f32be26ea8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -836,14 +836,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); -#ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? 
cgroup_ino(memcg->css.cgroup) : 0; } struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1308,7 +1306,6 @@ static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return NULL; } -#ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; @@ -1318,7 +1315,6 @@ static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 25ad8433df2e..e85816960e38 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3620,7 +3620,6 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_private_id(id); } -#ifdef CONFIG_SHRINKER_DEBUG struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { struct cgroup *cgrp; @@ -3641,7 +3640,6 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) return memcg; } -#endif static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) { -- cgit v1.2.3 From ea73e364716023b1a47d58b9f12e7c92f3b1e6a7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:12 -0800 Subject: memcg: use cgroup_id() instead of cgroup_ino() for memcg ID Switch mem_cgroup_ino() from using cgroup_ino() to cgroup_id(). The cgroup_ino() returns the kernfs inode number while cgroup_id() returns the kernfs node ID. For 64-bit systems, they are the same. Also cgroup_get_from_id() expects 64-bit node ID which is called by mem_cgroup_get_from_ino(). Change the type from unsigned long to u64 to match cgroup_id()'s return type, and update the format specifiers accordingly. Note that the names mem_cgroup_ino() and mem_cgroup_get_from_ino() are now misnomers since they deal with cgroup IDs rather than inode numbers. A follow-up patch will rename them. Link: https://lkml.kernel.org/r/20251225232116.294540-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 +++++----- mm/memcontrol.c | 2 +- mm/shrinker_debug.c | 7 ++++--- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 77f32be26ea8..c823150ec288 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -836,12 +836,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); -static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) { - return memcg ? cgroup_ino(memcg->css.cgroup) : 0; + return memcg ? 
cgroup_id(memcg->css.cgroup) : 0; } -struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); +struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino); static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1306,12 +1306,12 @@ static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return NULL; } -static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } -static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +static inline struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino) { return NULL; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92beb74482fa..1ff2f9bd820c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3620,7 +3620,7 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_private_id(id); } -struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino) { struct cgroup *cgrp; struct cgroup_subsys_state *css; diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 8aaeb8f5c3af..7ef16a0b2959 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -70,7 +70,7 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) memcg_aware ? memcg : NULL, count_per_node); if (total) { - seq_printf(m, "%lu", mem_cgroup_ino(memcg)); + seq_printf(m, "%llu", mem_cgroup_ino(memcg)); for_each_node(nid) seq_printf(m, " %lu", count_per_node[nid]); seq_putc(m, '\n'); @@ -106,7 +106,8 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, size_t size, loff_t *pos) { struct shrinker *shrinker = file->private_data; - unsigned long nr_to_scan = 0, ino, read_len; + unsigned long nr_to_scan = 0, read_len; + u64 ino; struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; @@ -119,7 +120,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EFAULT; kbuf[read_len] = '\0'; - if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3) + if (sscanf(kbuf, "%llu %d %lu", &ino, &nid, &nr_to_scan) != 3) return -EINVAL; if (nid < 0 || nid >= nr_node_ids) -- cgit v1.2.3 From 5866891a7ab1348686da70f70e925964d9227bf5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:13 -0800 Subject: mm/damon: use cgroup ID instead of private memcg ID DAMON was using the internal private memcg ID which is meant for tracking kernel objects that outlive their cgroup. Switch to using the public cgroup ID instead. 
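The accompanying type change (unsigned short -> u64 for memcg_id in damos_quota_goal and damos_filter) follows from the ID swap: the private memcg ID fits in 16 bits (MEM_CGROUP_ID_SHIFT earlier in the series), while cgroup_id() returns a 64-bit kernfs node ID. A small standalone illustration, with a made-up ID value, of why keeping the 16-bit field would silently break the comparison done in damos_folio_filter_match():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cgroup_id = 0x1234500ULL;	/* illustrative kernfs node ID */
	unsigned short old_field = (unsigned short)cgroup_id;	/* truncates */

	printf("stored 0x%x, compared against 0x%llx, match: %d\n",
	       old_field, (unsigned long long)cgroup_id,
	       (uint64_t)old_field == cgroup_id);
	return 0;
}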
Link: https://lkml.kernel.org/r/20251225232116.294540-6-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: SeongJae Park Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++-- mm/damon/core.c | 7 ++----- mm/damon/ops-common.c | 2 +- mm/damon/sysfs-schemes.c | 8 ++++---- 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index a67292a2f09d..650e7ecfa32b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -203,7 +203,7 @@ struct damos_quota_goal { u64 last_psi_total; struct { int nid; - unsigned short memcg_id; + u64 memcg_id; }; }; struct list_head list; @@ -419,7 +419,7 @@ struct damos_filter { bool matching; bool allow; union { - unsigned short memcg_id; + u64 memcg_id; struct damon_addr_range addr_range; int target_idx; struct damon_size_range sz_range; diff --git a/mm/damon/core.c b/mm/damon/core.c index 7f0028e23f92..3edbff685534 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2094,16 +2094,13 @@ static unsigned long damos_get_node_memcg_used_bp( unsigned long used_pages, numerator; struct sysinfo i; - rcu_read_lock(); - memcg = mem_cgroup_from_id(goal->memcg_id); - if (!memcg || !mem_cgroup_tryget(memcg)) { - rcu_read_unlock(); + memcg = mem_cgroup_get_from_ino(goal->memcg_id); + if (!memcg) { if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) return 0; else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ return 10000; } - rcu_read_unlock(); mem_cgroup_flush_stats(memcg); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid)); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index a218d9922234..dd81db95f901 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -274,7 +274,7 @@ bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio) if (!memcg) matched = false; else - matched = filter->memcg_id == mem_cgroup_id(memcg); + matched = filter->memcg_id == mem_cgroup_ino(memcg); rcu_read_unlock(); break; case DAMOS_FILTER_TYPE_YOUNG: diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 19bc2288cd68..6125f259ecea 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2494,7 +2494,7 @@ static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, return false; } -static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) +static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) { struct mem_cgroup *memcg; char *path; @@ -2509,11 +2509,11 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; memcg = mem_cgroup_iter(NULL, memcg, NULL)) { - /* skip removed memcg */ - if (!mem_cgroup_id(memcg)) + /* skip offlined memcg */ + if (!mem_cgroup_online(memcg)) continue; if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { - *id = mem_cgroup_id(memcg); + *id = mem_cgroup_ino(memcg); found = true; break; } -- cgit v1.2.3 From 2202e3a8cb80da583670034ee33c995513708949 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:15 -0800 Subject: memcg: remove unused mem_cgroup_id() and mem_cgroup_from_id() Now that all callers have been converted to use either: - The private ID APIs (mem_cgroup_private_id/mem_cgroup_from_private_id) for internal kernel objects that outlive their cgroup - The 
public cgroup ID APIs (mem_cgroup_ino/mem_cgroup_get_from_ino) for external interfaces Remove the unused wrapper functions mem_cgroup_id() and mem_cgroup_from_id() along with their !CONFIG_MEMCG stubs. Link: https://lkml.kernel.org/r/20251225232116.294540-8-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 18 ------------------ mm/memcontrol.c | 5 ----- 2 files changed, 23 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c823150ec288..3e7d69020b39 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -830,12 +830,6 @@ static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id); -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return mem_cgroup_private_id(memcg); -} -struct mem_cgroup *mem_cgroup_from_id(unsigned short id); - static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_id(memcg->css.cgroup) : 0; @@ -1282,18 +1276,6 @@ static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { } -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return 0; -} - -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - WARN_ON_ONCE(id); - /* XXX: This should always return root_mem_cgroup */ - return NULL; -} - static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ff2f9bd820c..ede39dde05df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3615,11 +3615,6 @@ struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return xa_load(&mem_cgroup_private_ids, id); } -struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - return mem_cgroup_from_private_id(id); -} - struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino) { struct cgroup *cgrp; -- cgit v1.2.3 From 95296536eb19c969e91684287cf3bfcb382221d3 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:16 -0800 Subject: memcg: rename mem_cgroup_ino() to mem_cgroup_id() Rename mem_cgroup_ino() to mem_cgroup_id() and mem_cgroup_get_from_ino() to mem_cgroup_get_from_id(). These functions now use cgroup IDs (from cgroup_id()) rather than inode numbers, so the names should reflect that. 
[shakeel.butt@linux.dev: replace ino with id, per SeongJae] Link: https://lkml.kernel.org/r/flkqanhyettp5uq22bjwg37rtmnpeg3mghznsylxcxxgaafpl4@nov2x7tagma7 [akpm@linux-foundation.org: build fix] Link: https://lkml.kernel.org/r/20251225232116.294540-9-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: SeongJae Park Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 ++++---- mm/damon/core.c | 2 +- mm/damon/ops-common.c | 2 +- mm/damon/sysfs-schemes.c | 2 +- mm/memcontrol.c | 4 ++-- mm/shrinker_debug.c | 10 +++++----- mm/vmscan.c | 6 +++--- 7 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3e7d69020b39..ed4764e1a30e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -830,12 +830,12 @@ static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id); -static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_id(struct mem_cgroup *memcg) { return memcg ? cgroup_id(memcg->css.cgroup) : 0; } -struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino); +struct mem_cgroup *mem_cgroup_get_from_id(u64 id); static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1288,12 +1288,12 @@ static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return NULL; } -static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } -static inline struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino) +static inline struct mem_cgroup *mem_cgroup_get_from_id(u64 id) { return NULL; } diff --git a/mm/damon/core.c b/mm/damon/core.c index 3edbff685534..6888917c1a00 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2094,7 +2094,7 @@ static unsigned long damos_get_node_memcg_used_bp( unsigned long used_pages, numerator; struct sysinfo i; - memcg = mem_cgroup_get_from_ino(goal->memcg_id); + memcg = mem_cgroup_get_from_id(goal->memcg_id); if (!memcg) { if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) return 0; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index dd81db95f901..a218d9922234 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -274,7 +274,7 @@ bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio) if (!memcg) matched = false; else - matched = filter->memcg_id == mem_cgroup_ino(memcg); + matched = filter->memcg_id == mem_cgroup_id(memcg); rcu_read_unlock(); break; case DAMOS_FILTER_TYPE_YOUNG: diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 6125f259ecea..419d6e7ee945 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2513,7 +2513,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) if (!mem_cgroup_online(memcg)) continue; if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { - *id = mem_cgroup_ino(memcg); + *id = mem_cgroup_id(memcg); found = true; break; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ede39dde05df..7d6cf47e6d4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3615,13 +3615,13 @@ struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return xa_load(&mem_cgroup_private_ids, id); } -struct mem_cgroup 
*mem_cgroup_get_from_ino(u64 ino) +struct mem_cgroup *mem_cgroup_get_from_id(u64 id) { struct cgroup *cgrp; struct cgroup_subsys_state *css; struct mem_cgroup *memcg = NULL; - cgrp = cgroup_get_from_id(ino); + cgrp = cgroup_get_from_id(id); if (IS_ERR(cgrp)) return NULL; diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 7ef16a0b2959..affa64437302 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -70,7 +70,7 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) memcg_aware ? memcg : NULL, count_per_node); if (total) { - seq_printf(m, "%llu", mem_cgroup_ino(memcg)); + seq_printf(m, "%llu", mem_cgroup_id(memcg)); for_each_node(nid) seq_printf(m, " %lu", count_per_node[nid]); seq_putc(m, '\n'); @@ -107,7 +107,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, { struct shrinker *shrinker = file->private_data; unsigned long nr_to_scan = 0, read_len; - u64 ino; + u64 id; struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; @@ -120,7 +120,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EFAULT; kbuf[read_len] = '\0'; - if (sscanf(kbuf, "%llu %d %lu", &ino, &nid, &nr_to_scan) != 3) + if (sscanf(kbuf, "%llu %d %lu", &id, &nid, &nr_to_scan) != 3) return -EINVAL; if (nid < 0 || nid >= nr_node_ids) @@ -130,7 +130,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return size; if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - memcg = mem_cgroup_get_from_ino(ino); + memcg = mem_cgroup_get_from_id(id); if (!memcg) return -ENOENT; @@ -138,7 +138,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, mem_cgroup_put(memcg); return -ENOENT; } - } else if (ino != 0) { + } else if (id != 0) { return -EINVAL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index b87baf3fc77f..4aa47ab000c2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5416,7 +5416,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) if (memcg) cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); #endif - seq_printf(m, "memcg %llu %s\n", mem_cgroup_ino(memcg), path); + seq_printf(m, "memcg %llu %s\n", mem_cgroup_id(memcg), path); } seq_printf(m, " node %5d\n", nid); @@ -5512,12 +5512,12 @@ static int run_cmd(char cmd, u64 memcg_id, int nid, unsigned long seq, return -EINVAL; if (!mem_cgroup_disabled()) { - memcg = mem_cgroup_get_from_ino(memcg_id); + memcg = mem_cgroup_get_from_id(memcg_id); if (!memcg) return -EINVAL; } - if (memcg_id != mem_cgroup_ino(memcg)) + if (memcg_id != mem_cgroup_id(memcg)) goto done; sc->target_mem_cgroup = memcg; -- cgit v1.2.3 From 0be909f114c4e82a4fe5964851af1ab8889dc76c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 7 Jan 2026 14:21:44 +0900 Subject: zsmalloc: use actual object size to detect spans Using class->size to detect spanning objects is not entirely correct, because some size classes can hold a range of object sizes of up to class->size bytes in length, due to size-classes merge. Such classes use padding for cases when actually written objects are smaller than class->size. zs_obj_read_begin() can incorrectly hit the slow path and perform memcpy of such objects, basically copying padding bytes. Instead of class->size zs_obj_read_begin() should use the actual compressed object length (both zram and zswap know it) so that it can correctly handle situations when a written object is small enough to fit into the first physical page. 
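The difference is easiest to see with concrete numbers. Below is a standalone sketch of the containment check with illustrative values (a merged class padded to 2448 bytes holding a 1024-byte object at index 1; ZS_HANDLE_SIZE assumed to be 8 bytes as on a 64-bit build): the object fits entirely within the first page, but the old class->size based check would have sent it down the memcpy slow path.

#include <stdio.h>

#define PAGE_SIZE	4096u
#define ZS_HANDLE_SIZE	8u	/* assumption, 64-bit build */

int main(void)
{
	unsigned int class_size = 2448, obj_idx = 1, obj_len = 1024;
	unsigned int off = (class_size * obj_idx) % PAGE_SIZE;	/* offset_in_page() */

	/* Old check: pads the object out to the (merged) class size. */
	printf("class->size check:   %s\n",
	       off + class_size <= PAGE_SIZE ? "contained" : "spans, copy");

	/* New check: uses the actual written length plus the handle. */
	printf("actual length check: %s\n",
	       off + obj_len + ZS_HANDLE_SIZE <= PAGE_SIZE ? "contained" : "spans, copy");
	return 0;
}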
Link: https://lkml.kernel.org/r/20260107052145.3586917-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Yosry Ahmed [zsmalloc & zswap] Reviewed-by: Nhat Pham Cc: Brian Geffon Cc: Chengming Zhou Cc: Jens Axboe Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 14 ++++++++------ include/linux/zsmalloc.h | 4 ++-- mm/zsmalloc.c | 16 +++++++++++----- mm/zswap.c | 5 +++-- 4 files changed, 24 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1d6760b3b557..f92845ef9192 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2062,11 +2062,11 @@ static int read_incompressible_page(struct zram *zram, struct page *page, void *src, *dst; handle = get_slot_handle(zram, index); - src = zs_obj_read_begin(zram->mem_pool, handle, NULL); + src = zs_obj_read_begin(zram->mem_pool, handle, PAGE_SIZE, NULL); dst = kmap_local_page(page); copy_page(dst, src); kunmap_local(dst); - zs_obj_read_end(zram->mem_pool, handle, src); + zs_obj_read_end(zram->mem_pool, handle, PAGE_SIZE, src); return 0; } @@ -2084,11 +2084,12 @@ static int read_compressed_page(struct zram *zram, struct page *page, u32 index) prio = get_slot_comp_priority(zram, index); zstrm = zcomp_stream_get(zram->comps[prio]); - src = zs_obj_read_begin(zram->mem_pool, handle, zstrm->local_copy); + src = zs_obj_read_begin(zram->mem_pool, handle, size, + zstrm->local_copy); dst = kmap_local_page(page); ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst); kunmap_local(dst); - zs_obj_read_end(zram->mem_pool, handle, src); + zs_obj_read_end(zram->mem_pool, handle, size, src); zcomp_stream_put(zstrm); return ret; @@ -2111,9 +2112,10 @@ static int read_from_zspool_raw(struct zram *zram, struct page *page, u32 index) * takes place here, as we read raw compressed data. 
*/ zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); - src = zs_obj_read_begin(zram->mem_pool, handle, zstrm->local_copy); + src = zs_obj_read_begin(zram->mem_pool, handle, size, + zstrm->local_copy); memcpy_to_page(page, 0, src, size); - zs_obj_read_end(zram->mem_pool, handle, src); + zs_obj_read_end(zram->mem_pool, handle, size, src); zcomp_stream_put(zstrm); return 0; diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index f3ccff2d966c..5565c3171007 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -40,9 +40,9 @@ unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, - void *local_copy); + size_t mem_len, void *local_copy); void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, - void *handle_mem); + size_t mem_len, void *handle_mem); void zs_obj_write(struct zs_pool *pool, unsigned long handle, void *handle_mem, size_t mem_len); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 84da164dcbc5..119c196a287a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1065,7 +1065,7 @@ unsigned long zs_get_total_pages(struct zs_pool *pool) EXPORT_SYMBOL_GPL(zs_get_total_pages); void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, - void *local_copy) + size_t mem_len, void *local_copy) { struct zspage *zspage; struct zpdesc *zpdesc; @@ -1087,7 +1087,10 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + mem_len += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { /* this object is contained entirely within a page */ addr = kmap_local_zpdesc(zpdesc); addr += off; @@ -1096,7 +1099,7 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, /* this object spans two pages */ sizes[0] = PAGE_SIZE - off; - sizes[1] = class->size - sizes[0]; + sizes[1] = mem_len - sizes[0]; addr = local_copy; memcpy_from_page(addr, zpdesc_page(zpdesc), @@ -1115,7 +1118,7 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, EXPORT_SYMBOL_GPL(zs_obj_read_begin); void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, - void *handle_mem) + size_t mem_len, void *handle_mem) { struct zspage *zspage; struct zpdesc *zpdesc; @@ -1129,7 +1132,10 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + mem_len += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { if (!ZsHugePage(zspage)) off += ZS_HANDLE_SIZE; handle_mem -= off; diff --git a/mm/zswap.c b/mm/zswap.c index 6bf4f2441914..1f6c007310d8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -937,7 +937,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) u8 *src, *obj; acomp_ctx = acomp_ctx_get_cpu_lock(pool); - obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer); + obj = zs_obj_read_begin(pool->zs_pool, entry->handle, entry->length, + acomp_ctx->buffer); /* zswap entries of length PAGE_SIZE are not compressed. 
*/ if (entry->length == PAGE_SIZE) { @@ -966,7 +967,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) dlen = acomp_ctx->req->dlen; read_done: - zs_obj_read_end(pool->zs_pool, entry->handle, obj); + zs_obj_read_end(pool->zs_pool, entry->handle, entry->length, obj); acomp_ctx_put_unlock(acomp_ctx); if (!decomp_ret && dlen == PAGE_SIZE) -- cgit v1.2.3 From 01152bd2e44d6bcecd3573d653221ba3944ed0f1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:31 +0800 Subject: mm: debug_vm_pgtable: add debug_vm_pgtable_free_huge_page() Patch series "mm: hugetlb: allocate frozen gigantic folio", v6. Introduce alloc_contig_frozen_pages() and cma_alloc_frozen_compound() which avoid atomic operation about page refcount, and then convert to allocate frozen gigantic folio by the new helpers in hugetlb to cleanup the alloc_gigantic_folio(). This patch (of 6): Add a new helper to free huge page to be consistency to debug_vm_pgtable_alloc_huge_page(), and use HPAGE_PUD_ORDER instead of open-code. Also move the free_contig_range() under CONFIG_ALLOC_CONTIG since all caller are built with CONFIG_ALLOC_CONTIG. Link: https://lkml.kernel.org/r/20260109093136.1491549-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Muchun Song Reviewed-by: Sidhartha Kumar Cc: Brendan Jackman Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- mm/debug_vm_pgtable.c | 38 +++++++++++++++++--------------------- mm/page_alloc.c | 2 +- 3 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f9fdc99ae594..627157972f6a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -444,8 +444,8 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_ int nid, nodemask_t *nodemask); #define alloc_contig_pages(...) 
alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) -#endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); +#endif #ifdef CONFIG_CONTIG_ALLOC static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index ae9b9310d96f..83cf07269f13 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -971,22 +971,26 @@ static unsigned long __init get_random_vaddr(void) return random_vaddr; } -static void __init destroy_args(struct pgtable_debug_args *args) +static void __init +debug_vm_pgtable_free_huge_page(struct pgtable_debug_args *args, + unsigned long pfn, int order) { - struct page *page = NULL; +#ifdef CONFIG_CONTIG_ALLOC + if (args->is_contiguous_page) { + free_contig_range(pfn, 1 << order); + return; + } +#endif + __free_pages(pfn_to_page(pfn), order); +} +static void __init destroy_args(struct pgtable_debug_args *args) +{ /* Free (huge) page */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_pud_hugepage() && args->pud_pfn != ULONG_MAX) { - if (args->is_contiguous_page) { - free_contig_range(args->pud_pfn, - (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT))); - } else { - page = pfn_to_page(args->pud_pfn); - __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT); - } - + debug_vm_pgtable_free_huge_page(args, args->pud_pfn, HPAGE_PUD_ORDER); args->pud_pfn = ULONG_MAX; args->pmd_pfn = ULONG_MAX; args->pte_pfn = ULONG_MAX; @@ -995,20 +999,13 @@ static void __init destroy_args(struct pgtable_debug_args *args) if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage() && args->pmd_pfn != ULONG_MAX) { - if (args->is_contiguous_page) { - free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER)); - } else { - page = pfn_to_page(args->pmd_pfn); - __free_pages(page, HPAGE_PMD_ORDER); - } - + debug_vm_pgtable_free_huge_page(args, args->pmd_pfn, HPAGE_PMD_ORDER); args->pmd_pfn = ULONG_MAX; args->pte_pfn = ULONG_MAX; } if (args->pte_pfn != ULONG_MAX) { - page = pfn_to_page(args->pte_pfn); - __free_page(page); + __free_page(pfn_to_page(args->pte_pfn)); args->pte_pfn = ULONG_MAX; } @@ -1242,8 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args) */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_pud_hugepage()) { - page = debug_vm_pgtable_alloc_huge_page(args, - HPAGE_PUD_SHIFT - PAGE_SHIFT); + page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PUD_ORDER); if (page) { args->pud_pfn = page_to_pfn(page); args->pmd_pfn = args->pud_pfn; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f7d777921f05..c0b048584769 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7255,7 +7255,6 @@ retry: } return NULL; } -#endif /* CONFIG_CONTIG_ALLOC */ void free_contig_range(unsigned long pfn, unsigned long nr_pages) { @@ -7282,6 +7281,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) WARN(count != 0, "%lu pages are still in use!\n", count); } EXPORT_SYMBOL(free_contig_range); +#endif /* CONFIG_CONTIG_ALLOC */ /* * Effectively disable pcplists for the zone by setting the high limit to 0 -- cgit v1.2.3 From a9deb800b89efb2050453f7178e73b1d8b124e0f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:32 +0800 Subject: mm: page_alloc: add __split_page() Factor out the splitting of non-compound page from make_alloc_exact() and split_page() into a new helper function __split_page(). While at it, convert the VM_BUG_ON_PAGE() into a VM_WARN_ON_PAGE(). 
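For reference, a toy userspace model of what splitting a non-compound order-N block means for callers (not kernel code): before the split only the first base page carries a reference, afterwards every one of the 2^N base pages can be freed on its own. The metadata side (page_owner, memcg, alloc tag) that __split_page() now centralises is not modelled here.

#include <stdio.h>

struct page {
	int refcount;
};

/* Toy equivalent of split_page(): hand each trailing base page its own
 * reference; the shared bookkeeping lives in __split_page() in the kernel. */
static void split_block(struct page *pages, unsigned int order)
{
	for (unsigned int i = 1; i < (1u << order); i++)
		pages[i].refcount = 1;
}

int main(void)
{
	struct page block[8] = { { .refcount = 1 } };	/* an order-3 block */

	split_block(block, 3);
	for (int i = 0; i < 8; i++)
		printf("page %d refcount %d\n", i, block[i].refcount);
	return 0;
}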
Link: https://lkml.kernel.org/r/20260109093136.1491549-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Acked-by: Muchun Song Reviewed-by: Zi Yan Reviewed-by: Sidhartha Kumar Cc: Brendan Jackman Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 10 ++++++++++ mm/page_alloc.c | 21 +++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 14a45979cccc..ab60ffba08f5 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -47,6 +47,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); BUG(); \ } \ } while (0) +#define VM_WARN_ON_PAGE(cond, page) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_page(page, "VM_WARN_ON_PAGE(" __stringify(cond)")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ @@ -122,6 +131,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c0b048584769..3b99296eda5b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3107,6 +3107,15 @@ void free_unref_folios(struct folio_batch *folios) folio_batch_reinit(folios); } +static void __split_page(struct page *page, unsigned int order) +{ + VM_WARN_ON_PAGE(PageCompound(page), page); + + split_page_owner(page, order, 0); + pgalloc_tag_split(page_folio(page), order, 0); + split_page_memcg(page, order); +} + /* * split_page takes a non-compound higher-order page, and splits it into * n (1< Date: Fri, 9 Jan 2026 17:31:33 +0800 Subject: mm: cma: kill cma_pages_valid() Kill cma_pages_valid() which only used in cma_release(), also cleanup code duplication between cma pages valid checking and cma memrange finding. 
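The merged check boils down to finding the memrange that contains the first pfn and making sure the whole span stays inside it. A minimal standalone model of that containment test follows (range values are made up; the real function additionally warns on a partial overrun and goes on to free the pages and clear the bitmap):

#include <stdbool.h>
#include <stdio.h>

struct memrange {
	unsigned long base_pfn, count;
};

static bool range_contains(const struct memrange *ranges, int nranges,
			   unsigned long pfn, unsigned long count)
{
	for (int r = 0; r < nranges; r++) {
		unsigned long end = ranges[r].base_pfn + ranges[r].count;

		if (pfn >= ranges[r].base_pfn && pfn < end)
			return pfn + count <= end;
	}
	return false;	/* no memrange matches the page range */
}

int main(void)
{
	struct memrange ranges[] = { { 0x1000, 0x400 }, { 0x8000, 0x800 } };

	printf("%d\n", range_contains(ranges, 2, 0x8100, 0x100));	/* 1: fits inside range 1 */
	printf("%d\n", range_contains(ranges, 2, 0x13f0, 0x100));	/* 0: overruns range 0 */
	return 0;
}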
Link: https://lkml.kernel.org/r/20260109093136.1491549-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Reviewed-by: Zi Yan Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Brendan Jackman Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/cma.h | 1 - mm/cma.c | 48 +++++++++++------------------------------------- 2 files changed, 11 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 62d9c1cf6326..e5745d2aec55 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -49,7 +49,6 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); diff --git a/mm/cma.c b/mm/cma.c index 813e6dc7b095..fe3a9eaac4e5 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -942,36 +942,6 @@ struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) return page ? page_folio(page) : NULL; } -bool cma_pages_valid(struct cma *cma, const struct page *pages, - unsigned long count) -{ - unsigned long pfn, end; - int r; - struct cma_memrange *cmr; - bool ret; - - if (!cma || !pages || count > cma->count) - return false; - - pfn = page_to_pfn(pages); - ret = false; - - for (r = 0; r < cma->nranges; r++) { - cmr = &cma->ranges[r]; - end = cmr->base_pfn + cmr->count; - if (pfn >= cmr->base_pfn && pfn < end) { - ret = pfn + count <= end; - break; - } - } - - if (!ret) - pr_debug("%s(page %p, count %lu)\n", - __func__, (void *)pages, count); - - return ret; -} - /** * cma_release() - release allocated pages * @cma: Contiguous memory region for which the allocation is performed. @@ -991,23 +961,27 @@ bool cma_release(struct cma *cma, const struct page *pages, pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); - if (!cma_pages_valid(cma, pages, count)) + if (!cma || !pages || count > cma->count) return false; pfn = page_to_pfn(pages); - end_pfn = pfn + count; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; - if (pfn >= cmr->base_pfn && - pfn < (cmr->base_pfn + cmr->count)) { - VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count); - break; + end_pfn = cmr->base_pfn + cmr->count; + if (pfn >= cmr->base_pfn && pfn < end_pfn) { + if (pfn + count <= end_pfn) + break; + + VM_WARN_ON_ONCE(1); } } - if (r == cma->nranges) + if (r == cma->nranges) { + pr_debug("%s(page %p, count %lu, no cma range matches the page range)\n", + __func__, (void *)pages, count); return false; + } free_contig_range(pfn, count); cma_clear_bitmap(cma, cmr, pfn, count); -- cgit v1.2.3 From e0c1326779cc1b8e3a9e30ae273b89202ed4c82c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:34 +0800 Subject: mm: page_alloc: add alloc_contig_frozen_{range,pages}() In order to allocate given range of pages or allocate compound pages without incrementing their refcount, adding two new helper alloc_contig_frozen_{range,pages}() which may be beneficial to some users (eg hugetlb). 
The new alloc_contig_{range,pages} only take !__GFP_COMP gfp now, and the free_contig_range() is refactored to only free non-compound pages, the only caller to free compound pages in cma_free_folio() is changed accordingly, and the free_contig_frozen_range() is provided to match the alloc_contig_frozen_range(), which is used to free frozen pages. Link: https://lkml.kernel.org/r/20260109093136.1491549-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Reviewed-by: Sidhartha Kumar Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/gfp.h | 52 ++++++--------- mm/cma.c | 9 ++- mm/hugetlb.c | 9 ++- mm/internal.h | 13 ++++ mm/page_alloc.c | 186 ++++++++++++++++++++++++++++++++++++++-------------- 5 files changed, 184 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 627157972f6a..6ecf6dda93e0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -436,40 +436,30 @@ typedef unsigned int __bitwise acr_flags_t; #define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA /* The below functions must be run on a range from a single zone. */ -extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, - acr_flags_t alloc_flags, gfp_t gfp_mask); -#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) - -extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask); -#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) - +int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask); +#define alloc_contig_frozen_range(...) \ + alloc_hooks(alloc_contig_frozen_range_noprof(__VA_ARGS__)) + +int alloc_contig_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask); +#define alloc_contig_range(...) \ + alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) + +struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages, + gfp_t gfp_mask, int nid, nodemask_t *nodemask); +#define alloc_contig_frozen_pages(...) \ + alloc_hooks(alloc_contig_frozen_pages_noprof(__VA_ARGS__)) + +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask); +#define alloc_contig_pages(...) \ + alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) + +void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages); void free_contig_range(unsigned long pfn, unsigned long nr_pages); #endif -#ifdef CONFIG_CONTIG_ALLOC -static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, - int nid, nodemask_t *node) -{ - struct page *page; - - if (WARN_ON(!order || !(gfp & __GFP_COMP))) - return NULL; - - page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); - - return page ? page_folio(page) : NULL; -} -#else -static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, - int nid, nodemask_t *node) -{ - return NULL; -} -#endif -/* This should be paired with folio_put() rather than free_contig_range(). */ -#define folio_alloc_gigantic(...) 
alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) - DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) #endif /* __LINUX_GFP_H */ diff --git a/mm/cma.c b/mm/cma.c index fe3a9eaac4e5..0e8c146424fb 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -836,7 +836,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); mutex_lock(&cma->alloc_mutex); - ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); + ret = alloc_contig_frozen_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); if (!ret) break; @@ -904,6 +904,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, trace_cma_alloc_finish(name, page ? page_to_pfn(page) : 0, page, count, align, ret); if (page) { + set_pages_refcounted(page, count); count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { @@ -983,7 +984,11 @@ bool cma_release(struct cma *cma, const struct page *pages, return false; } - free_contig_range(pfn, count); + if (PageHead(pages)) + __free_pages((struct page *)pages, compound_order(pages)); + else + free_contig_range(pfn, count); + cma_clear_bitmap(cma, cmr, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 04385a0122de..762aeebf85d2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1428,12 +1428,17 @@ static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, retry: folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); if (!folio) { + struct page *page; + if (hugetlb_cma_exclusive_alloc()) return NULL; - folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); - if (!folio) + page = alloc_contig_frozen_pages(1 << order, gfp_mask, nid, nodemask); + if (!page) return NULL; + + set_page_refcounted(page); + folio = page_folio(page); } if (folio_ref_freeze(folio, 1)) diff --git a/mm/internal.h b/mm/internal.h index 5585059f0209..0623b865ad1a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -580,6 +580,19 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +static inline void set_pages_refcounted(struct page *page, unsigned long nr_pages) +{ + unsigned long pfn = page_to_pfn(page); + + if (PageHead(page)) { + set_page_refcounted(page); + return; + } + + for (; nr_pages--; pfn++) + set_page_refcounted(pfn_to_page(pfn)); +} + /* * Return true if a folio needs ->release_folio() calling upon it. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b99296eda5b..a0bb57c4e851 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6889,7 +6889,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } -static void split_free_pages(struct list_head *list, gfp_t gfp_mask) +static void split_free_frozen_pages(struct list_head *list, gfp_t gfp_mask) { int order; @@ -6901,11 +6901,10 @@ static void split_free_pages(struct list_head *list, gfp_t gfp_mask) int i; post_alloc_hook(page, order, gfp_mask); - set_page_refcounted(page); if (!order) continue; - split_page(page, order); + __split_page(page, order); /* Add all subpages to the order-0 head, in sequence. 
*/ list_del(&page->lru); @@ -6949,8 +6948,14 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) return 0; } +static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) +{ + for (; nr_pages--; pfn++) + free_frozen_pages(pfn_to_page(pfn), 0); +} + /** - * alloc_contig_range() -- tries to allocate given range of pages + * alloc_contig_frozen_range() -- tries to allocate given range of frozen pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate * @alloc_flags: allocation information @@ -6965,12 +6970,15 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) * pageblocks in the range. Once isolated, the pageblocks should not * be modified by others. * - * Return: zero on success or negative error code. On success all - * pages which PFN is in [start, end) are allocated for the caller and - * need to be freed with free_contig_range(). + * All frozen pages which PFN is in [start, end) are allocated for the + * caller, and they could be freed with free_contig_frozen_range(), + * free_frozen_pages() also could be used to free compound frozen pages + * directly. + * + * Return: zero on success or negative error code. */ -int alloc_contig_range_noprof(unsigned long start, unsigned long end, - acr_flags_t alloc_flags, gfp_t gfp_mask) +int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask) { const unsigned int order = ilog2(end - start); unsigned long outer_start, outer_end; @@ -7086,19 +7094,18 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, } if (!(gfp_mask & __GFP_COMP)) { - split_free_pages(cc.freepages, gfp_mask); + split_free_frozen_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ if (start != outer_start) - free_contig_range(outer_start, start - outer_start); + __free_contig_frozen_range(outer_start, start - outer_start); if (end != outer_end) - free_contig_range(end, outer_end - end); + __free_contig_frozen_range(end, outer_end - end); } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); - set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", @@ -7108,16 +7115,40 @@ done: undo_isolate_page_range(start, end); return ret; } -EXPORT_SYMBOL(alloc_contig_range_noprof); +EXPORT_SYMBOL(alloc_contig_frozen_range_noprof); -static int __alloc_contig_pages(unsigned long start_pfn, - unsigned long nr_pages, gfp_t gfp_mask) +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @alloc_flags: allocation information + * @gfp_mask: GFP mask. + * + * This routine is a wrapper around alloc_contig_frozen_range(), it can't + * be used to allocate compound pages, the refcount of each allocated page + * will be set to one. + * + * All pages which PFN is in [start, end) are allocated for the caller, + * and should be freed with free_contig_range() or by manually calling + * __free_page() on each allocated page. + * + * Return: zero on success or negative error code. 
+ */ +int alloc_contig_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask) { - unsigned long end_pfn = start_pfn + nr_pages; + int ret; - return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE, - gfp_mask); + if (WARN_ON(gfp_mask & __GFP_COMP)) + return -EINVAL; + + ret = alloc_contig_frozen_range_noprof(start, end, alloc_flags, gfp_mask); + if (!ret) + set_pages_refcounted(pfn_to_page(start), end - start); + + return ret; } +EXPORT_SYMBOL(alloc_contig_range_noprof); static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, unsigned long nr_pages, bool skip_hugetlb, @@ -7186,7 +7217,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, } /** - * alloc_contig_pages() -- tries to find and allocate contiguous range of pages + * alloc_contig_frozen_pages() -- tries to find and allocate contiguous range of frozen pages * @nr_pages: Number of contiguous pages to allocate * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some * action and reclaim modifiers are supported. Reclaim modifiers @@ -7194,22 +7225,25 @@ static bool zone_spans_last_pfn(const struct zone *zone, * @nid: Target node * @nodemask: Mask for other possible nodes * - * This routine is a wrapper around alloc_contig_range(). It scans over zones - * on an applicable zonelist to find a contiguous pfn range which can then be - * tried for allocation with alloc_contig_range(). This routine is intended - * for allocation requests which can not be fulfilled with the buddy allocator. + * This routine is a wrapper around alloc_contig_frozen_range(). It scans over + * zones on an applicable zonelist to find a contiguous pfn range which can then + * be tried for allocation with alloc_contig_frozen_range(). This routine is + * intended for allocation requests which can not be fulfilled with the buddy + * allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a * power of two, then allocated range is also guaranteed to be aligned to same * nr_pages (e.g. 1GB request would be aligned to 1GB). * - * Allocated pages can be freed with free_contig_range() or by manually calling - * __free_page() on each allocated page. + * Allocated frozen pages need be freed with free_contig_frozen_range(), + * or by manually calling free_frozen_pages() on each allocated frozen + * non-compound page, for compound frozen pages could be freed with + * free_frozen_pages() directly. * - * Return: pointer to contiguous pages on success, or NULL if not successful. + * Return: pointer to contiguous frozen pages on success, or NULL if not successful. */ -struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages, + gfp_t gfp_mask, int nid, nodemask_t *nodemask) { unsigned long ret, pfn, flags; struct zonelist *zonelist; @@ -7231,13 +7265,15 @@ retry: &skipped_hugetlb)) { /* * We release the zone lock here because - * alloc_contig_range() will also lock the zone - * at some point. If there's an allocation - * spinning on this lock, it may win the race - * and cause alloc_contig_range() to fail... + * alloc_contig_frozen_range() will also lock + * the zone at some point. If there's an + * allocation spinning on this lock, it may + * win the race and cause allocation to fail. 
*/ spin_unlock_irqrestore(&zone->lock, flags); - ret = __alloc_contig_pages(pfn, nr_pages, + ret = alloc_contig_frozen_range_noprof(pfn, + pfn + nr_pages, + ACR_FLAGS_NONE, gfp_mask); if (!ret) return pfn_to_page(pfn); @@ -7260,30 +7296,80 @@ retry: } return NULL; } +EXPORT_SYMBOL(alloc_contig_frozen_pages_noprof); -void free_contig_range(unsigned long pfn, unsigned long nr_pages) +/** + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages + * @nr_pages: Number of contiguous pages to allocate + * @gfp_mask: GFP mask. + * @nid: Target node + * @nodemask: Mask for other possible nodes + * + * This routine is a wrapper around alloc_contig_frozen_pages(), it can't + * be used to allocate compound pages, the refcount of each allocated page + * will be set to one. + * + * Allocated pages can be freed with free_contig_range() or by manually + * calling __free_page() on each allocated page. + * + * Return: pointer to contiguous pages on success, or NULL if not successful. + */ +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { - unsigned long count = 0; - struct folio *folio = pfn_folio(pfn); + struct page *page; - if (folio_test_large(folio)) { - int expected = folio_nr_pages(folio); + if (WARN_ON(gfp_mask & __GFP_COMP)) + return NULL; - if (nr_pages == expected) - folio_put(folio); - else - WARN(true, "PFN %lu: nr_pages %lu != expected %d\n", - pfn, nr_pages, expected); + page = alloc_contig_frozen_pages_noprof(nr_pages, gfp_mask, nid, + nodemask); + if (page) + set_pages_refcounted(page, nr_pages); + + return page; +} +EXPORT_SYMBOL(alloc_contig_pages_noprof); + +/** + * free_contig_frozen_range() -- free the contiguous range of frozen pages + * @pfn: start PFN to free + * @nr_pages: Number of contiguous frozen pages to free + * + * This can be used to free the allocated compound/non-compound frozen pages. + */ +void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) +{ + struct page *first_page = pfn_to_page(pfn); + const unsigned int order = ilog2(nr_pages); + + if (WARN_ON_ONCE(first_page != compound_head(first_page))) + return; + + if (PageHead(first_page)) { + WARN_ON_ONCE(order != compound_order(first_page)); + free_frozen_pages(first_page, order); return; } - for (; nr_pages--; pfn++) { - struct page *page = pfn_to_page(pfn); + __free_contig_frozen_range(pfn, nr_pages); +} +EXPORT_SYMBOL(free_contig_frozen_range); + +/** + * free_contig_range() -- free the contiguous range of pages + * @pfn: start PFN to free + * @nr_pages: Number of contiguous pages to free + * + * This can be only used to free the allocated non-compound pages. 
+ */ +void free_contig_range(unsigned long pfn, unsigned long nr_pages) +{ + if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn)))) + return; - count += page_count(page) != 1; - __free_page(page); - } - WARN(count != 0, "%lu pages are still in use!\n", count); + for (; nr_pages--; pfn++) + __free_page(pfn_to_page(pfn)); } EXPORT_SYMBOL(free_contig_range); #endif /* CONFIG_CONTIG_ALLOC */ -- cgit v1.2.3 From 9bda131c6093e9c4a8739e2eeb65ba4d5fbefc2f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:35 +0800 Subject: mm: cma: add cma_alloc_frozen{_compound}() Introduce cma_alloc_frozen{_compound}() helper to alloc pages without incrementing their refcount, then convert hugetlb cma to use the cma_alloc_frozen_compound() and cma_release_frozen() and remove the unused cma_{alloc,free}_folio(), also move the cma_validate_zones() into mm/internal.h since no outside user. The set_pages_refcounted() is only called to set non-compound pages after above changes, so remove the processing about PageHead. Link: https://lkml.kernel.org/r/20260109093136.1491549-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/cma.h | 26 +++---------- mm/cma.c | 107 +++++++++++++++++++++++++++++++++++----------------- mm/hugetlb_cma.c | 24 +++++++----- mm/internal.h | 10 ++--- 4 files changed, 97 insertions(+), 70 deletions(-) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index e5745d2aec55..e2a690f7e77e 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -51,29 +51,15 @@ extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int bool no_warn); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); +struct page *cma_alloc_frozen(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn); +struct page *cma_alloc_frozen_compound(struct cma *cma, unsigned int order); +bool cma_release_frozen(struct cma *cma, const struct page *pages, + unsigned long count); + extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern void cma_reserve_pages_on_error(struct cma *cma); -#ifdef CONFIG_CMA -struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); -bool cma_free_folio(struct cma *cma, const struct folio *folio); -bool cma_validate_zones(struct cma *cma); -#else -static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) -{ - return NULL; -} - -static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) -{ - return false; -} -static inline bool cma_validate_zones(struct cma *cma) -{ - return false; -} -#endif - #endif diff --git a/mm/cma.c b/mm/cma.c index 0e8c146424fb..b80b60ed4927 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -856,8 +856,8 @@ out: return ret; } -static struct page *__cma_alloc(struct cma *cma, unsigned long count, - unsigned int align, gfp_t gfp) +static struct page *__cma_alloc_frozen(struct cma *cma, + unsigned long count, unsigned int align, gfp_t gfp) { struct page *page = NULL; int ret = -ENOMEM, r; @@ -904,7 +904,6 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, trace_cma_alloc_finish(name, page ? 
page_to_pfn(page) : 0, page, count, align, ret); if (page) { - set_pages_refcounted(page, count); count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { @@ -915,6 +914,21 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, return page; } +struct page *cma_alloc_frozen(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) +{ + gfp_t gfp = GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0); + + return __cma_alloc_frozen(cma, count, align, gfp); +} + +struct page *cma_alloc_frozen_compound(struct cma *cma, unsigned int order) +{ + gfp_t gfp = GFP_KERNEL | __GFP_COMP | __GFP_NOWARN; + + return __cma_alloc_frozen(cma, 1 << order, order, gfp); +} + /** * cma_alloc() - allocate pages from contiguous area * @cma: Contiguous memory region for which the allocation is performed. @@ -927,43 +941,27 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, */ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn) -{ - return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); -} - -struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { struct page *page; - if (WARN_ON(!order || !(gfp & __GFP_COMP))) - return NULL; - - page = __cma_alloc(cma, 1 << order, order, gfp); + page = cma_alloc_frozen(cma, count, align, no_warn); + if (page) + set_pages_refcounted(page, count); - return page ? page_folio(page) : NULL; + return page; } -/** - * cma_release() - release allocated pages - * @cma: Contiguous memory region for which the allocation is performed. - * @pages: Allocated pages. - * @count: Number of allocated pages. - * - * This function releases memory allocated by cma_alloc(). - * It returns false when provided pages do not belong to contiguous area and - * true otherwise. - */ -bool cma_release(struct cma *cma, const struct page *pages, - unsigned long count) +static struct cma_memrange *find_cma_memrange(struct cma *cma, + const struct page *pages, unsigned long count) { - struct cma_memrange *cmr; + struct cma_memrange *cmr = NULL; unsigned long pfn, end_pfn; int r; pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); if (!cma || !pages || count > cma->count) - return false; + return NULL; pfn = page_to_pfn(pages); @@ -981,27 +979,66 @@ bool cma_release(struct cma *cma, const struct page *pages, if (r == cma->nranges) { pr_debug("%s(page %p, count %lu, no cma range matches the page range)\n", __func__, (void *)pages, count); - return false; + return NULL; } - if (PageHead(pages)) - __free_pages((struct page *)pages, compound_order(pages)); - else - free_contig_range(pfn, count); + return cmr; +} + +static void __cma_release_frozen(struct cma *cma, struct cma_memrange *cmr, + const struct page *pages, unsigned long count) +{ + unsigned long pfn = page_to_pfn(pages); + + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); + free_contig_frozen_range(pfn, count); cma_clear_bitmap(cma, cmr, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); +} + +/** + * cma_release() - release allocated pages + * @cma: Contiguous memory region for which the allocation is performed. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by cma_alloc(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. 
+ */ +bool cma_release(struct cma *cma, const struct page *pages, + unsigned long count) +{ + struct cma_memrange *cmr; + unsigned long i, pfn; + + cmr = find_cma_memrange(cma, pages, count); + if (!cmr) + return false; + + pfn = page_to_pfn(pages); + for (i = 0; i < count; i++, pfn++) + VM_WARN_ON(!put_page_testzero(pfn_to_page(pfn))); + + __cma_release_frozen(cma, cmr, pages, count); return true; } -bool cma_free_folio(struct cma *cma, const struct folio *folio) +bool cma_release_frozen(struct cma *cma, const struct page *pages, + unsigned long count) { - if (WARN_ON(!folio_test_large(folio))) + struct cma_memrange *cmr; + + cmr = find_cma_memrange(cma, pages, count); + if (!cmr) return false; - return cma_release(cma, &folio->page, folio_nr_pages(folio)); + __cma_release_frozen(cma, cmr, pages, count); + + return true; } int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index f5e79103e110..58ceb6c9e410 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -20,35 +20,39 @@ static unsigned long hugetlb_cma_size __initdata; void hugetlb_cma_free_folio(struct folio *folio) { - int nid = folio_nid(folio); + folio_ref_dec(folio); - WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio)); + WARN_ON_ONCE(!cma_release_frozen(hugetlb_cma[folio_nid(folio)], + &folio->page, folio_nr_pages(folio))); } - struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { int node; - struct folio *folio = NULL; + struct folio *folio; + struct page *page = NULL; if (hugetlb_cma[nid]) - folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); + page = cma_alloc_frozen_compound(hugetlb_cma[nid], order); - if (!folio && !(gfp_mask & __GFP_THISNODE)) { + if (!page && !(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; - folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); - if (folio) + page = cma_alloc_frozen_compound(hugetlb_cma[node], order); + if (page) break; } } - if (folio) - folio_set_hugetlb_cma(folio); + if (!page) + return NULL; + set_page_refcounted(page); + folio = page_folio(page); + folio_set_hugetlb_cma(folio); return folio; } diff --git a/mm/internal.h b/mm/internal.h index 0623b865ad1a..27509a909915 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -584,11 +584,6 @@ static inline void set_pages_refcounted(struct page *page, unsigned long nr_page { unsigned long pfn = page_to_pfn(page); - if (PageHead(page)) { - set_page_refcounted(page); - return; - } - for (; nr_pages--; pfn++) set_page_refcounted(pfn_to_page(pfn)); } @@ -1014,9 +1009,14 @@ void init_cma_reserved_pageblock(struct page *page); struct cma; #ifdef CONFIG_CMA +bool cma_validate_zones(struct cma *cma); void *cma_reserve_early(struct cma *cma, unsigned long size); void init_cma_pageblock(struct page *page); #else +static inline bool cma_validate_zones(struct cma *cma) +{ + return false; +} static inline void *cma_reserve_early(struct cma *cma, unsigned long size) { return NULL; -- cgit v1.2.3 From 4835e2871321fd9cf5bc9702dded323e3e3fbc1a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:06 -0800 Subject: mm/damon/core: introduce [in]active memory ratio damos quota goal metric Patch series "mm/damon: advance DAMOS-based LRU sorting". DAMOS_LRU_[DE]PRIO actions were added to DAMOS for more access-aware LRU lists sorting. For simple usage, a specialized kernel module, namely DAMON_LRU_SORT, has also been introduced. 
After the introduction of the module, DAMON got a few important new features, including the aim-based quota auto-tuning, age tracking, the young page filter, and monitoring intervals auto-tuning. Meanwhile, DAMOS-based LRU sorting received no direct updates. There is now room to advance DAMOS-based LRU sorting.

Firstly, the aim-oriented quota auto-tuning can simplify tuning of the LRU sorting parameters, but there is no good auto-tuning target metric for the LRU sorting use case. Secondly, the behaviors of DAMOS_LRU_[DE]PRIO are not very symmetric: DAMOS_LRU_DEPRIO directly moves pages to the inactive LRU list, while DAMOS_LRU_PRIO only marks pages as accessed, so the pages are moved to the active LRU list not directly but only eventually. Finally, DAMON_LRU_SORT users cannot utilize the modern features that could also be useful for them.

Improve the situation with the following changes. First, introduce a new DAMOS quota auto-tuning target metric for the active:inactive memory size ratio. Since LRU sorting is a form of balancing active and inactive pages, the active:inactive memory size ratio can be set intuitively. Second, update the DAMOS_LRU_[DE]PRIO behaviors to be more intuitive and symmetric, by letting them directly move pages to the [in]active LRU lists. Third, update the DAMON_LRU_SORT module user interface so it can fully utilize the modern features, including the [in]active memory size ratio-based quota auto-tuning, the young page filter, and monitoring intervals auto-tuning.

With these changes, for example, users can now ask DAMON to "find hot/cold memory regions with auto-tuned monitoring intervals, do one more page-level access check for the found hot/cold memory, and move pages of those to the active or inactive LRU lists accordingly, aiming for an X:Y active:inactive memory ratio." For example, if they know that 30% of the memory is better protected from reclamation, 30:70 can be set as the target ratio.

Test Results
------------

I ran DAMON_LRU_SORT with the features introduced by this series on a real-world server workload. For the active:inactive ratio goal, I set 50:50. I confirmed it achieves the target active:inactive ratio, without manual tuning of the monitoring intervals and the hot/coldness thresholds. The baseline system, which was not running DAMON_LRU_SORT, kept an active:inactive ratio of about 1:10.

Note that the test didn't show a clear performance difference, though. I believe that was mainly because the workload was not very memory intensive. Also, whether the 50:50 target ratio was optimal is unclear. Nonetheless, the positive performance impact of the basic LRU sorting idea was already confirmed with the initial DAMON_LRU_SORT introduction patch series. The goal of this patch series is to simplify parameter tuning of DAMOS-based LRU sorting, and the test confirmed that the aimed goals are achieved.

Patches Sequence
----------------

The first three patches extend DAMOS quota auto-tuning to support the [in]active memory ratio target metric type. Those (patches 1-3) introduce the new metrics, implement DAMON sysfs support, and update the documentation, respectively. The following patch (patch 4) makes the DAMOS_LRU_PRIO action directly move target pages to the active LRU list, instead of only marking them accessed. The following seven patches (patches 5-11) update DAMON_LRU_SORT to support modern DAMON features. Patch 5 makes it use not only access frequency but also age when prioritizing under-quota regions.
Patches 6-11 add the support for young page filtering, active:inactive memory ratio based quota auto-tuning, and monitoring intervals auto-tuning, with appropriate document updates. This patch (of 11): DAMOS_LRU_[DE]PRIO are DAMOS actions for making balance of active and inactive memory size. There is no appropriate DAMOS quota auto-tuning target metric for the use case. Add two new DAMOS quota goal metrics for the purpose, namely DAMOS_QUOTA_[IN]ACTIVE_MEM_BP. Those will represent the ratio of [in]active memory to total (inactive + active) memory. Hence, users will be able to ask DAMON to, for example, "find hot and cold memory, and move pages of those to active and inactive LRU lists, adjusting the hot/cold thresholds aiming 50:50 active:inactive memory ratio." Link: https://lkml.kernel.org/r/20260113152717.70459-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260113152717.70459-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++++ mm/damon/core.c | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 650e7ecfa32b..26fb8e90dff6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -155,6 +155,8 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. + * @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio. + * @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -166,6 +168,8 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEM_FREE_BP, DAMOS_QUOTA_NODE_MEMCG_USED_BP, DAMOS_QUOTA_NODE_MEMCG_FREE_BP, + DAMOS_QUOTA_ACTIVE_MEM_BP, + DAMOS_QUOTA_INACTIVE_MEM_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6888917c1a00..729a5f7fac94 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2132,6 +2132,23 @@ static unsigned long damos_get_node_memcg_used_bp( } #endif +/* + * Returns LRU-active or inactive memory to total LRU memory size ratio. 
+ */ +static unsigned int damos_get_in_active_mem_bp(bool active_ratio) +{ + unsigned long active, inactive, total; + + /* This should align with /proc/meminfo output */ + active = global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON) + + global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + inactive = global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON) + + global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + total = active + inactive; + if (active_ratio) + return active * 10000 / total; + return inactive * 10000 / total; +} static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) { @@ -2154,6 +2171,11 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: goal->current_value = damos_get_node_memcg_used_bp(goal); break; + case DAMOS_QUOTA_ACTIVE_MEM_BP: + case DAMOS_QUOTA_INACTIVE_MEM_BP: + goal->current_value = damos_get_in_active_mem_bp( + goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP); + break; default: break; } -- cgit v1.2.3 From dc2e4982cb018306f0699cd460a9033467f07be5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 13 Jan 2026 12:46:45 +0900 Subject: zsmalloc: introduce SG-list based object read API Currently, zsmalloc performs address linearization on read (which sometimes requires memcpy() to a local buffer). Not all zsmalloc users need a linear address. For example, Crypto API supports SG-list, performing linearization under the hood, if needed. In addition, some compressors can have native SG-list support, completely avoiding the linearization step. Provide an SG-list based zsmalloc read API: - zs_obj_read_sg_begin() - zs_obj_read_sg_end() This API allows callers to obtain an SG representation of the object (one entry for objects that are contained in a single page and two entries for spanning objects), avoiding the need for a bounce buffer and memcpy. 
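As an illustrative sketch only (not code from this patch), a reader of the new API could look like the following; consume_sg() is a hypothetical SG-aware consumer such as a Crypto API request, and pool/handle/obj_len are assumed to be set up by the caller:

static int read_object_sg(struct zs_pool *pool, unsigned long handle, size_t obj_len)
{
	/* at most two entries: one per page for a spanning object */
	struct scatterlist sg[2];
	int err;

	zs_obj_read_sg_begin(pool, handle, sg, obj_len);
	err = consume_sg(sg, obj_len);	/* hypothetical consumer */
	zs_obj_read_sg_end(pool, handle);

	return err;
}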
[senozhatsky@chromium.org: make zs_obj_read_sg_begin() return void, per Yosry] Link: https://lkml.kernel.org/r/20260117024900.792237-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20260113034645.2729998-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Herbert Xu Tested-by: Yosry Ahmed Cc: Herbert Xu Cc: Brian Geffon Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/zsmalloc.h | 4 +++ mm/zsmalloc.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'include') diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 5565c3171007..478410c880b1 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -22,6 +22,7 @@ struct zs_pool_stats { }; struct zs_pool; +struct scatterlist; struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); @@ -43,6 +44,9 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, size_t mem_len, void *local_copy); void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, size_t mem_len, void *handle_mem); +void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle, + struct scatterlist *sg, size_t mem_len); +void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle); void zs_obj_write(struct zs_pool *pool, unsigned long handle, void *handle_mem, size_t mem_len); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index cc3d9501ae21..dccb88d52c07 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1141,6 +1142,68 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, } EXPORT_SYMBOL_GPL(zs_obj_read_end); +void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle, + struct scatterlist *sg, size_t mem_len) +{ + struct zspage *zspage; + struct zpdesc *zpdesc; + unsigned long obj, off; + unsigned int obj_idx; + struct size_class *class; + + /* Guarantee we can get zspage from handle safely */ + read_lock(&pool->lock); + obj = handle_to_obj(handle); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); + + /* Make sure migration doesn't move any pages in this zspage */ + zspage_read_lock(zspage); + read_unlock(&pool->lock); + + class = zspage_class(pool, zspage); + off = offset_in_page(class->size * obj_idx); + + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + sg_init_table(sg, 1); + sg_set_page(sg, zpdesc_page(zpdesc), mem_len, off); + } else { + size_t sizes[2]; + + /* this object spans two pages */ + sizes[0] = PAGE_SIZE - off; + sizes[1] = mem_len - sizes[0]; + + sg_init_table(sg, 2); + sg_set_page(sg, zpdesc_page(zpdesc), sizes[0], off); + + zpdesc = get_next_zpdesc(zpdesc); + sg = sg_next(sg); + + sg_set_page(sg, zpdesc_page(zpdesc), sizes[1], 0); + } +} +EXPORT_SYMBOL_GPL(zs_obj_read_sg_begin); + +void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle) +{ + struct zspage *zspage; + struct zpdesc *zpdesc; + unsigned long obj; + unsigned int obj_idx; + + obj = handle_to_obj(handle); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); + + zspage_read_unlock(zspage); +} +EXPORT_SYMBOL_GPL(zs_obj_read_sg_end); + void zs_obj_write(struct zs_pool *pool, unsigned long handle, void *handle_mem, size_t mem_len) { -- cgit v1.2.3 From 3d702678f57edc524f73a7865382ae304269f590 Mon Sep 17 00:00:00 2001 From: 
Jinjiang Tu
Date: Tue, 23 Dec 2025 19:05:23 +0800
Subject: mm/mempolicy: fix mpol_rebind_nodemask() for MPOL_F_NUMA_BALANCING

commit bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") added the new flag MPOL_F_NUMA_BALANCING to enable NUMA balancing for the MPOL_BIND memory policy. When a task's cpuset changes, the task's mempolicy is rebound by mpol_rebind_nodemask(). When MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are both not set, the rebinding behaviour should be the same whether MPOL_F_NUMA_BALANCING is set or not. So, when an application calls set_mempolicy() with MPOL_F_NUMA_BALANCING set but both MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES cleared, mempolicy.w.cpuset_mems_allowed should be set to the cpuset_current_mems_allowed nodemask. However, in the current implementation, mpol_store_user_nodemask() wrongly returns true, causing mempolicy->w.user_nodemask to be incorrectly set to the user-specified nodemask. Later, when the cpuset of the application changes, mpol_rebind_nodemask() ends up rebinding based on the user-specified nodemask rather than the cpuset_mems_allowed nodemask as intended.

I can reproduce this with the following steps in qemu with 4 NUMA nodes:
1. echo '+cpuset' > /sys/fs/cgroup/cgroup.subtree_control
2. mkdir /sys/fs/cgroup/test
3. ./reproducer &
4. cat /proc/$pid/numa_maps, the task is bound to NUMA 1
5. echo $pid > /sys/fs/cgroup/test/cgroup.procs
6. cat /proc/$pid/numa_maps, the task is bound to NUMA 0 now.

The reproducer code:

#include <stdio.h>
#include <stdlib.h>
#include <numa.h>
#include <numaif.h>

int main()
{
	struct bitmask *bmp;
	int ret;

	bmp = numa_parse_nodestring("1");
	ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING, bmp->maskp, bmp->size + 1);
	if (ret < 0) {
		perror("Failed to call set_mempolicy");
		exit(-1);
	}
	while (1);
	return 0;
}

If I call set_mempolicy() without MPOL_F_NUMA_BALANCING in the reproducer code, the task is still bound to NUMA 1 after step 5.

To fix this, only set mempolicy->w.user_nodemask to the user-specified nodemask if MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES is present.
Link: https://lkml.kernel.org/r/20260120011018.1256654-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20251223110523.1161421-1-tujinjiang@huawei.com Fixes: bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") Signed-off-by: Jinjiang Tu Reviewed-by: Gregory Price Reviewed-by: Huang Ying Acked-by: David Hildenbrand (Red Hat) Cc: Alistair Popple Cc: Byungchul Park Cc: Joshua Hahn Cc: Kefeng Wang Cc: Mathew Brost Cc: Mel Gorman Cc: Rakie Kim Cc: Zi Yan Signed-off-by: Andrew Morton --- include/uapi/linux/mempolicy.h | 3 +++ mm/mempolicy.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8fbbe613611a..6c962d866e86 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -39,6 +39,9 @@ enum { #define MPOL_MODE_FLAGS \ (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING) +/* Whether the nodemask is specified by users */ +#define MPOL_USER_NODEMASK_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) + /* Flags for get_mempolicy */ #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */ #define MPOL_F_ADDR (1<<1) /* look up vma using address */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 68a98ba57882..76da50425712 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -365,7 +365,7 @@ static const struct mempolicy_operations { static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & MPOL_MODE_FLAGS; + return pol->flags & MPOL_USER_NODEMASK_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, -- cgit v1.2.3 From 832d95b5314eea558cf4cc9ca40db10122ce8f63 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 9 Jan 2026 04:13:43 +0000 Subject: migrate: replace RMP_ flags with TTU_ flags Instead of translating between RMP_ and TTU_ flags, remove the RMP_ flags and just use the TTU_ flag space; there's plenty available. Possibly we should rename these to RMAP_ flags, and maybe even pass them in through rmap_walk_arg, but that can be done later. 
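For illustration (condensed from the hunks below), a caller that used to combine RMP_ flags now passes the equivalent TTU_ flags directly:

	/* before: remove_migration_ptes(folio, folio, RMP_LOCKED | flags); */
	remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags);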
Link: https://lkml.kernel.org/r/20260109041345.3863089-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: Jann Horn Cc: Joshua Hahn Cc: Lance Yang Cc: Liam Howlett Cc: Matthew Brost Cc: Rakie Kim Cc: Rik van Riel Cc: Vlastimil Babka Cc: Ying Huang Signed-off-by: Andrew Morton --- include/linux/rmap.h | 9 +++------ mm/huge_memory.c | 8 ++++---- mm/migrate.c | 12 ++++++------ 3 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dd764951b03d..8dc0871e5f00 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -92,6 +92,7 @@ struct anon_vma_chain { }; enum ttu_flags { + TTU_USE_SHARED_ZEROPAGE = 0x2, /* for unused pages of large folios */ TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ @@ -933,12 +934,8 @@ int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -enum rmp_flags { - RMP_LOCKED = 1 << 0, - RMP_USE_SHARED_ZEROPAGE = 1 << 1, -}; - -void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); +void remove_migration_ptes(struct folio *src, struct folio *dst, + enum ttu_flags flags); /* * rmap_walk_control: To control rmap traversing for specific needs diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40cf59301c21..44ff8a648afd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3431,7 +3431,7 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, RMP_LOCKED | flags); + remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags); i += folio_nr_pages(folio); if (i >= nr) break; @@ -3944,7 +3944,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, int old_order = folio_order(folio); struct folio *new_folio, *next; int nr_shmem_dropped = 0; - int remap_flags = 0; + enum ttu_flags ttu_flags = 0; int ret; pgoff_t end = 0; @@ -4064,9 +4064,9 @@ fail: shmem_uncharge(mapping->host, nr_shmem_dropped); if (!ret && is_anon && !folio_is_device_private(folio)) - remap_flags = RMP_USE_SHARED_ZEROPAGE; + ttu_flags = TTU_USE_SHARED_ZEROPAGE; - remap_page(folio, 1 << old_order, remap_flags); + remap_page(folio, 1 << old_order, ttu_flags); /* * Unlock all after-split folios except the one containing diff --git a/mm/migrate.c b/mm/migrate.c index 4688b9e38cd2..4750a2ba15fe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -452,11 +452,12 @@ static bool remove_migration_pte(struct folio *folio, * Get rid of all migration entries and replace them by * references to the indicated page. 
*/ -void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) +void remove_migration_ptes(struct folio *src, struct folio *dst, + enum ttu_flags flags) { struct rmap_walk_arg rmap_walk_arg = { .folio = src, - .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, + .map_unused_to_zeropage = flags & TTU_USE_SHARED_ZEROPAGE, }; struct rmap_walk_control rwc = { @@ -464,9 +465,9 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) .arg = &rmap_walk_arg, }; - VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); + VM_BUG_ON_FOLIO((flags & TTU_USE_SHARED_ZEROPAGE) && (src != dst), src); - if (flags & RMP_LOCKED) + if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(dst, &rwc); else rmap_walk(dst, &rwc); @@ -1521,8 +1522,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, !rc ? dst : src, - ttu ? RMP_LOCKED : 0); + remove_migration_ptes(src, !rc ? dst : src, ttu); if (ttu & TTU_RMAP_LOCKED) i_mmap_unlock_write(mapping); -- cgit v1.2.3 From c4a0c5ff85b7ca0d5fbd71888965f40e55295b19 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:35 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pud[s]_set() This reverts commit 6d144436d954 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pud_set"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. Apply this to __page_table_check_puds_set(), page_table_check_puds_set() and the page_table_check_pud_set() wrapper macro. [ajd@linux.ibm.com: rebase on riscv + arm64 changes, update commit message] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-3-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 3 ++- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 12 ++++++------ mm/page_table_check.c | 4 ++-- 5 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 29f7ae7011a8..87ed9b1c011e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -688,7 +688,8 @@ static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr, break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - page_table_check_puds_set(mm, (pud_t *)ptep, pte_pud(pte), nr); + page_table_check_puds_set(mm, addr, (pud_t *)ptep, + pte_pud(pte), nr); break; #endif default: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 9acd58a67123..07705adee128 100644 --- 
a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -953,7 +953,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, pudp, pud); + page_table_check_pud_set(mm, addr, pudp, pud); return __set_pte_at(mm, (pte_t *)pudp, pud_pte(pud)); } @@ -1122,7 +1122,7 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, static inline pud_t pudp_establish(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(vma->vm_mm, pudp, pud); + page_table_check_pud_set(vma->vm_mm, address, pudp, pud); return __pud(atomic_long_xchg((atomic_long_t *)pudp, pud_val(pud))); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2842fa1f7a2c..2b540c563d8d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1221,7 +1221,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, pudp, pud); + page_table_check_pud_set(mm, addr, pudp, pud); native_set_pud(pudp, pud); } @@ -1372,7 +1372,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, static inline pud_t pudp_establish(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(vma->vm_mm, pudp, pud); + page_table_check_pud_set(vma->vm_mm, address, pudp, pud); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pudp, pud); } else { diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 289620d4aad3..0bf18b884a12 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -21,8 +21,8 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, unsigned int nr); -void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, - unsigned int nr); +void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud, unsigned int nr); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); @@ -86,12 +86,12 @@ static inline void page_table_check_pmds_set(struct mm_struct *mm, } static inline void page_table_check_puds_set(struct mm_struct *mm, - pud_t *pudp, pud_t pud, unsigned int nr) + unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_puds_set(mm, pudp, pud, nr); + __page_table_check_puds_set(mm, addr, pudp, pud, nr); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, @@ -137,7 +137,7 @@ static inline void page_table_check_pmds_set(struct mm_struct *mm, } static inline void page_table_check_puds_set(struct mm_struct *mm, - pud_t *pudp, pud_t pud, unsigned int nr) + unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr) { } @@ -150,6 +150,6 @@ static inline void page_table_check_pte_clear_range(struct mm_struct *mm, #endif /* CONFIG_PAGE_TABLE_CHECK */ #define page_table_check_pmd_set(mm, pmdp, pmd) page_table_check_pmds_set(mm, pmdp, pmd, 1) -#define page_table_check_pud_set(mm, pudp, pud) page_table_check_puds_set(mm, pudp, pud, 1) +#define page_table_check_pud_set(mm, addr, pudp, pud) 
page_table_check_puds_set(mm, addr, pudp, pud, 1) #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 741884645ab0..a48f835216a1 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -243,8 +243,8 @@ void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, } EXPORT_SYMBOL(__page_table_check_pmds_set); -void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, - unsigned int nr) +void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud, unsigned int nr) { unsigned long stride = PUD_SIZE >> PAGE_SHIFT; unsigned int i; -- cgit v1.2.3 From 6e2d8f9fc4edcbf9f4dd953e1f41b0ff64867e5b Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:36 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pmd[s]_set() This reverts commit a3b837130b58 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. Apply this to __page_table_check_pmds_set(), page_table_check_pmd_set(), and the page_table_check_pmd_set() wrapper macro. [ajd@linux.ibm.com: rebase on arm64 + riscv changes, update commit message] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-4-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 5 +++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 12 ++++++------ mm/page_table_check.c | 4 ++-- 5 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 87ed9b1c011e..4b580d6246f5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -684,7 +684,8 @@ static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr, page_table_check_ptes_set(mm, ptep, pte, nr); break; case PMD_SIZE: - page_table_check_pmds_set(mm, (pmd_t *)ptep, pte_pmd(pte), nr); + page_table_check_pmds_set(mm, addr, (pmd_t *)ptep, + pte_pmd(pte), nr); break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: @@ -1489,7 +1490,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); } #endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 07705adee128..82b1c79bc2dd 100644 --- 
a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -946,7 +946,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, pmdp, pmd); + page_table_check_pmd_set(mm, addr, pmdp, pmd); return __set_pte_at(mm, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -1023,7 +1023,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2b540c563d8d..7fd876f8d828 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1214,7 +1214,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, pmdp, pmd); + page_table_check_pmd_set(mm, addr, pmdp, pmd); set_pmd(pmdp, pmd); } @@ -1357,7 +1357,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 0bf18b884a12..cf7c28d8d468 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -19,8 +19,8 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr); -void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, - unsigned int nr); +void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd, unsigned int nr); void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -77,12 +77,12 @@ static inline void page_table_check_ptes_set(struct mm_struct *mm, } static inline void page_table_check_pmds_set(struct mm_struct *mm, - pmd_t *pmdp, pmd_t pmd, unsigned int nr) + unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmds_set(mm, pmdp, pmd, nr); + __page_table_check_pmds_set(mm, addr, pmdp, pmd, nr); } static inline void page_table_check_puds_set(struct mm_struct *mm, @@ -132,7 +132,7 @@ static inline void page_table_check_ptes_set(struct mm_struct *mm, } static inline void page_table_check_pmds_set(struct mm_struct *mm, - pmd_t *pmdp, pmd_t pmd, unsigned int nr) + unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { } @@ -149,7 +149,7 @@ static inline void page_table_check_pte_clear_range(struct mm_struct *mm, #endif /* CONFIG_PAGE_TABLE_CHECK */ -#define page_table_check_pmd_set(mm, pmdp, pmd) page_table_check_pmds_set(mm, pmdp, pmd, 1) +#define page_table_check_pmd_set(mm, addr, pmdp, pmd) page_table_check_pmds_set(mm, 
addr, pmdp, pmd, 1) #define page_table_check_pud_set(mm, addr, pudp, pud) page_table_check_puds_set(mm, addr, pudp, pud, 1) #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/page_table_check.c b/mm/page_table_check.c index a48f835216a1..86dc4e4d1dad 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -225,8 +225,8 @@ static inline void page_table_check_pmd_flags(pmd_t pmd) } } -void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, - unsigned int nr) +void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd, unsigned int nr) { unsigned long stride = PMD_SIZE >> PAGE_SHIFT; unsigned int i; -- cgit v1.2.3 From 0a5ae4483177a621f5498c349d31f24b1ef10739 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:37 +1100 Subject: mm/page_table_check: provide addr parameter to page_table_check_ptes_set() To provide support for powerpc platforms, provide an addr parameter to the __page_table_check_ptes_set() and page_table_check_ptes_set() routines. This parameter is needed on some powerpc platforms which do not encode whether a mapping is for user or kernel in the pte. On such platforms, this can be inferred from the addr parameter. [ajd@linux.ibm.com: rebase on arm64 + riscv changes, update commit message] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-5-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Reviewed-by: Pasha Tatashin Acked-by: Alexandre Ghiti # riscv Signed-off-by: Andrew Donnellan Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 12 +++++++----- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 4 ++-- 5 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4b580d6246f5..d1dd0266bb0c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -681,7 +681,7 @@ static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr, switch (pgsize) { case PAGE_SIZE: - page_table_check_ptes_set(mm, ptep, pte, nr); + page_table_check_ptes_set(mm, addr, ptep, pte, nr); break; case PMD_SIZE: page_table_check_pmds_set(mm, addr, (pmd_t *)ptep, diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 82b1c79bc2dd..574a45a22454 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -627,7 +627,7 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval) static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pteval, nr); + page_table_check_ptes_set(mm, addr, ptep, pteval, nr); for (;;) { __set_pte_at(mm, ptep, pteval); diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index cf7c28d8d468..66e109238416 100644 --- 
a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,8 +17,8 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, - unsigned int nr); +void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr); void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, @@ -68,12 +68,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } static inline void page_table_check_ptes_set(struct mm_struct *mm, - pte_t *ptep, pte_t pte, unsigned int nr) + unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_ptes_set(mm, ptep, pte, nr); + __page_table_check_ptes_set(mm, addr, ptep, pte, nr); } static inline void page_table_check_pmds_set(struct mm_struct *mm, @@ -127,7 +128,8 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } static inline void page_table_check_ptes_set(struct mm_struct *mm, - pte_t *ptep, pte_t pte, unsigned int nr) + unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2f0dd3a4ace1..496873f44f67 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -429,7 +429,7 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pte, nr); + page_table_check_ptes_set(mm, addr, ptep, pte, nr); for (;;) { set_pte(ptep, pte); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 86dc4e4d1dad..2871d9c45368 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -196,8 +196,8 @@ static void page_table_check_pte_flags(pte_t pte) } } -void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, - unsigned int nr) +void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) { unsigned int i; -- cgit v1.2.3 From 2e6ac078ce5d6a9dc96cab861359faac508eb56d Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:38 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pud_clear() This reverts commit 931c38e16499 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pud_clear"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. 
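As an aside on how the address can stand in for a user/kernel bit: a check hook on such a platform can classify a mapping purely from the virtual address it covers. The helper below is a hypothetical sketch rather than the actual powerpc code; the name and the TASK_SIZE comparison are assumptions chosen only for illustration.

/*
 * Hypothetical sketch: derive user/kernel ownership from the address
 * passed down to the [__]page_table_check_* helpers instead of from
 * bits in the page table entry. A real architecture would use its own
 * address-space test; TASK_SIZE is assumed here for clarity.
 */
static inline bool check_addr_is_user(unsigned long addr)
{
        return addr < TASK_SIZE;
}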
[ajd@linux.ibm.com: rebase on arm64 changes] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-6-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 +++++++---- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 +++-- 6 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d1dd0266bb0c..595405e6bfc7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1349,7 +1349,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - page_table_check_pud_clear(mm, pte_pud(pte)); + page_table_check_pud_clear(mm, address, pte_pud(pte)); break; #endif default: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 574a45a22454..e06727c975fe 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1101,7 +1101,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_clear(pudp); #endif - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, address, pud); return pud; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7fd876f8d828..3eb36a36058f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1330,7 +1330,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, addr, pud); return pud; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 66e109238416..808cc3a48c28 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -16,7 +16,8 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); -void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, @@ -59,12 +60,13 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) __page_table_check_pmd_clear(mm, pmd); } -static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t 
pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_clear(mm, pud); + __page_table_check_pud_clear(mm, addr, pud); } static inline void page_table_check_ptes_set(struct mm_struct *mm, @@ -123,7 +125,8 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } -static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 496873f44f67..ed3c28ebeb35 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -801,7 +801,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, address, pud); return pud; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2871d9c45368..2295bc9368ab 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -167,7 +167,8 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) } EXPORT_SYMBOL(__page_table_check_pmd_clear); -void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) { if (&init_mm == mm) return; @@ -253,7 +254,7 @@ void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, return; for (i = 0; i < nr; i++) - __page_table_check_pud_clear(mm, *(pudp + i)); + __page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i)); if (pud_user_accessible_page(pud)) page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } -- cgit v1.2.3 From 649ec9e3d03c4908ef51731cd7b422c4a3e2ccff Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:39 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pmd_clear() This reverts commit 1831414cd729 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. 
[ajd@linux.ibm.com: rebase on arm64 changes] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-7-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 +++++++---- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 +++-- 6 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 595405e6bfc7..5abad90913eb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1345,7 +1345,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, page_table_check_pte_clear(mm, pte); break; case PMD_SIZE: - page_table_check_pmd_clear(mm, pte_pmd(pte)); + page_table_check_pmd_clear(mm, address, pte_pmd(pte)); break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index e06727c975fe..6464a2c18ebe 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1007,7 +1007,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_clear(pmdp); #endif - page_table_check_pmd_clear(mm, pmd); + page_table_check_pmd_clear(mm, address, pmd); return pmd; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3eb36a36058f..5a2b2d3a80d8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1319,7 +1319,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, pmd); + page_table_check_pmd_clear(mm, addr, pmd); return pmd; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 808cc3a48c28..3973b69ae294 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -15,7 +15,8 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); -void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, @@ -52,12 +53,13 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) __page_table_check_pte_clear(mm, pte); } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) { if 
(static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_clear(mm, pmd); + __page_table_check_pmd_clear(mm, addr, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, @@ -121,7 +123,8 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ed3c28ebeb35..2d1f7369624c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -788,7 +788,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t pmd = *pmdp; pmd_clear(pmdp); - page_table_check_pmd_clear(mm, pmd); + page_table_check_pmd_clear(mm, address, pmd); return pmd; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2295bc9368ab..e8280b0b6dda 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -156,7 +156,8 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) } EXPORT_SYMBOL(__page_table_check_pte_clear); -void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) { if (&init_mm == mm) return; @@ -238,7 +239,7 @@ void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, page_table_check_pmd_flags(pmd); for (i = 0; i < nr; i++) - __page_table_check_pmd_clear(mm, *(pmdp + i)); + __page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i)); if (pmd_user_accessible_page(pmd)) page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } -- cgit v1.2.3 From d7b4b67eb6b37aef1723a69add88c9a7add81308 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:40 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pte_clear() This reverts commit aa232204c468 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pte_clear"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. 
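For the batched and ranged checks, threading the address through amounts to validating each entry against the address it actually maps, stepping by one entry size per iteration. A minimal sketch of the pattern (illustrative, not the exact upstream loop):

/*
 * Illustrative only: advance the checked address together with the
 * page table entry pointer so that every entry in the batch is
 * checked against its own virtual address.
 */
for (i = 0; i < nr; i++)
        __page_table_check_pte_clear(mm, addr + i * PAGE_SIZE,
                                     ptep_get(ptep + i));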
[ajd@linux.ibm.com: rebase, fix additional occurrence and loop handling] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-8-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 11 +++++++---- include/linux/pgtable.h | 4 ++-- mm/page_table_check.c | 7 ++++--- 6 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 5abad90913eb..ce64c560e284 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1342,7 +1342,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, switch (pgsize) { case PAGE_SIZE: - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, address, pte); break; case PMD_SIZE: page_table_check_pmd_clear(mm, address, pte_pmd(pte)); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 6464a2c18ebe..e3618d789aa4 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -664,7 +664,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, set_pte(ptep, __pte(0)); #endif - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, address, pte); return pte; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5a2b2d3a80d8..6ec6cf7ad2d4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1252,7 +1252,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, addr, pte); return pte; } @@ -1268,7 +1268,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, addr, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 3973b69ae294..12268a32e8be 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -14,7 +14,8 @@ extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); -void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, @@ -45,12 
+46,13 @@ static inline void page_table_check_free(struct page *page, unsigned int order) __page_table_check_zero(page, order); } -static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_clear(mm, pte); + __page_table_check_pte_clear(mm, addr, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, @@ -119,7 +121,8 @@ static inline void page_table_check_free(struct page *page, unsigned int order) { } -static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2d1f7369624c..827dca25c0bc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -634,7 +634,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, address, pte); return pte; } #endif @@ -693,7 +693,7 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, addr, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH diff --git a/mm/page_table_check.c b/mm/page_table_check.c index e8280b0b6dda..de9e54bd27e6 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -145,7 +145,8 @@ void __page_table_check_zero(struct page *page, unsigned int order) rcu_read_unlock(); } -void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) { if (&init_mm == mm) return; @@ -209,7 +210,7 @@ void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, page_table_check_pte_flags(pte); for (i = 0; i < nr; i++) - __page_table_check_pte_clear(mm, ptep_get(ptep + i)); + __page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } @@ -275,7 +276,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, ptep_get(ptep)); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } -- cgit v1.2.3 From cbc064e708b687cd2dbc2b788c473e2a34e10f7c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Jan 2026 12:22:13 -0500 Subject: nodemask: propagate boolean for nodes_and{,not} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "nodemask: align nodes_and{,not} with underlying bitmap ops". nodes_and{,not} are void despite that underlying bitmap_and(,not) return boolean, true if the result bitmap is non-empty. Align nodemask API, and simplify client code. This patch (of 3): Bitmap functions bitmap_and{,not} return boolean depending on emptiness of the result bitmap. The corresponding nodemask helpers ignore the returned value. Propagate the underlying bitmaps result to nodemasks users, as it simplifies user code. 
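On the caller side, the change lets the emptiness test fold into the operation itself. The sketch below uses made-up nodemask variables (allowed, requested, online) purely for illustration:

        /* Before: compute the intersection, then test it separately. */
        nodes_and(allowed, requested, online);
        if (nodes_empty(allowed))
                return -EINVAL;

        /* After: the emptiness check comes for free from the return value. */
        if (!nodes_and(allowed, requested, online))
                return -EINVAL;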
Link: https://lkml.kernel.org/r/20260114172217.861204-1-ynorov@nvidia.com Link: https://lkml.kernel.org/r/20260114172217.861204-2-ynorov@nvidia.com Signed-off-by: Yury Norov Reviewed-by: Gregory Price Reviewed-by: Joshua Hahn Reviewed-by: David Hildenbrand (Red Hat) Cc: Alistair Popple Cc: Byungchul Park Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Rakie Kim Cc: Rasmus Villemoes Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Vlastimil Babka Cc: Waiman Long Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index bd38648c998d..204c92462f3c 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -157,10 +157,10 @@ static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) -static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline bool __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { - bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); + return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ @@ -181,10 +181,10 @@ static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1 #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) -static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline bool __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { - bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); + return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) -- cgit v1.2.3 From 4262c53236977de3ceaa3bf2aefdf772c9b874dd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 15 Jan 2026 07:20:41 -0800 Subject: mm/damon/core: implement damon_kdamond_pid() Patch series "mm/damon: hide kdamond and kdamond_lock from API callers". The 'kdamond' and 'kdamond_lock' fields were initially exposed to DAMON API callers for flexible synchronization and use cases. As the DAMON API became somewhat complicated compared to the early days, keeping those exposed could only encourage the API callers to invent more creative but complicated and difficult-to-debug use cases. Fortunately, DAMON API callers didn't invent that many creative use cases. There exist only two use cases of 'kdamond' and 'kdamond_lock': finding whether the kdamond is actively running, and getting the pid of the kdamond. For the first use case, a dedicated API function, namely 'damon_is_running()', is provided, and all DAMON API callers are using it. Hence the second use case is the only one where the fields are directly used by DAMON API callers. To prevent future invention of complicated and erroneous use cases of the fields, hide them from the API callers. For that, provide a new dedicated DAMON API function for the remaining use case, namely damon_kdamond_pid(), migrate DAMON API callers to the new function, and mark the fields as private. 
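The caller-side effect is that the open-coded lock-and-dereference pattern collapses into one call; sketched below for a hypothetical caller holding a struct damon_ctx *ctx (names and error handling are illustrative):

        int pid;

        /* Before: take kdamond_lock and dereference kdamond directly. */
        mutex_lock(&ctx->kdamond_lock);
        pid = ctx->kdamond ? ctx->kdamond->pid : -EINVAL;
        mutex_unlock(&ctx->kdamond_lock);

        /*
         * After: the synchronization is hidden behind the API; a negative
         * return value means the kdamond is not running.
         */
        pid = damon_kdamond_pid(ctx);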
This patch (of 5): 'kdamond' and 'kdamond_lock' are directly being used by DAMON API callers for getting the pid of the corresponding kdamond. To discourage invention of creative but complicated and erroneous new usages of the fields that require careful synchronization, implement a new API function that can simply be used without the manual synchronizations. Link: https://lkml.kernel.org/r/20260115152047.68415-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260115152047.68415-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 26fb8e90dff6..5b7ea7082134 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -972,6 +972,7 @@ bool damon_initialized(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); bool damon_is_running(struct damon_ctx *ctx); +int damon_kdamond_pid(struct damon_ctx *ctx); int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); diff --git a/mm/damon/core.c b/mm/damon/core.c index 729a5f7fac94..81b998d32074 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1442,6 +1442,23 @@ bool damon_is_running(struct damon_ctx *ctx) return running; } +/** + * damon_kdamond_pid() - Return pid of a given DAMON context's worker thread. + * @ctx: The DAMON context of the question. + * + * Return: pid if @ctx is running, negative error code otherwise. + */ +int damon_kdamond_pid(struct damon_ctx *ctx) +{ + int pid = -EINVAL; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + pid = ctx->kdamond->pid; + mutex_unlock(&ctx->kdamond_lock); + return pid; +} + /* * damon_call_handle_inactive_ctx() - handle DAMON call request that added to * an inactive context. -- cgit v1.2.3 From 6fe0e6d599a6bb4b65704285d40d4972423b7aaa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 15 Jan 2026 07:20:45 -0800 Subject: mm/damon: hide kdamond and kdamond_lock of damon_ctx There is no DAMON API caller that directly access 'kdamond' and 'kdamond_lock' fields of 'struct damon_ctx'. Keeping those exposed could only encourage creative but error-prone usages. Hide them from DAMON API callers by marking those as private fields. Link: https://lkml.kernel.org/r/20260115152047.68415-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5b7ea7082134..e6930d8574d3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -759,23 +759,20 @@ struct damon_attrs { * of the monitoring. * * @attrs: Monitoring attributes for accuracy/overhead control. - * @kdamond: Kernel thread who does the monitoring. - * @kdamond_lock: Mutex for the synchronizations with @kdamond. * - * For each monitoring context, one kernel thread for the monitoring is - * created. The pointer to the thread is stored in @kdamond. + * For each monitoring context, one kernel thread for the monitoring, namely + * kdamond, is created. The pid of kdamond can be retrieved using + * damon_kdamond_pid(). * - * Once started, the monitoring thread runs until explicitly required to be - * terminated or every monitoring target is invalid. 
The validity of the - * targets is checked via the &damon_operations.target_valid of @ops. The - * termination can also be explicitly requested by calling damon_stop(). - * The thread sets @kdamond to NULL when it terminates. Therefore, users can - * know whether the monitoring is ongoing or terminated by reading @kdamond. - * Reads and writes to @kdamond from outside of the monitoring thread must - * be protected by @kdamond_lock. + * Once started, kdamond runs until explicitly required to be terminated or + * every monitoring target is invalid. The validity of the targets is checked + * via the &damon_operations.target_valid of @ops. The termination can also be + * explicitly requested by calling damon_stop(). To know if a kdamond is + * running, damon_is_running() can be used. * - * Note that the monitoring thread protects only @kdamond via @kdamond_lock. - * Accesses to other fields must be protected by themselves. + * While the kdamond is running, all accesses to &struct damon_ctx from a + * thread other than the kdamond should be made using safe DAMON APIs, + * including damon_call() and damos_walk(). * * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. @@ -816,10 +813,12 @@ struct damon_ctx { struct damos_walk_control *walk_control; struct mutex walk_control_lock; -/* public: */ + /* Working thread of the given DAMON context */ struct task_struct *kdamond; + /* Protects @kdamond field access */ struct mutex kdamond_lock; +/* public: */ struct damon_operations ops; unsigned long addr_unit; unsigned long min_sz_region; -- cgit v1.2.3 From 3ab981c1fca08721a2cc100d4e097d4e0c9e149b Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 18 Jan 2026 19:22:57 +0000 Subject: mm/khugepaged: change collapse_pte_mapped_thp() to return void The only external caller of collapse_pte_mapped_thp() is uprobe, which ignores the return value. Change the external API to return void to simplify the interface. Introduce try_collapse_pte_mapped_thp() for internal use that preserves the return value. This prepares for future patch that will convert the return type to use enum scan_result. Link: https://lkml.kernel.org/r/20260118192253.9263-10-shivankg@amd.com Signed-off-by: Shivank Garg Suggested-by: David Hildenbrand (Red Hat) Acked-by: Lance Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Zi Yan Tested-by: Nico Pache Reviewed-by: Nico Pache Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 9 ++++----- mm/khugepaged.c | 40 +++++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index eb1946a70cff..d7a9053ff4fe 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -17,8 +17,8 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); -extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, - bool install_pmd); +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd); static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { @@ -42,10 +42,9 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { } -static inline int collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr, bool install_pmd) +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { - return 0; } static inline void khugepaged_min_free_kbytes_update(void) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 984294a16861..d513375b4f39 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1477,20 +1477,8 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, return SCAN_SUCCEED; } -/** - * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at - * address haddr. - * - * @mm: process address space where collapse happens - * @addr: THP collapse address - * @install_pmd: If a huge PMD should be installed - * - * This function checks whether all the PTEs in the PMD are pointing to the - * right THP. If so, retract the page table so the THP can refault in with - * as pmd-mapped. Possibly install a huge PMD mapping the THP. - */ -int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, - bool install_pmd) +static int try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) { int nr_mapped_ptes = 0, result = SCAN_FAIL; unsigned int nr_batch_ptes; @@ -1711,6 +1699,24 @@ drop_folio: return result; } +/** + * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at + * address haddr. + * + * @mm: process address space where collapse happens + * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed + * + * This function checks whether all the PTEs in the PMD are pointing to the + * right THP. If so, retract the page table so the THP can refault in with + * as pmd-mapped. Possibly install a huge PMD mapping the THP. + */ +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) +{ + try_collapse_pte_mapped_thp(mm, addr, install_pmd); +} + /* Can we retract page tables for this file-backed VMA? */ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) { @@ -2227,7 +2233,7 @@ immap_locked: /* * Remove pte page tables, so we can re-fault the page as huge. - * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). + * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp(). 
*/ retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) @@ -2479,7 +2485,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, mmap_read_lock(mm); if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; - *result = collapse_pte_mapped_thp(mm, + *result = try_collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); if (*result == SCAN_PMD_MAPPED) *result = SCAN_SUCCEED; @@ -2844,7 +2850,7 @@ handle_result: case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); mmap_read_lock(mm); - result = collapse_pte_mapped_thp(mm, addr, true); + result = try_collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ -- cgit v1.2.3 From a00de9ba30aa71fe68ab45a9d2df595a7c39dd74 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:14 +0100 Subject: mm/balloon_compaction: centralize adjust_managed_page_count() handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's centralize it, by allowing for the driver to enable this handling through a new flag (bool for now) in the balloon device info. Note that we now adjust the counter when adding/removing a page into the balloon list: when removing a page to deflate it, it will now happen before the driver communicated with hypervisor, not afterwards. This shouldn't make a difference in practice. Link: https://lkml.kernel.org/r/20260119230133.3551867-7-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Acked-by: Liam R. Howlett Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/cmm.c | 13 +------------ drivers/virtio/virtio_balloon.c | 19 ++----------------- include/linux/balloon_compaction.h | 2 ++ mm/balloon_compaction.c | 17 +++++++++++++++++ 4 files changed, 22 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 15f873f733a4..7fd8b3d7e763 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -165,7 +165,6 @@ static long cmm_alloc_pages(long nr) balloon_page_enqueue(&b_dev_info, page); atomic_long_inc(&loaned_pages); - adjust_managed_page_count(page, -1); nr--; } @@ -190,7 +189,6 @@ static long cmm_free_pages(long nr) if (!page) break; plpar_page_set_active(page); - adjust_managed_page_count(page, 1); __free_page(page); atomic_long_dec(&loaned_pages); nr--; @@ -515,16 +513,6 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, return -EBUSY; } - /* - * When we migrate a page to a different zone, we have to fixup the - * count of both involved zones as we adjusted the managed page count - * when inflating. - */ - if (page_zone(page) != page_zone(newpage)) { - adjust_managed_page_count(page, 1); - adjust_managed_page_count(newpage, -1); - } - /* * activate/"deflate" the old page. We ignore any errors just like the * other callers. 
@@ -551,6 +539,7 @@ static int cmm_init(void) return -EOPNOTSUPP; balloon_devinfo_init(&b_dev_info); + b_dev_info.adjust_managed_page_count = true; if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) b_dev_info.migratepage = cmm_migratepage; diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index df2756c071da..15c1cf5fd249 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -274,9 +274,6 @@ static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num) set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; - if (!virtio_has_feature(vb->vdev, - VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) - adjust_managed_page_count(page, -1); vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; } @@ -295,9 +292,6 @@ static void release_pages_balloon(struct virtio_balloon *vb, struct page *page, *next; list_for_each_entry_safe(page, next, pages, lru) { - if (!virtio_has_feature(vb->vdev, - VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) - adjust_managed_page_count(page, 1); list_del(&page->lru); put_page(page); /* balloon reference */ } @@ -839,17 +833,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, if (!mutex_trylock(&vb->balloon_lock)) return -EAGAIN; - /* - * When we migrate a page to a different zone and adjusted the - * managed page count when inflating, we have to fixup the count of - * both involved zones. - */ - if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) && - page_zone(page) != page_zone(newpage)) { - adjust_managed_page_count(page, 1); - adjust_managed_page_count(newpage, -1); - } - /* balloon's page migration 1st step -- inflate "newpage" */ vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; set_page_pfns(vb, vb->pfns, newpage); @@ -958,6 +941,8 @@ static int virtballoon_probe(struct virtio_device *vdev) if (err) goto out_free_vb; + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + vb->vb_dev_info.adjust_managed_page_count = true; #ifdef CONFIG_BALLOON_COMPACTION vb->vb_dev_info.migratepage = virtballoon_migratepage; #endif diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 7cfe48769239..3109d3c43d30 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -56,6 +56,7 @@ struct balloon_dev_info { struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); + bool adjust_managed_page_count; }; extern struct page *balloon_page_alloc(void); @@ -73,6 +74,7 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; + balloon->adjust_managed_page_count = false; } #ifdef CONFIG_BALLOON_COMPACTION diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 764fa25dc4bd..4fe2a0cff69e 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -23,6 +23,8 @@ static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, BUG_ON(!trylock_page(page)); balloon_page_insert(b_dev_info, page); unlock_page(page); + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, -1); __count_vm_event(BALLOON_INFLATE); inc_node_page_state(page, NR_BALLOON_PAGES); } @@ -95,6 +97,8 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, continue; list_del(&page->lru); + if (b_dev_info->adjust_managed_page_count) + 
adjust_managed_page_count(page, 1); balloon_page_finalize(page); __count_vm_event(BALLOON_DEFLATE); list_add(&page->lru, pages); @@ -264,9 +268,22 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, get_page(newpage); balloon_page_insert(b_dev_info, newpage); __count_vm_event(BALLOON_MIGRATE); + + if (b_dev_info->adjust_managed_page_count && + page_zone(page) != page_zone(newpage)) { + /* + * When we migrate a page to a different zone we + * have to fixup the count of both involved zones. + */ + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } } else { /* Old page was deflated but new page not inflated. */ __count_vm_event(BALLOON_DEFLATE); + + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, 1); } b_dev_info->isolated_pages--; -- cgit v1.2.3 From 8202313e3dfa9bdeb73427b564cfe2bfd02e4807 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:16 +0100 Subject: mm/balloon_compaction: use a device-independent balloon (list) lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to remove the dependency on the page lock for balloon pages, we need a lock that is independent of the page. It's crucial that we can handle the scenario where balloon deflation (clearing page->private) can race with page isolation (using page->private to obtain the balloon_dev_info where the lock currently resides). The current lock in balloon_dev_info is therefore not suitable. Fortunately, we never really have more than a single balloon device per VM, so we can just keep it simple and use a static lock to protect all balloon devices. Based on this change we will remove the dependency on the page lock next. Link: https://lkml.kernel.org/r/20260119230133.3551867-9-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 6 ++---- mm/balloon_compaction.c | 34 ++++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 3109d3c43d30..9a8568fcd477 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -21,10 +21,10 @@ * i. Setting the PG_movable_ops flag and page->private with the following * lock order * +-page_lock(page); - * +--spin_lock_irq(&b_dev_info->pages_lock); + * +--spin_lock_irq(&balloon_pages_lock); * * ii. isolation or dequeueing procedure must remove the page from balloon - * device page list under b_dev_info->pages_lock. 
+ * device page list under balloon_pages_lock * * The functions provided by this interface are placed to help on coping with * the aforementioned balloon page corner case, as well as to ensure the simple @@ -52,7 +52,6 @@ */ struct balloon_dev_info { unsigned long isolated_pages; /* # of isolated pages for migration */ - spinlock_t pages_lock; /* Protection to pages list */ struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); @@ -71,7 +70,6 @@ extern size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) { balloon->isolated_pages = 0; - spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; balloon->adjust_managed_page_count = false; diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 4fe2a0cff69e..a0fd779bbd01 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -11,6 +11,12 @@ #include #include +/* + * Lock protecting the balloon_dev_info of all devices. We don't really + * expect more than one device. + */ +static DEFINE_SPINLOCK(balloon_pages_lock); + static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, struct page *page) { @@ -47,13 +53,13 @@ size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, unsigned long flags; size_t n_pages = 0; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_for_each_entry_safe(page, tmp, pages, lru) { list_del(&page->lru); balloon_page_enqueue_one(b_dev_info, page); n_pages++; } - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return n_pages; } EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); @@ -83,7 +89,7 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, unsigned long flags; size_t n_pages = 0; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { if (n_pages == n_req_pages) break; @@ -106,7 +112,7 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, dec_node_page_state(page, NR_BALLOON_PAGES); n_pages++; } - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return n_pages; } @@ -149,9 +155,9 @@ void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, { unsigned long flags; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); balloon_page_enqueue_one(b_dev_info, page); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); } EXPORT_SYMBOL_GPL(balloon_page_enqueue); @@ -191,11 +197,11 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) * BUG() here, otherwise the balloon driver may get stuck in * an infinite loop while attempting to release all its pages. 
*/ - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); if (unlikely(list_empty(&b_dev_info->pages) && !b_dev_info->isolated_pages)) BUG(); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return NULL; } return list_first_entry(&pages, struct page, lru); @@ -213,10 +219,10 @@ static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) if (!b_dev_info) return false; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_del(&page->lru); b_dev_info->isolated_pages++; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return true; } @@ -234,10 +240,10 @@ static void balloon_page_putback(struct page *page) if (WARN_ON_ONCE(!b_dev_info)) return; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); } static int balloon_page_migrate(struct page *newpage, struct page *page, @@ -262,7 +268,7 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, if (rc < 0 && rc != -ENOENT) return rc; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); if (!rc) { /* Insert the new page into the balloon list. */ get_page(newpage); @@ -287,7 +293,7 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, } b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); /* Free the now-deflated page we isolated in balloon_page_isolate(). */ balloon_page_finalize(page); -- cgit v1.2.3 From a3fafdd3896719923f7055b6d7f10f6ee1950d8b Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:17 +0100 Subject: mm/balloon_compaction: remove dependency on page lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's stop using the page lock in balloon code and instead use only the balloon_device_lock. As soon as we set the PG_movable_ops flag, we might now get isolation callbacks for that page as we are no longer holding the page lock. In there, we'll simply synchronize using the balloon_device_lock. So in balloon_page_isolate() lookup the balloon_dev_info through page->private under balloon_device_lock. It's crucial that we update page->private under the balloon_device_lock, so the isolation callback can properly deal with concurrent deflation. Consequently, make sure that balloon_page_finalize() is called under balloon_device_lock as we remove a page from the list and clear page->private. balloon_page_insert() is already called with the balloon_device_lock held. Note that the core will still lock the pages, for example in isolate_movable_ops_page(). The lock is there still relevant for handling the PageMovableOpsIsolated flag, but that can be later changed to use an atomic test-and-set instead, or moved into the movable_ops backends. Link: https://lkml.kernel.org/r/20260119230133.3551867-10-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 25 +++++++++++++------------ mm/balloon_compaction.c | 38 ++++++++++++-------------------------- 2 files changed, 25 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 9a8568fcd477..ad594af6ed10 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -12,25 +12,27 @@ * is derived from the page type (PageOffline()) combined with the * PG_movable_ops flag (PageMovableOps()). * + * Once the page type and the PG_movable_ops are set, migration code + * can initiate page isolation by invoking the + * movable_operations()->isolate_page() callback + * + * As long as page->private is set, the page is either on the balloon list + * or isolated for migration. If page->private is not set, the page is + * either still getting inflated, or was deflated to be freed by the balloon + * driver soon. Isolation is impossible in both cases. + * * As the page isolation scanning step a compaction thread does is a lockless * procedure (from a page standpoint), it might bring some racy situations while * performing balloon page compaction. In order to sort out these racy scenarios * and safely perform balloon's page compaction and migration we must, always, * ensure following these simple rules: * - * i. Setting the PG_movable_ops flag and page->private with the following - * lock order - * +-page_lock(page); - * +--spin_lock_irq(&balloon_pages_lock); + * i. Inflation/deflation must set/clear page->private under the + * balloon_pages_lock * * ii. isolation or dequeueing procedure must remove the page from balloon * device page list under balloon_pages_lock * - * The functions provided by this interface are placed to help on coping with - * the aforementioned balloon page corner case, as well as to ensure the simple - * set of exposed rules are satisfied while we are dealing with balloon pages - * compaction / migration. - * * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini */ #ifndef _LINUX_BALLOON_COMPACTION_H @@ -93,8 +95,7 @@ static inline struct balloon_dev_info *balloon_page_device(struct page *page) * @balloon : pointer to balloon device * @page : page to be assigned as a 'balloon page' * - * Caller must ensure the page is locked and the spin_lock protecting balloon - * pages list is held before inserting a page into the balloon device. + * Caller must ensure the balloon_pages_lock is held. */ static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) @@ -119,7 +120,7 @@ static inline gfp_t balloon_mapping_gfp_mask(void) * balloon list for release to the page allocator * @page: page to be released to the page allocator * - * Caller must ensure that the page is locked. + * Caller must ensure the balloon_pages_lock is held. 
*/ static inline void balloon_page_finalize(struct page *page) { diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index a0fd779bbd01..75763c73dbd5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -20,15 +20,7 @@ static DEFINE_SPINLOCK(balloon_pages_lock); static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, struct page *page) { - /* - * Block others from accessing the 'page' when we get around to - * establishing additional references. We should be the only one - * holding a reference to the 'page' at this point. If we are not, then - * memory corruption is possible and we should stop execution. - */ - BUG_ON(!trylock_page(page)); balloon_page_insert(b_dev_info, page); - unlock_page(page); if (b_dev_info->adjust_managed_page_count) adjust_managed_page_count(page, -1); __count_vm_event(BALLOON_INFLATE); @@ -93,22 +85,12 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { if (n_pages == n_req_pages) break; - - /* - * Block others from accessing the 'page' while we get around to - * establishing additional references and preparing the 'page' - * to be released by the balloon driver. - */ - if (!trylock_page(page)) - continue; - list_del(&page->lru); if (b_dev_info->adjust_managed_page_count) adjust_managed_page_count(page, 1); balloon_page_finalize(page); __count_vm_event(BALLOON_DEFLATE); list_add(&page->lru, pages); - unlock_page(page); dec_node_page_state(page, NR_BALLOON_PAGES); n_pages++; } @@ -213,13 +195,19 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) { - struct balloon_dev_info *b_dev_info = balloon_page_device(page); + struct balloon_dev_info *b_dev_info; unsigned long flags; - if (!b_dev_info) - return false; - spin_lock_irqsave(&balloon_pages_lock, flags); + b_dev_info = balloon_page_device(page); + if (!b_dev_info) { + /* + * The page already got deflated and removed from the + * balloon list. + */ + spin_unlock_irqrestore(&balloon_pages_lock, flags); + return false; + } list_del(&page->lru); b_dev_info->isolated_pages++; spin_unlock_irqrestore(&balloon_pages_lock, flags); @@ -253,9 +241,6 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, unsigned long flags; int rc; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - /* * When we isolated the page, the page was still inflated in a balloon * device. As isolated balloon pages cannot get deflated, we still have @@ -293,10 +278,11 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, } b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&balloon_pages_lock, flags); /* Free the now-deflated page we isolated in balloon_page_isolate(). */ balloon_page_finalize(page); + spin_unlock_irqrestore(&balloon_pages_lock, flags); + put_page(page); return 0; -- cgit v1.2.3 From ddc50a97bef1e34c096bf3f0dc9590d7f570ed7b Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:18 +0100 Subject: mm/balloon_compaction: make balloon_mops static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to expose this anymore, so let's just make it static. Link: https://lkml.kernel.org/r/20260119230133.3551867-11-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 1 - mm/balloon_compaction.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index ad594af6ed10..7db66c2c86cd 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -78,7 +78,6 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) } #ifdef CONFIG_BALLOON_COMPACTION -extern const struct movable_operations balloon_mops; /* * balloon_page_device - get the b_dev_info descriptor for the balloon device * that enqueues the given page. diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 75763c73dbd5..cf4d93176392 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -288,7 +288,7 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, return 0; } -const struct movable_operations balloon_mops = { +static const struct movable_operations balloon_mops = { .migrate_page = balloon_page_migrate, .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, -- cgit v1.2.3 From aa974cbf949e94c79b46a0053d40229bc634f9be Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:19 +0100 Subject: mm/balloon_compaction: drop fs.h include from balloon_compaction.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ever since commit 68f2736a8583 ("mm: Convert all PageMovable users to movable_operations") we no longer store an inode in balloon_dev_info, so we can stop including "fs.h". Link: https://lkml.kernel.org/r/20260119230133.3551867-12-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 7db66c2c86cd..1452ea063524 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -42,7 +42,6 @@ #include #include #include -#include #include /* -- cgit v1.2.3 From 0fa3e9a48bafde8aa5a5b994b05396e9b86ce156 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:21 +0100 Subject: mm/balloon_compaction: remove balloon_page_push/pop() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's remove these helpers as they are unused now. Link: https://lkml.kernel.org/r/20260119230133.3551867-14-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 30 ------------------------------ mm/balloon_compaction.c | 5 ++--- 2 files changed, 2 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 1452ea063524..e5451cf1f658 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -126,34 +126,4 @@ static inline void balloon_page_finalize(struct page *page) set_page_private(page, 0); /* PageOffline is sticky until the page is freed to the buddy. */ } - -/* - * balloon_page_push - insert a page into a page list. - * @head : pointer to list - * @page : page to be added - * - * Caller must ensure the page is private and protect the list. - */ -static inline void balloon_page_push(struct list_head *pages, struct page *page) -{ - list_add(&page->lru, pages); -} - -/* - * balloon_page_pop - remove a page from a page list. - * @head : pointer to list - * @page : page to be added - * - * Caller must ensure the page is private and protect the list. - */ -static inline struct page *balloon_page_pop(struct list_head *pages) -{ - struct page *page = list_first_entry_or_null(pages, struct page, lru); - - if (!page) - return NULL; - - list_del(&page->lru); - return page; -} #endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index cf4d93176392..5e1507a13a52 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -128,9 +128,8 @@ EXPORT_SYMBOL_GPL(balloon_page_alloc); * Drivers must call this function to properly enqueue a new allocated balloon * page before definitively removing the page from the guest system. * - * Drivers must not call balloon_page_enqueue on pages that have been pushed to - * a list with balloon_page_push before removing them with balloon_page_pop. To - * enqueue a list of pages, use balloon_page_list_enqueue instead. + * Drivers must not enqueue pages while page->lru is still in + * use, and must not use page->lru until a page was unqueued again. */ void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, struct page *page) -- cgit v1.2.3 From 9d792ef33e40c8511b00a38e5e2e63f20bd2d815 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:22 +0100 Subject: mm/balloon_compaction: fold balloon_mapping_gfp_mask() into balloon_page_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's just remove balloon_mapping_gfp_mask(). Link: https://lkml.kernel.org/r/20260119230133.3551867-15-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 7 ------- mm/balloon_compaction.c | 12 ++++++++---- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index e5451cf1f658..d1d473939897 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -106,13 +106,6 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, list_add(&page->lru, &balloon->pages); } -static inline gfp_t balloon_mapping_gfp_mask(void) -{ - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - return GFP_HIGHUSER_MOVABLE; - return GFP_HIGHUSER; -} - /* * balloon_page_finalize - prepare a balloon page that was removed from the * balloon list for release to the page allocator diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 5e1507a13a52..1843e168db3c 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -112,10 +112,14 @@ EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); */ struct page *balloon_page_alloc(void) { - struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN); - return page; + gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + gfp_flags |= GFP_HIGHUSER_MOVABLE; + else + gfp_flags |= GFP_HIGHUSER; + + return alloc_page(gfp_flags); } EXPORT_SYMBOL_GPL(balloon_page_alloc); -- cgit v1.2.3 From 03d6a2f68419b808d51ba39c84aedd6e9a6a92d8 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:23 +0100 Subject: mm/balloon_compaction: move internal helpers to balloon_compaction.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's move the helpers that are not required by drivers anymore. While at it, drop the doc of balloon_page_device() as it is trivial. [david@kernel.org: move balloon_page_device() under CONFIG_BALLOON_COMPACTION] Link: https://lkml.kernel.org/r/27f0adf1-54c1-4d99-8b7f-fd45574e7f41@kernel.org Link: https://lkml.kernel.org/r/20260119230133.3551867-16-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 44 -------------------------------------- mm/balloon_compaction.c | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index d1d473939897..eec8994056a4 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -75,48 +75,4 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) balloon->migratepage = NULL; balloon->adjust_managed_page_count = false; } - -#ifdef CONFIG_BALLOON_COMPACTION -/* - * balloon_page_device - get the b_dev_info descriptor for the balloon device - * that enqueues the given page. - */ -static inline struct balloon_dev_info *balloon_page_device(struct page *page) -{ - return (struct balloon_dev_info *)page_private(page); -} -#endif /* CONFIG_BALLOON_COMPACTION */ - -/* - * balloon_page_insert - insert a page into the balloon's page list and make - * the page->private assignment accordingly. - * @balloon : pointer to balloon device - * @page : page to be assigned as a 'balloon page' - * - * Caller must ensure the balloon_pages_lock is held. - */ -static inline void balloon_page_insert(struct balloon_dev_info *balloon, - struct page *page) -{ - __SetPageOffline(page); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { - SetPageMovableOps(page); - set_page_private(page, (unsigned long)balloon); - } - list_add(&page->lru, &balloon->pages); -} - -/* - * balloon_page_finalize - prepare a balloon page that was removed from the - * balloon list for release to the page allocator - * @page: page to be released to the page allocator - * - * Caller must ensure the balloon_pages_lock is held. - */ -static inline void balloon_page_finalize(struct page *page) -{ - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - set_page_private(page, 0); - /* PageOffline is sticky until the page is freed to the buddy. */ -} #endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 1843e168db3c..30fa7ee8e1f3 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -17,6 +17,39 @@ */ static DEFINE_SPINLOCK(balloon_pages_lock); +/* + * balloon_page_insert - insert a page into the balloon's page list and make + * the page->private assignment accordingly. + * @balloon : pointer to balloon device + * @page : page to be assigned as a 'balloon page' + * + * Caller must ensure the balloon_pages_lock is held. + */ +static void balloon_page_insert(struct balloon_dev_info *balloon, + struct page *page) +{ + __SetPageOffline(page); + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { + SetPageMovableOps(page); + set_page_private(page, (unsigned long)balloon); + } + list_add(&page->lru, &balloon->pages); +} + +/* + * balloon_page_finalize - prepare a balloon page that was removed from the + * balloon list for release to the page allocator + * @page: page to be released to the page allocator + * + * Caller must ensure the balloon_pages_lock is held. 
+ */ +static void balloon_page_finalize(struct page *page) +{ + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + set_page_private(page, 0); + /* PageOffline is sticky until the page is freed to the buddy. */ +} + static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, struct page *page) { @@ -194,6 +227,10 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION +static struct balloon_dev_info *balloon_page_device(struct page *page) +{ + return (struct balloon_dev_info *)page_private(page); +} static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) -- cgit v1.2.3 From 92ec9260d53b245d3266f74ecc66d8ea47aaec3d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:26 +0100 Subject: mm/balloon_compaction: remove "extern" from functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding "extern" to functions is frowned-upon. Let's just get rid of it for all functions here. Link: https://lkml.kernel.org/r/20260119230133.3551867-19-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index eec8994056a4..7757e0e314fd 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -59,14 +59,14 @@ struct balloon_dev_info { bool adjust_managed_page_count; }; -extern struct page *balloon_page_alloc(void); -extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, - struct page *page); -extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); -extern size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, - struct list_head *pages); -extern size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, - struct list_head *pages, size_t n_req_pages); +struct page *balloon_page_alloc(void); +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page); +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); +size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, + struct list_head *pages); +size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, + struct list_head *pages, size_t n_req_pages); static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) { -- cgit v1.2.3 From 25b48b4cdf912f70998336b861a4bf767ee3d332 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:28 +0100 Subject: mm: rename balloon_compaction.(c|h) to balloon.(c|h) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even without CONFIG_BALLOON_COMPACTION this infrastructure implements basic list and page management for a memory balloon. 
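For illustration, a minimal sketch of how a driver can consume this basic interface after the rename; the demo_* names are hypothetical and the per-page hypervisor communication a real driver (such as virtio-balloon) performs is omitted:

#include <linux/balloon.h>
#include <linux/gfp.h>

static struct balloon_dev_info demo_balloon;

/* Inflate by one page: allocate it and hand it over to the balloon core. */
static int demo_inflate_one(void)
{
	struct page *page = balloon_page_alloc();

	if (!page)
		return -ENOMEM;
	balloon_page_enqueue(&demo_balloon, page);
	return 0;
}

/* Deflate by one page: take it back from the balloon core and free it. */
static void demo_deflate_one(void)
{
	struct page *page = balloon_page_dequeue(&demo_balloon);

	/* NULL is possible while pages are temporarily isolated for migration. */
	if (page)
		__free_page(page);
}

static void demo_balloon_setup(void)
{
	balloon_devinfo_init(&demo_balloon);
	/* A driver supporting migration would also set .migratepage here. */
}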
Link: https://lkml.kernel.org/r/20260119230133.3551867-21-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/core-api/mm-api.rst | 2 +- MAINTAINERS | 4 +- arch/powerpc/platforms/pseries/cmm.c | 2 +- drivers/misc/vmw_balloon.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- include/linux/balloon.h | 77 ++++++++ include/linux/balloon_compaction.h | 78 -------- mm/Makefile | 2 +- mm/balloon.c | 344 ++++++++++++++++++++++++++++++++++ mm/balloon_compaction.c | 345 ----------------------------------- 10 files changed, 428 insertions(+), 430 deletions(-) create mode 100644 include/linux/balloon.h delete mode 100644 include/linux/balloon_compaction.h create mode 100644 mm/balloon.c delete mode 100644 mm/balloon_compaction.c (limited to 'include') diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 68193a4cfcf5..aabdd3cba58e 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -130,5 +130,5 @@ More Memory Management Functions .. kernel-doc:: mm/vmscan.c .. kernel-doc:: mm/memory_hotplug.c .. kernel-doc:: mm/mmu_notifier.c -.. kernel-doc:: mm/balloon_compaction.c +.. kernel-doc:: mm/balloon.c .. kernel-doc:: mm/huge_memory.c diff --git a/MAINTAINERS b/MAINTAINERS index ebc2f1bc0ade..a4535ec654dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27546,9 +27546,9 @@ M: David Hildenbrand L: virtualization@lists.linux.dev S: Maintained F: drivers/virtio/virtio_balloon.c -F: include/linux/balloon_compaction.h +F: include/linux/balloon.h F: include/uapi/linux/virtio_balloon.h -F: mm/balloon_compaction.c +F: mm/balloon.c VIRTIO BLOCK AND SCSI DRIVERS M: "Michael S. Tsirkin" diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 7fd8b3d7e763..7a3c4922685a 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 53e9335b6718..7fd3f709108c 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 6ae00de78b61..de8041c3285a 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/balloon.h b/include/linux/balloon.h new file mode 100644 index 000000000000..82585542300d --- /dev/null +++ b/include/linux/balloon.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common interface for implementing a memory balloon, including support + * for migration of pages inflated in a memory balloon. + * + * Balloon page migration makes use of the general "movable_ops page migration" + * feature. + * + * page->private is used to reference the responsible balloon device. 
+ * That these pages have movable_ops, and which movable_ops apply, + * is derived from the page type (PageOffline()) combined with the + * PG_movable_ops flag (PageMovableOps()). + * + * Once the page type and the PG_movable_ops are set, migration code + * can initiate page isolation by invoking the + * movable_operations()->isolate_page() callback + * + * As long as page->private is set, the page is either on the balloon list + * or isolated for migration. If page->private is not set, the page is + * either still getting inflated, or was deflated to be freed by the balloon + * driver soon. Isolation is impossible in both cases. + * + * As the page isolation scanning step a compaction thread does is a lockless + * procedure (from a page standpoint), it might bring some racy situations while + * performing balloon page compaction. In order to sort out these racy scenarios + * and safely perform balloon's page compaction and migration we must, always, + * ensure following these simple rules: + * + * i. Inflation/deflation must set/clear page->private under the + * balloon_pages_lock + * + * ii. isolation or dequeueing procedure must remove the page from balloon + * device page list under balloon_pages_lock + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini + */ +#ifndef _LINUX_BALLOON_H +#define _LINUX_BALLOON_H +#include +#include +#include +#include +#include +#include + +/* + * Balloon device information descriptor. + * This struct is used to allow the common balloon compaction interface + * procedures to find the proper balloon device holding memory pages they'll + * have to cope for page compaction / migration, as well as it serves the + * balloon driver as a page book-keeper for its registered balloon devices. + */ +struct balloon_dev_info { + unsigned long isolated_pages; /* # of isolated pages for migration */ + struct list_head pages; /* Pages enqueued & handled to Host */ + int (*migratepage)(struct balloon_dev_info *, struct page *newpage, + struct page *page, enum migrate_mode mode); + bool adjust_managed_page_count; +}; + +struct page *balloon_page_alloc(void); +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page); +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); +size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, + struct list_head *pages); +size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, + struct list_head *pages, size_t n_req_pages); + +static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) +{ + balloon->isolated_pages = 0; + INIT_LIST_HEAD(&balloon->pages); + balloon->migratepage = NULL; + balloon->adjust_managed_page_count = false; +} +#endif /* _LINUX_BALLOON_H */ diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h deleted file mode 100644 index 7757e0e314fd..000000000000 --- a/include/linux/balloon_compaction.h +++ /dev/null @@ -1,78 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * include/linux/balloon_compaction.h - * - * Common interface definitions for making balloon pages movable by compaction. - * - * Balloon page migration makes use of the general "movable_ops page migration" - * feature. - * - * page->private is used to reference the responsible balloon device. - * That these pages have movable_ops, and which movable_ops apply, - * is derived from the page type (PageOffline()) combined with the - * PG_movable_ops flag (PageMovableOps()). 
- * - * Once the page type and the PG_movable_ops are set, migration code - * can initiate page isolation by invoking the - * movable_operations()->isolate_page() callback - * - * As long as page->private is set, the page is either on the balloon list - * or isolated for migration. If page->private is not set, the page is - * either still getting inflated, or was deflated to be freed by the balloon - * driver soon. Isolation is impossible in both cases. - * - * As the page isolation scanning step a compaction thread does is a lockless - * procedure (from a page standpoint), it might bring some racy situations while - * performing balloon page compaction. In order to sort out these racy scenarios - * and safely perform balloon's page compaction and migration we must, always, - * ensure following these simple rules: - * - * i. Inflation/deflation must set/clear page->private under the - * balloon_pages_lock - * - * ii. isolation or dequeueing procedure must remove the page from balloon - * device page list under balloon_pages_lock - * - * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini - */ -#ifndef _LINUX_BALLOON_COMPACTION_H -#define _LINUX_BALLOON_COMPACTION_H -#include -#include -#include -#include -#include -#include - -/* - * Balloon device information descriptor. - * This struct is used to allow the common balloon compaction interface - * procedures to find the proper balloon device holding memory pages they'll - * have to cope for page compaction / migration, as well as it serves the - * balloon driver as a page book-keeper for its registered balloon devices. - */ -struct balloon_dev_info { - unsigned long isolated_pages; /* # of isolated pages for migration */ - struct list_head pages; /* Pages enqueued & handled to Host */ - int (*migratepage)(struct balloon_dev_info *, struct page *newpage, - struct page *page, enum migrate_mode mode); - bool adjust_managed_page_count; -}; - -struct page *balloon_page_alloc(void); -void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, - struct page *page); -struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); -size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, - struct list_head *pages); -size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, - struct list_head *pages, size_t n_req_pages); - -static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) -{ - balloon->isolated_pages = 0; - INIT_LIST_HEAD(&balloon->pages); - balloon->migratepage = NULL; - balloon->adjust_managed_page_count = false; -} -#endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/mm/Makefile b/mm/Makefile index 9175f8cc6565..1e31e0a528dc 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -122,7 +122,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_MEMORY_BALLOON) += balloon.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o diff --git a/mm/balloon.c b/mm/balloon.c new file mode 100644 index 000000000000..0f068b97e5d8 --- /dev/null +++ b/mm/balloon.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common interface for implementing a memory balloon, including support + * for migration of pages inflated in a memory balloon. + * + * Copyright (C) 2012, Red Hat, Inc. 
Rafael Aquini + */ +#include +#include +#include +#include + +/* + * Lock protecting the balloon_dev_info of all devices. We don't really + * expect more than one device. + */ +static DEFINE_SPINLOCK(balloon_pages_lock); + +/** + * balloon_page_insert - insert a page into the balloon's page list and make + * the page->private assignment accordingly. + * @balloon : pointer to balloon device + * @page : page to be assigned as a 'balloon page' + * + * Caller must ensure the balloon_pages_lock is held. + */ +static void balloon_page_insert(struct balloon_dev_info *balloon, + struct page *page) +{ + lockdep_assert_held(&balloon_pages_lock); + __SetPageOffline(page); + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { + SetPageMovableOps(page); + set_page_private(page, (unsigned long)balloon); + } + list_add(&page->lru, &balloon->pages); +} + +/** + * balloon_page_finalize - prepare a balloon page that was removed from the + * balloon list for release to the page allocator + * @page: page to be released to the page allocator + * + * Caller must ensure the balloon_pages_lock is held. + */ +static void balloon_page_finalize(struct page *page) +{ + lockdep_assert_held(&balloon_pages_lock); + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + set_page_private(page, 0); + /* PageOffline is sticky until the page is freed to the buddy. */ +} + +static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, + struct page *page) +{ + balloon_page_insert(b_dev_info, page); + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, -1); + __count_vm_event(BALLOON_INFLATE); + inc_node_page_state(page, NR_BALLOON_PAGES); +} + +/** + * balloon_page_list_enqueue() - inserts a list of pages into the balloon page + * list. + * @b_dev_info: balloon device descriptor where we will insert a new page to + * @pages: pages to enqueue - allocated using balloon_page_alloc. + * + * Driver must call this function to properly enqueue balloon pages before + * definitively removing them from the guest system. + * + * Return: number of pages that were enqueued. + */ +size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, + struct list_head *pages) +{ + struct page *page, *tmp; + unsigned long flags; + size_t n_pages = 0; + + spin_lock_irqsave(&balloon_pages_lock, flags); + list_for_each_entry_safe(page, tmp, pages, lru) { + list_del(&page->lru); + balloon_page_enqueue_one(b_dev_info, page); + n_pages++; + } + spin_unlock_irqrestore(&balloon_pages_lock, flags); + return n_pages; +} +EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); + +/** + * balloon_page_list_dequeue() - removes pages from balloon's page list and + * returns a list of the pages. + * @b_dev_info: balloon device descriptor where we will grab a page from. + * @pages: pointer to the list of pages that would be returned to the caller. + * @n_req_pages: number of requested pages. + * + * Driver must call this function to properly de-allocate a previous enlisted + * balloon pages before definitively releasing it back to the guest system. + * This function tries to remove @n_req_pages from the ballooned pages and + * return them to the caller in the @pages list. + * + * Note that this function may fail to dequeue some pages even if the balloon + * isn't empty - since the page list can be temporarily empty due to compaction + * of isolated pages. + * + * Return: number of pages that were added to the @pages list. 
+ */ +size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, + struct list_head *pages, size_t n_req_pages) +{ + struct page *page, *tmp; + unsigned long flags; + size_t n_pages = 0; + + spin_lock_irqsave(&balloon_pages_lock, flags); + list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { + if (n_pages == n_req_pages) + break; + list_del(&page->lru); + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, 1); + balloon_page_finalize(page); + __count_vm_event(BALLOON_DEFLATE); + list_add(&page->lru, pages); + dec_node_page_state(page, NR_BALLOON_PAGES); + n_pages++; + } + spin_unlock_irqrestore(&balloon_pages_lock, flags); + + return n_pages; +} +EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); + +/** + * balloon_page_alloc - allocates a new page for insertion into the balloon + * page list. + * + * Driver must call this function to properly allocate a new balloon page. + * Driver must call balloon_page_enqueue before definitively removing the page + * from the guest system. + * + * Return: struct page for the allocated page or NULL on allocation failure. + */ +struct page *balloon_page_alloc(void) +{ + gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + gfp_flags |= GFP_HIGHUSER_MOVABLE; + else + gfp_flags |= GFP_HIGHUSER; + + return alloc_page(gfp_flags); +} +EXPORT_SYMBOL_GPL(balloon_page_alloc); + +/** + * balloon_page_enqueue - inserts a new page into the balloon page list. + * + * @b_dev_info: balloon device descriptor where we will insert a new page + * @page: new page to enqueue - allocated using balloon_page_alloc. + * + * Drivers must call this function to properly enqueue a new allocated balloon + * page before definitively removing the page from the guest system. + * + * Drivers must not enqueue pages while page->lru is still in + * use, and must not use page->lru until a page was unqueued again. + */ +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page) +{ + unsigned long flags; + + spin_lock_irqsave(&balloon_pages_lock, flags); + balloon_page_enqueue_one(b_dev_info, page); + spin_unlock_irqrestore(&balloon_pages_lock, flags); +} +EXPORT_SYMBOL_GPL(balloon_page_enqueue); + +/** + * balloon_page_dequeue - removes a page from balloon's page list and returns + * its address to allow the driver to release the page. + * @b_dev_info: balloon device descriptor where we will grab a page from. + * + * Driver must call this function to properly dequeue a previously enqueued page + * before definitively releasing it back to the guest system. + * + * Caller must perform its own accounting to ensure that this + * function is called only if some pages are actually enqueued. + * + * Note that this function may fail to dequeue some pages even if there are + * some enqueued pages - since the page list can be temporarily empty due to + * the compaction of isolated pages. + * + * TODO: remove the caller accounting requirements, and allow caller to wait + * until all pages can be dequeued. + * + * Return: struct page for the dequeued page, or NULL if no page was dequeued. 
+ */ +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) +{ + unsigned long flags; + LIST_HEAD(pages); + int n_pages; + + n_pages = balloon_page_list_dequeue(b_dev_info, &pages, 1); + + if (n_pages != 1) { + /* + * If we are unable to dequeue a balloon page because the page + * list is empty and there are no isolated pages, then something + * went out of track and some balloon pages are lost. + * BUG() here, otherwise the balloon driver may get stuck in + * an infinite loop while attempting to release all its pages. + */ + spin_lock_irqsave(&balloon_pages_lock, flags); + if (unlikely(list_empty(&b_dev_info->pages) && + !b_dev_info->isolated_pages)) + BUG(); + spin_unlock_irqrestore(&balloon_pages_lock, flags); + return NULL; + } + return list_first_entry(&pages, struct page, lru); +} +EXPORT_SYMBOL_GPL(balloon_page_dequeue); + +#ifdef CONFIG_BALLOON_COMPACTION +static struct balloon_dev_info *balloon_page_device(struct page *page) +{ + return (struct balloon_dev_info *)page_private(page); +} + +static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) + +{ + struct balloon_dev_info *b_dev_info; + unsigned long flags; + + spin_lock_irqsave(&balloon_pages_lock, flags); + b_dev_info = balloon_page_device(page); + if (!b_dev_info) { + /* + * The page already got deflated and removed from the + * balloon list. + */ + spin_unlock_irqrestore(&balloon_pages_lock, flags); + return false; + } + list_del(&page->lru); + b_dev_info->isolated_pages++; + spin_unlock_irqrestore(&balloon_pages_lock, flags); + + return true; +} + +static void balloon_page_putback(struct page *page) +{ + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + + /* + * When we isolated the page, the page was still inflated in a balloon + * device. As isolated balloon pages cannot get deflated, we still have + * a balloon device here. + */ + if (WARN_ON_ONCE(!b_dev_info)) + return; + + spin_lock_irqsave(&balloon_pages_lock, flags); + list_add(&page->lru, &b_dev_info->pages); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&balloon_pages_lock, flags); +} + +static int balloon_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + int rc; + + /* + * When we isolated the page, the page was still inflated in a balloon + * device. As isolated balloon pages cannot get deflated, we still have + * a balloon device here. + */ + if (WARN_ON_ONCE(!b_dev_info)) + return -EAGAIN; + + rc = b_dev_info->migratepage(b_dev_info, newpage, page, mode); + if (rc < 0 && rc != -ENOENT) + return rc; + + spin_lock_irqsave(&balloon_pages_lock, flags); + if (!rc) { + /* Insert the new page into the balloon list. */ + get_page(newpage); + balloon_page_insert(b_dev_info, newpage); + __count_vm_event(BALLOON_MIGRATE); + + if (b_dev_info->adjust_managed_page_count && + page_zone(page) != page_zone(newpage)) { + /* + * When we migrate a page to a different zone we + * have to fixup the count of both involved zones. + */ + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } + } else { + /* Old page was deflated but new page not inflated. */ + __count_vm_event(BALLOON_DEFLATE); + + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, 1); + } + + b_dev_info->isolated_pages--; + + /* Free the now-deflated page we isolated in balloon_page_isolate(). 
*/ + balloon_page_finalize(page); + spin_unlock_irqrestore(&balloon_pages_lock, flags); + + put_page(page); + + return 0; +} + +static const struct movable_operations balloon_mops = { + .migrate_page = balloon_page_migrate, + .isolate_page = balloon_page_isolate, + .putback_page = balloon_page_putback, +}; + +static int __init balloon_init(void) +{ + return set_movable_ops(&balloon_mops, PGTY_offline); +} +core_initcall(balloon_init); + +#endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c deleted file mode 100644 index 7e37a7af9ef0..000000000000 --- a/mm/balloon_compaction.c +++ /dev/null @@ -1,345 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * mm/balloon_compaction.c - * - * Common interface for making balloon pages movable by compaction. - * - * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini - */ -#include -#include -#include -#include - -/* - * Lock protecting the balloon_dev_info of all devices. We don't really - * expect more than one device. - */ -static DEFINE_SPINLOCK(balloon_pages_lock); - -/** - * balloon_page_insert - insert a page into the balloon's page list and make - * the page->private assignment accordingly. - * @balloon : pointer to balloon device - * @page : page to be assigned as a 'balloon page' - * - * Caller must ensure the balloon_pages_lock is held. - */ -static void balloon_page_insert(struct balloon_dev_info *balloon, - struct page *page) -{ - lockdep_assert_held(&balloon_pages_lock); - __SetPageOffline(page); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { - SetPageMovableOps(page); - set_page_private(page, (unsigned long)balloon); - } - list_add(&page->lru, &balloon->pages); -} - -/** - * balloon_page_finalize - prepare a balloon page that was removed from the - * balloon list for release to the page allocator - * @page: page to be released to the page allocator - * - * Caller must ensure the balloon_pages_lock is held. - */ -static void balloon_page_finalize(struct page *page) -{ - lockdep_assert_held(&balloon_pages_lock); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - set_page_private(page, 0); - /* PageOffline is sticky until the page is freed to the buddy. */ -} - -static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, - struct page *page) -{ - balloon_page_insert(b_dev_info, page); - if (b_dev_info->adjust_managed_page_count) - adjust_managed_page_count(page, -1); - __count_vm_event(BALLOON_INFLATE); - inc_node_page_state(page, NR_BALLOON_PAGES); -} - -/** - * balloon_page_list_enqueue() - inserts a list of pages into the balloon page - * list. - * @b_dev_info: balloon device descriptor where we will insert a new page to - * @pages: pages to enqueue - allocated using balloon_page_alloc. - * - * Driver must call this function to properly enqueue balloon pages before - * definitively removing them from the guest system. - * - * Return: number of pages that were enqueued. - */ -size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, - struct list_head *pages) -{ - struct page *page, *tmp; - unsigned long flags; - size_t n_pages = 0; - - spin_lock_irqsave(&balloon_pages_lock, flags); - list_for_each_entry_safe(page, tmp, pages, lru) { - list_del(&page->lru); - balloon_page_enqueue_one(b_dev_info, page); - n_pages++; - } - spin_unlock_irqrestore(&balloon_pages_lock, flags); - return n_pages; -} -EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); - -/** - * balloon_page_list_dequeue() - removes pages from balloon's page list and - * returns a list of the pages. 
- * @b_dev_info: balloon device descriptor where we will grab a page from. - * @pages: pointer to the list of pages that would be returned to the caller. - * @n_req_pages: number of requested pages. - * - * Driver must call this function to properly de-allocate a previous enlisted - * balloon pages before definitively releasing it back to the guest system. - * This function tries to remove @n_req_pages from the ballooned pages and - * return them to the caller in the @pages list. - * - * Note that this function may fail to dequeue some pages even if the balloon - * isn't empty - since the page list can be temporarily empty due to compaction - * of isolated pages. - * - * Return: number of pages that were added to the @pages list. - */ -size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, - struct list_head *pages, size_t n_req_pages) -{ - struct page *page, *tmp; - unsigned long flags; - size_t n_pages = 0; - - spin_lock_irqsave(&balloon_pages_lock, flags); - list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { - if (n_pages == n_req_pages) - break; - list_del(&page->lru); - if (b_dev_info->adjust_managed_page_count) - adjust_managed_page_count(page, 1); - balloon_page_finalize(page); - __count_vm_event(BALLOON_DEFLATE); - list_add(&page->lru, pages); - dec_node_page_state(page, NR_BALLOON_PAGES); - n_pages++; - } - spin_unlock_irqrestore(&balloon_pages_lock, flags); - - return n_pages; -} -EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); - -/** - * balloon_page_alloc - allocates a new page for insertion into the balloon - * page list. - * - * Driver must call this function to properly allocate a new balloon page. - * Driver must call balloon_page_enqueue before definitively removing the page - * from the guest system. - * - * Return: struct page for the allocated page or NULL on allocation failure. - */ -struct page *balloon_page_alloc(void) -{ - gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - gfp_flags |= GFP_HIGHUSER_MOVABLE; - else - gfp_flags |= GFP_HIGHUSER; - - return alloc_page(gfp_flags); -} -EXPORT_SYMBOL_GPL(balloon_page_alloc); - -/** - * balloon_page_enqueue - inserts a new page into the balloon page list. - * - * @b_dev_info: balloon device descriptor where we will insert a new page - * @page: new page to enqueue - allocated using balloon_page_alloc. - * - * Drivers must call this function to properly enqueue a new allocated balloon - * page before definitively removing the page from the guest system. - * - * Drivers must not enqueue pages while page->lru is still in - * use, and must not use page->lru until a page was unqueued again. - */ -void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, - struct page *page) -{ - unsigned long flags; - - spin_lock_irqsave(&balloon_pages_lock, flags); - balloon_page_enqueue_one(b_dev_info, page); - spin_unlock_irqrestore(&balloon_pages_lock, flags); -} -EXPORT_SYMBOL_GPL(balloon_page_enqueue); - -/** - * balloon_page_dequeue - removes a page from balloon's page list and returns - * its address to allow the driver to release the page. - * @b_dev_info: balloon device descriptor where we will grab a page from. - * - * Driver must call this function to properly dequeue a previously enqueued page - * before definitively releasing it back to the guest system. - * - * Caller must perform its own accounting to ensure that this - * function is called only if some pages are actually enqueued. 
- * - * Note that this function may fail to dequeue some pages even if there are - * some enqueued pages - since the page list can be temporarily empty due to - * the compaction of isolated pages. - * - * TODO: remove the caller accounting requirements, and allow caller to wait - * until all pages can be dequeued. - * - * Return: struct page for the dequeued page, or NULL if no page was dequeued. - */ -struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) -{ - unsigned long flags; - LIST_HEAD(pages); - int n_pages; - - n_pages = balloon_page_list_dequeue(b_dev_info, &pages, 1); - - if (n_pages != 1) { - /* - * If we are unable to dequeue a balloon page because the page - * list is empty and there are no isolated pages, then something - * went out of track and some balloon pages are lost. - * BUG() here, otherwise the balloon driver may get stuck in - * an infinite loop while attempting to release all its pages. - */ - spin_lock_irqsave(&balloon_pages_lock, flags); - if (unlikely(list_empty(&b_dev_info->pages) && - !b_dev_info->isolated_pages)) - BUG(); - spin_unlock_irqrestore(&balloon_pages_lock, flags); - return NULL; - } - return list_first_entry(&pages, struct page, lru); -} -EXPORT_SYMBOL_GPL(balloon_page_dequeue); - -#ifdef CONFIG_BALLOON_COMPACTION -static struct balloon_dev_info *balloon_page_device(struct page *page) -{ - return (struct balloon_dev_info *)page_private(page); -} - -static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) - -{ - struct balloon_dev_info *b_dev_info; - unsigned long flags; - - spin_lock_irqsave(&balloon_pages_lock, flags); - b_dev_info = balloon_page_device(page); - if (!b_dev_info) { - /* - * The page already got deflated and removed from the - * balloon list. - */ - spin_unlock_irqrestore(&balloon_pages_lock, flags); - return false; - } - list_del(&page->lru); - b_dev_info->isolated_pages++; - spin_unlock_irqrestore(&balloon_pages_lock, flags); - - return true; -} - -static void balloon_page_putback(struct page *page) -{ - struct balloon_dev_info *b_dev_info = balloon_page_device(page); - unsigned long flags; - - /* - * When we isolated the page, the page was still inflated in a balloon - * device. As isolated balloon pages cannot get deflated, we still have - * a balloon device here. - */ - if (WARN_ON_ONCE(!b_dev_info)) - return; - - spin_lock_irqsave(&balloon_pages_lock, flags); - list_add(&page->lru, &b_dev_info->pages); - b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&balloon_pages_lock, flags); -} - -static int balloon_page_migrate(struct page *newpage, struct page *page, - enum migrate_mode mode) -{ - struct balloon_dev_info *b_dev_info = balloon_page_device(page); - unsigned long flags; - int rc; - - /* - * When we isolated the page, the page was still inflated in a balloon - * device. As isolated balloon pages cannot get deflated, we still have - * a balloon device here. - */ - if (WARN_ON_ONCE(!b_dev_info)) - return -EAGAIN; - - rc = b_dev_info->migratepage(b_dev_info, newpage, page, mode); - if (rc < 0 && rc != -ENOENT) - return rc; - - spin_lock_irqsave(&balloon_pages_lock, flags); - if (!rc) { - /* Insert the new page into the balloon list. */ - get_page(newpage); - balloon_page_insert(b_dev_info, newpage); - __count_vm_event(BALLOON_MIGRATE); - - if (b_dev_info->adjust_managed_page_count && - page_zone(page) != page_zone(newpage)) { - /* - * When we migrate a page to a different zone we - * have to fixup the count of both involved zones. 
- */ - adjust_managed_page_count(page, 1); - adjust_managed_page_count(newpage, -1); - } - } else { - /* Old page was deflated but new page not inflated. */ - __count_vm_event(BALLOON_DEFLATE); - - if (b_dev_info->adjust_managed_page_count) - adjust_managed_page_count(page, 1); - } - - b_dev_info->isolated_pages--; - - /* Free the now-deflated page we isolated in balloon_page_isolate(). */ - balloon_page_finalize(page); - spin_unlock_irqrestore(&balloon_pages_lock, flags); - - put_page(page); - - return 0; -} - -static const struct movable_operations balloon_mops = { - .migrate_page = balloon_page_migrate, - .isolate_page = balloon_page_isolate, - .putback_page = balloon_page_putback, -}; - -static int __init balloon_init(void) -{ - return set_movable_ops(&balloon_mops, PGTY_offline); -} -core_initcall(balloon_init); - -#endif /* CONFIG_BALLOON_COMPACTION */ -- cgit v1.2.3 From cd8e95d80bc29b3c72288bd31e845b11755ef6a5 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:30 +0100 Subject: mm: rename CONFIG_BALLOON_COMPACTION to CONFIG_BALLOON_MIGRATION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While compaction depends on migration, the other direction is not the case. So let's make it clearer that this is all about migration of balloon pages. Adjust all comments/docs in the core to talk about "migration" instead of "compaction". While at it add some "/* CONFIG_BALLOON_MIGRATION */". Link: https://lkml.kernel.org/r/20260119230133.3551867-23-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/memory-hotplug.rst | 8 ++++---- arch/powerpc/platforms/pseries/cmm.c | 8 ++++---- drivers/misc/vmw_balloon.c | 8 ++++---- drivers/virtio/virtio_balloon.c | 6 +++--- include/linux/balloon.h | 12 ++++++------ include/linux/vm_event_item.h | 4 ++-- mm/Kconfig | 4 ++-- mm/balloon.c | 10 +++++----- mm/memory_hotplug.c | 4 ++-- mm/migrate.c | 2 +- mm/vmstat.c | 4 ++-- 11 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 6581558fd0d7..0207f8725142 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -603,11 +603,11 @@ ZONE_MOVABLE, especially when fine-tuning zone ratios: memory for metadata and page tables in the direct map; having a lot of offline memory blocks is not a typical case, though. -- Memory ballooning without balloon compaction is incompatible with - ZONE_MOVABLE. Only some implementations, such as virtio-balloon and - pseries CMM, fully support balloon compaction. +- Memory ballooning without support for balloon memory migration is incompatible + with ZONE_MOVABLE. Only some implementations, such as virtio-balloon and + pseries CMM, fully support balloon memory migration. - Further, the CONFIG_BALLOON_COMPACTION kernel configuration option might be + Further, the CONFIG_BALLOON_MIGRATION kernel configuration option might be disabled. 
In that case, balloon inflation will only perform unmovable allocations and silently create a zone imbalance, usually triggered by inflation requests from the hypervisor. diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 7a3c4922685a..8d83df12430f 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -494,7 +494,7 @@ static struct notifier_block cmm_mem_nb = { .priority = CMM_MEM_HOTPLUG_PRI }; -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION static int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -520,10 +520,10 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, plpar_page_set_active(page); return 0; } -#else /* CONFIG_BALLOON_COMPACTION */ +#else /* CONFIG_BALLOON_MIGRATION */ int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode); -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ /** * cmm_init - Module initialization @@ -540,7 +540,7 @@ static int cmm_init(void) balloon_devinfo_init(&b_dev_info); b_dev_info.adjust_managed_page_count = true; - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) b_dev_info.migratepage = cmm_migratepage; rc = register_oom_notifier(&cmm_oom_nb); diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 7fd3f709108c..216a16395968 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1719,7 +1719,7 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b) #endif /* CONFIG_DEBUG_FS */ -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION /** * vmballoon_migratepage() - migrates a balloon page. * @b_dev_info: balloon device information descriptor. @@ -1803,11 +1803,11 @@ out_unlock: up_read(&b->conf_sem); return ret; } -#else /* CONFIG_BALLOON_COMPACTION */ +#else /* CONFIG_BALLOON_MIGRATION */ int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode); -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ static int __init vmballoon_init(void) { @@ -1827,7 +1827,7 @@ static int __init vmballoon_init(void) return error; balloon_devinfo_init(&balloon.b_dev_info); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) balloon.b_dev_info.migratepage = vmballoon_migratepage; INIT_LIST_HEAD(&balloon.huge_pages); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index de8041c3285a..4e549abe59ff 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -802,7 +802,7 @@ static void report_free_page_func(struct work_struct *work) } } -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION /* * virtballoon_migratepage - perform the balloon page migration on behalf of * a compaction thread. 
(called under page lock) @@ -851,7 +851,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, mutex_unlock(&vb->balloon_lock); return 0; } -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ static unsigned long shrink_free_pages(struct virtio_balloon *vb, unsigned long pages_to_free) @@ -948,7 +948,7 @@ static int virtballoon_probe(struct virtio_device *vdev) if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) vb->vb_dev_info.adjust_managed_page_count = true; -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION vb->vb_dev_info.migratepage = virtballoon_migratepage; #endif if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { diff --git a/include/linux/balloon.h b/include/linux/balloon.h index 82585542300d..ca5b15150f42 100644 --- a/include/linux/balloon.h +++ b/include/linux/balloon.h @@ -22,9 +22,9 @@ * * As the page isolation scanning step a compaction thread does is a lockless * procedure (from a page standpoint), it might bring some racy situations while - * performing balloon page compaction. In order to sort out these racy scenarios - * and safely perform balloon's page compaction and migration we must, always, - * ensure following these simple rules: + * performing balloon page migration. In order to sort out these racy scenarios + * and safely perform balloon's page migration we must, always, ensure following + * these simple rules: * * i. Inflation/deflation must set/clear page->private under the * balloon_pages_lock @@ -45,10 +45,10 @@ /* * Balloon device information descriptor. - * This struct is used to allow the common balloon compaction interface + * This struct is used to allow the common balloon page migration interface * procedures to find the proper balloon device holding memory pages they'll - * have to cope for page compaction / migration, as well as it serves the - * balloon driver as a page book-keeper for its registered balloon devices. + * have to cope for page migration, as well as it serves the balloon driver as + * a page book-keeper for its registered balloon devices. 
*/ struct balloon_dev_info { unsigned long isolated_pages; /* # of isolated pages for migration */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 92f80b4d69a6..fca34d3473b6 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -125,9 +125,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_MEMORY_BALLOON BALLOON_INFLATE, BALLOON_DEFLATE, -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION BALLOON_MIGRATE, -#endif +#endif /* CONFIG_BALLOON_MIGRATION */ #endif #ifdef CONFIG_DEBUG_TLBFLUSH NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ diff --git a/mm/Kconfig b/mm/Kconfig index c5374f3cf1c8..cd6896c1ba7d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -597,8 +597,8 @@ config MEMORY_BALLOON bool # -# support for memory balloon compaction -config BALLOON_COMPACTION +# support for memory balloon page migration +config BALLOON_MIGRATION bool "Allow for balloon memory migration" default y depends on MIGRATION && MEMORY_BALLOON diff --git a/mm/balloon.c b/mm/balloon.c index 0f068b97e5d8..96a8f1e20bc6 100644 --- a/mm/balloon.c +++ b/mm/balloon.c @@ -29,7 +29,7 @@ static void balloon_page_insert(struct balloon_dev_info *balloon, { lockdep_assert_held(&balloon_pages_lock); __SetPageOffline(page); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) { SetPageMovableOps(page); set_page_private(page, (unsigned long)balloon); } @@ -46,7 +46,7 @@ static void balloon_page_insert(struct balloon_dev_info *balloon, static void balloon_page_finalize(struct page *page) { lockdep_assert_held(&balloon_pages_lock); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) set_page_private(page, 0); /* PageOffline is sticky until the page is freed to the buddy. */ } @@ -148,7 +148,7 @@ struct page *balloon_page_alloc(void) { gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) gfp_flags |= GFP_HIGHUSER_MOVABLE; else gfp_flags |= GFP_HIGHUSER; @@ -227,7 +227,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) } EXPORT_SYMBOL_GPL(balloon_page_dequeue); -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION static struct balloon_dev_info *balloon_page_device(struct page *page) { return (struct balloon_dev_info *)page_private(page); @@ -341,4 +341,4 @@ static int __init balloon_init(void) } core_initcall(balloon_init); -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 389989a28abe..bc805029da51 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -946,8 +946,8 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * We rely on "present pages" instead of "managed pages", as the latter is * highly unreliable and dynamic in virtualized environments, and does not * consider boot time allocations. For example, memory ballooning adjusts the - * managed pages when inflating/deflating the balloon, and balloon compaction - * can even migrate inflated pages between zones. + * managed pages when inflating/deflating the balloon, and balloon page + * migration can even migrate inflated pages between zones. 
* * Using "present pages" is better but some things to keep in mind are: * diff --git a/mm/migrate.c b/mm/migrate.c index 4750a2ba15fe..1bf2cf8c44dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -88,7 +88,7 @@ static const struct movable_operations *page_movable_ops(struct page *page) * back to the buddy. */ if (PageOffline(page)) - /* Only balloon compaction sets PageOffline pages movable. */ + /* Only balloon page migration sets PageOffline pages movable. */ return offline_movable_ops; if (PageZsmalloc(page)) return zsmalloc_movable_ops; diff --git a/mm/vmstat.c b/mm/vmstat.c index 6ae8891c9693..e96a344ab597 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1422,9 +1422,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_MEMORY_BALLOON [I(BALLOON_INFLATE)] = "balloon_inflate", [I(BALLOON_DEFLATE)] = "balloon_deflate", -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION [I(BALLOON_MIGRATE)] = "balloon_migrate", -#endif +#endif /* CONFIG_BALLOON_MIGRATION */ #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush", -- cgit v1.2.3 From 1421758055ca6028d3b758914863f38d434bf36b Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:31 +0100 Subject: mm: rename CONFIG_MEMORY_BALLOON -> CONFIG_BALLOON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's make it consistent with the naming of the files but also with the naming of CONFIG_BALLOON_MIGRATION. While at it, add a "/* CONFIG_BALLOON */". Link: https://lkml.kernel.org/r/20260119230133.3551867-24-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/Kconfig | 2 +- drivers/misc/Kconfig | 2 +- drivers/virtio/Kconfig | 2 +- include/linux/vm_event_item.h | 4 ++-- mm/Kconfig | 4 ++-- mm/Makefile | 2 +- mm/vmstat.c | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 3e042218d6cd..f7052b131a4c 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -120,7 +120,7 @@ config PPC_SMLPAR config CMM tristate "Collaborative memory management" depends on PPC_SMLPAR - select MEMORY_BALLOON + select BALLOON default y help Select this option, if you want to enable the kernel interface diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index d7d41b054b98..5cc79d1517af 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -410,7 +410,7 @@ config DS1682 config VMWARE_BALLOON tristate "VMware Balloon Driver" depends on VMWARE_VMCI && X86 && HYPERVISOR_GUEST - select MEMORY_BALLOON + select BALLOON help This is VMware physical memory management driver which acts like a "balloon" that can be inflated to reclaim physical pages diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 6db5235a7693..ce5bc0d9ea28 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -112,7 +112,7 @@ config VIRTIO_PMEM config VIRTIO_BALLOON 
tristate "Virtio balloon driver" depends on VIRTIO - select MEMORY_BALLOON + select BALLOON select PAGE_REPORTING help This driver supports increasing and decreasing the amount diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index fca34d3473b6..22a139f82d75 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -122,13 +122,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SWPOUT, THP_SWPOUT_FALLBACK, #endif -#ifdef CONFIG_MEMORY_BALLOON +#ifdef CONFIG_BALLOON BALLOON_INFLATE, BALLOON_DEFLATE, #ifdef CONFIG_BALLOON_MIGRATION BALLOON_MIGRATE, #endif /* CONFIG_BALLOON_MIGRATION */ -#endif +#endif /* CONFIG_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ diff --git a/mm/Kconfig b/mm/Kconfig index cd6896c1ba7d..d1d76ce7373e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -593,7 +593,7 @@ config SPLIT_PMD_PTLOCKS # # support for memory balloon -config MEMORY_BALLOON +config BALLOON bool # @@ -601,7 +601,7 @@ config MEMORY_BALLOON config BALLOON_MIGRATION bool "Allow for balloon memory migration" default y - depends on MIGRATION && MEMORY_BALLOON + depends on MIGRATION && BALLOON help Allow for migration of pages inflated in a memory balloon such that they can be allocated from memory areas only available for movable diff --git a/mm/Makefile b/mm/Makefile index 1e31e0a528dc..0d85b10dbdde 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -122,7 +122,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_MEMORY_BALLOON) += balloon.o +obj-$(CONFIG_BALLOON) += balloon.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o diff --git a/mm/vmstat.c b/mm/vmstat.c index e96a344ab597..0f64c898f79f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1419,13 +1419,13 @@ const char * const vmstat_text[] = { [I(THP_SWPOUT)] = "thp_swpout", [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback", #endif -#ifdef CONFIG_MEMORY_BALLOON +#ifdef CONFIG_BALLOON [I(BALLOON_INFLATE)] = "balloon_inflate", [I(BALLOON_DEFLATE)] = "balloon_deflate", #ifdef CONFIG_BALLOON_MIGRATION [I(BALLOON_MIGRATE)] = "balloon_migrate", #endif /* CONFIG_BALLOON_MIGRATION */ -#endif /* CONFIG_MEMORY_BALLOON */ +#endif /* CONFIG_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush", [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received", -- cgit v1.2.3 From 5898aa8f9a0b42fe1f65c7364010ab15ec5c38bf Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 14 Jan 2026 09:36:42 -0500 Subject: mm: fix OOM killer inaccuracy on large many-core systems Use the precise, albeit slower, RSS counter sums for the OOM killer task selection and console dumps. The approximated value is too imprecise on large many-core systems. The following rss tracking issues were noted by Sweet Tea Dorminy [1], which led to picking the wrong tasks as OOM kill targets: Recently, several internal services had an RSS usage regression as part of a kernel upgrade. Previously, they were on a pre-6.2 kernel and were able to read RSS statistics in a backup watchdog process to monitor and decide if they'd overrun their memory budget.
Now, however, a representative service with five threads, expected to use about a hundred MB of memory, on a 250-cpu machine had memory usage tens of megabytes different from the expected amount -- this constituted a significant percentage of inaccuracy, causing the watchdog to act. This was a result of commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") [1]. Previously, the memory error was bounded by 64*nr_threads pages, a very livable megabyte. Now, however, as a result of scheduler decisions moving the threads around the CPUs, the memory error could be as large as a gigabyte. This is a really tremendous inaccuracy for any few-threaded program on a large machine and impedes monitoring significantly. These stat counters are also used to make OOM killing decisions, so this additional inaccuracy could make a big difference in OOM situations -- either resulting in the wrong process being killed, or in less memory being returned from an OOM-kill than expected. Here is a (possibly incomplete) list of the prior approaches that were used or proposed, along with their downside: 1) Per-thread rss tracking: large error on many-thread processes. 2) Per-CPU counters: up to 12% slower for short-lived processes and 9% increased system time in make test workloads [1]. Moreover, the inaccuracy increases with O(n^2) with the number of CPUs. 3) Per-NUMA-node counters: requires atomics on fast-path (overhead), error is high with systems that have lots of NUMA nodes (32 times the number of NUMA nodes). commit 82241a83cd15 ("mm: fix the inaccurate memory statistics issue for users") introduced get_mm_counter_sum() for precise proc memory status queries for some proc files. The simple fix proposed here is to do the precise per-cpu counters sum every time a counter value needs to be read. This applies to the OOM killer task selection, oom task console dumps (printk). This change increases the latency introduced when the OOM killer executes in favor of doing a more precise OOM target task selection. Effectively, the OOM killer iterates on all tasks, for all relevant page types, for which the precise sum iterates on all possible CPUs. As a reference, here is the execution time of the OOM killer before/after the change: AMD EPYC 9654 96-Core (2 sockets) Within a KVM, configured with 256 logical cpus. | before | after | ----------------------------------|----------|----------| nr_processes=40 | 0.3 ms | 0.5 ms | nr_processes=10000 | 3.0 ms | 80.0 ms | Link: https://lkml.kernel.org/r/20260114143642.47333-1-mathieu.desnoyers@efficios.com Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") Link: https://lore.kernel.org/lkml/20250331223516.7810-2-sweettea-kernel@dorminy.me/ # [1] Signed-off-by: Mathieu Desnoyers Suggested-by: Michal Hocko Acked-by: Michal Hocko Reviewed-by: Baolin Wang Acked-by: Vlastimil Babka Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Martin Liu Cc: David Rientjes Cc: Shakeel Butt Cc: SeongJae Park Cc: Michal Hocko Cc: Johannes Weiner Cc: Sweet Tea Dorminy Cc: Lorenzo Stoakes Cc: "Liam R . 
Howlett" Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Christian Brauner Cc: Wei Yang Cc: David Hildenbrand Cc: Miaohe Lin Cc: Al Viro Cc: Yu Zhao Cc: Roman Gushchin Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Aboorva Devarajan Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ mm/oom_kill.c | 22 +++++++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index aacabf8a0b58..aa90719234f1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2906,6 +2906,13 @@ static inline unsigned long get_mm_rss(struct mm_struct *mm) get_mm_counter(mm, MM_SHMEMPAGES); } +static inline unsigned long get_mm_rss_sum(struct mm_struct *mm) +{ + return get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_ANONPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); +} + static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 94066316e3ec..5c6c95c169ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -228,7 +228,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + points = get_mm_rss_sum(p->mm) + get_mm_counter_sum(p->mm, MM_SWAPENTS) + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); @@ -402,10 +402,10 @@ static int dump_task(struct task_struct *p, void *arg) pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), - task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES), - get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), - get_mm_counter(task->mm, MM_SWAPENTS), + task->tgid, task->mm->total_vm, get_mm_rss_sum(task->mm), + get_mm_counter_sum(task->mm, MM_ANONPAGES), get_mm_counter_sum(task->mm, MM_FILEPAGES), + get_mm_counter_sum(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), + get_mm_counter_sum(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -604,9 +604,9 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, - K(get_mm_counter(mm, MM_ANONPAGES)), - K(get_mm_counter(mm, MM_FILEPAGES)), - K(get_mm_counter(mm, MM_SHMEMPAGES))); + K(get_mm_counter_sum(mm, MM_ANONPAGES)), + K(get_mm_counter_sum(mm, MM_FILEPAGES)), + K(get_mm_counter_sum(mm, MM_SHMEMPAGES))); out_finish: trace_finish_task_reaping(tsk->pid); out_unlock: @@ -960,9 +960,9 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) mark_oom_victim(victim); pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%d\n", message, task_pid_nr(victim), victim->comm, K(mm->total_vm), - K(get_mm_counter(mm, MM_ANONPAGES)), - K(get_mm_counter(mm, MM_FILEPAGES)), - K(get_mm_counter(mm, MM_SHMEMPAGES)), + K(get_mm_counter_sum(mm, MM_ANONPAGES)), + K(get_mm_counter_sum(mm, MM_FILEPAGES)), + K(get_mm_counter_sum(mm, MM_SHMEMPAGES)), from_kuid(&init_user_ns, task_uid(victim)), mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); task_unlock(victim); -- cgit v1.2.3 
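To make the error bound discussed in the patch above concrete: with batched per-CPU counters, every CPU may hold up to one batch worth of pages that has not yet been folded into the shared count, so an approximate read can be off by roughly nr_cpus * batch pages. The userspace model below only illustrates that effect and is not kernel code; the 256-CPU count and 32-page batch are assumptions, the "approximate" read plays the role of get_mm_rss() and the full fold plays the role of the get_mm_counter_sum()/get_mm_rss_sum() path used after this patch.

/*
 * Toy model of a batched per-CPU counter.  Each CPU accumulates a local
 * delta and folds it into the shared count only once it reaches BATCH,
 * mirroring how per-CPU batching trades accuracy for speed.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS	256
#define BATCH	32	/* assumed pages accumulated per CPU before folding */

static long shared_count;		/* cheap-to-read, folded value */
static long percpu_delta[NR_CPUS];	/* per-CPU remainders, not yet folded */

static void counter_add(int cpu, long pages)
{
	percpu_delta[cpu] += pages;
	if (labs(percpu_delta[cpu]) >= BATCH) {
		shared_count += percpu_delta[cpu];
		percpu_delta[cpu] = 0;
	}
}

static long read_approx(void)		/* analogous to get_mm_rss() */
{
	return shared_count;
}

static long read_precise(void)		/* analogous to get_mm_rss_sum() */
{
	long sum = shared_count;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += percpu_delta[cpu];
	return sum;
}

int main(void)
{
	/* Worst case: every CPU sits just below the fold threshold. */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		counter_add(cpu, BATCH - 1);

	printf("approx:  %ld pages\n", read_approx());
	printf("precise: %ld pages\n", read_precise());
	printf("max err: %d pages\n", NR_CPUS * BATCH);
	return 0;
}

With these assumed numbers the approximate read returns 0 pages while the precise sum returns 7936 pages, about 31 MB of drift at 4 KiB pages; since the kernel scales its batch with the CPU count, the real bound grows roughly quadratically, which is the O(n^2) behaviour called out above.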
From dc9fe9b7056a44ad65715def880e7d91d32c047f Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 20 Jan 2026 10:43:48 +0800 Subject: mm/vmscan: mitigate spurious kswapd_failures reset from direct reclaim Patch series "mm/vmscan: add tracepoint and reason for kswapd_failures reset", v4. Currently, kswapd_failures is reset in multiple places (kswapd, direct reclaim, PCP freeing, memory-tiers), but there's no way to trace when and why it was reset, making it difficult to debug memory reclaim issues. This patch: 1. Introduce kswapd_clear_hopeless() as a wrapper function to centralize kswapd_failures reset logic. 2. Introduce kswapd_test_hopeless() to encapsulate hopeless node checks, replacing all open-coded kswapd_failures comparisons. 3. Add kswapd_clear_hopeless_reason enum to distinguish reset sources: - KSWAPD_CLEAR_HOPELESS_KSWAPD: reset from kswapd context - KSWAPD_CLEAR_HOPELESS_DIRECT: reset from direct reclaim - KSWAPD_CLEAR_HOPELESS_PCP: reset from PCP page freeing - KSWAPD_CLEAR_HOPELESS_OTHER: reset from other paths 4. Add tracepoints for better observability: - mm_vmscan_kswapd_clear_hopeless: traces each reset with reason - mm_vmscan_kswapd_reclaim_fail: traces each kswapd reclaim failure Test results: $ trace-cmd record -e vmscan:mm_vmscan_kswapd_clear_hopeless -e vmscan:mm_vmscan_kswapd_reclaim_fail $ # generate memory pressure $ trace-cmd report cpus=4 kswapd0-71 [000] 27.216563: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1 kswapd0-71 [000] 27.217169: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2 kswapd0-71 [000] 27.217764: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3 kswapd0-71 [000] 27.218353: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4 kswapd0-71 [000] 27.218993: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5 kswapd0-71 [000] 27.219744: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6 kswapd0-71 [000] 27.220488: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7 kswapd0-71 [000] 27.221206: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8 kswapd0-71 [000] 27.221806: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9 kswapd0-71 [000] 27.222634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10 kswapd0-71 [000] 27.223286: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11 kswapd0-71 [000] 27.223894: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12 kswapd0-71 [000] 27.224712: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13 kswapd0-71 [000] 27.225424: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14 kswapd0-71 [000] 27.226082: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15 kswapd0-71 [000] 27.226810: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16 kswapd1-72 [002] 27.386869: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=1 kswapd1-72 [002] 27.387435: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=2 kswapd1-72 [002] 27.388016: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=3 kswapd1-72 [002] 27.388586: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=4 kswapd1-72 [002] 27.389155: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=5 kswapd1-72 [002] 27.389723: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=6 kswapd1-72 [002] 27.390292: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=7 kswapd1-72 [002] 27.392364: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=8 kswapd1-72 [002] 27.392934: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=9 kswapd1-72 [002] 27.393504: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=10 kswapd1-72 [002] 27.394073: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=11 kswapd1-72 [002] 27.394899: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=12 kswapd1-72 [002] 
27.395472: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=13 kswapd1-72 [002] 27.396055: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=14 kswapd1-72 [002] 27.396628: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=15 kswapd1-72 [002] 27.397199: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=16 kworker/u18:0-40 [002] 27.410151: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=DIRECT kswapd0-71 [000] 27.439454: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1 kswapd0-71 [000] 27.440048: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2 kswapd0-71 [000] 27.440634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3 kswapd0-71 [000] 27.441211: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4 kswapd0-71 [000] 27.441787: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5 kswapd0-71 [000] 27.442363: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6 kswapd0-71 [000] 27.443030: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7 kswapd0-71 [000] 27.443725: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8 kswapd0-71 [000] 27.444315: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9 kswapd0-71 [000] 27.444898: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10 kswapd0-71 [000] 27.445476: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11 kswapd0-71 [000] 27.446053: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12 kswapd0-71 [000] 27.446646: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13 kswapd0-71 [000] 27.447230: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14 kswapd0-71 [000] 27.447812: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15 kswapd0-71 [000] 27.448391: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16 ann-423 [003] 28.028285: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=PCP This patch (of 2): When kswapd fails to reclaim memory, kswapd_failures is incremented. Once it reaches MAX_RECLAIM_RETRIES, kswapd stops running to avoid futile reclaim attempts. However, any successful direct reclaim unconditionally resets kswapd_failures to 0, which can cause problems. We observed an issue in production on a multi-NUMA system where a process allocated large amounts of anonymous pages on a single NUMA node, causing its watermark to drop below high and evicting most file pages: $ numastat -m Per-node system memory usage (in MBs): Node 0 Node 1 Total --------------- --------------- --------------- MemTotal 128222.19 127983.91 256206.11 MemFree 1414.48 1432.80 2847.29 MemUsed 126807.71 126551.11 252358.82 SwapCached 0.00 0.00 0.00 Active 29017.91 25554.57 54572.48 Inactive 92749.06 95377.00 188126.06 Active(anon) 28998.96 23356.47 52355.43 Inactive(anon) 92685.27 87466.11 180151.39 Active(file) 18.95 2198.10 2217.05 Inactive(file) 63.79 7910.89 7974.68 With swap disabled, only file pages can be reclaimed. When kswapd is woken (e.g., via wake_all_kswapds()), it runs continuously but cannot raise free memory above the high watermark since reclaimable file pages are insufficient. Normally, kswapd would eventually stop after kswapd_failures reaches MAX_RECLAIM_RETRIES. However, containers on this machine have memory.high set in their cgroup. Business processes continuously trigger the high limit, causing frequent direct reclaim that keeps resetting kswapd_failures to 0. This prevents kswapd from ever stopping. The key insight is that direct reclaim triggered by cgroup memory.high performs aggressive scanning to throttle the allocating process. With sufficiently aggressive scanning, even hot pages will eventually be reclaimed, making direct reclaim "successful" at freeing some memory. 
However, this success does not mean the node has reached a balanced state - the freed memory may still be insufficient to bring free pages above the high watermark. Unconditionally resetting kswapd_failures in this case keeps kswapd alive indefinitely. The result is that kswapd runs endlessly. Unlike direct reclaim which only reclaims from the allocating cgroup, kswapd scans the entire node's memory. This causes hot file pages from all workloads on the node to be evicted, not just those from the cgroup triggering memory.high. These pages constantly refault, generating sustained heavy IO READ pressure across the entire system. Fix this by only resetting kswapd_failures when the node is actually balanced. This allows both kswapd and direct reclaim to clear kswapd_failures upon successful reclaim, but only when the reclaim actually resolves the memory pressure (i.e., the node becomes balanced). Link: https://lkml.kernel.org/r/20260120024402.387576-1-jiayuan.chen@linux.dev Link: https://lkml.kernel.org/r/20260120024402.387576-2-jiayuan.chen@linux.dev Signed-off-by: Jiayuan Chen Signed-off-by: Jiayuan Chen Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ mm/vmscan.c | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index eb3815fc94ad..8881198e85c6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1536,6 +1536,8 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); +void kswapd_try_clear_hopeless(struct pglist_data *pgdat, + unsigned int order, int highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index b33039000d6e..5d9b1bce6f01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5065,7 +5065,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx); } /****************************************************************************** @@ -6132,7 +6132,7 @@ again: * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } @@ -7391,6 +7391,24 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, wake_up_interruptible(&pgdat->kswapd_wait); } +static void kswapd_clear_hopeless(pg_data_t *pgdat) +{ + atomic_set(&pgdat->kswapd_failures, 0); +} + +/* + * Reset kswapd_failures only when the node is balanced. Without this + * check, successful direct reclaim (e.g., from cgroup memory.high + * throttling) can keep resetting kswapd_failures even when the node + * cannot be balanced, causing kswapd to run endlessly. 
+ */ +void kswapd_try_clear_hopeless(struct pglist_data *pgdat, + unsigned int order, int highest_zoneidx) +{ + if (pgdat_balanced(pgdat, order, highest_zoneidx)) + kswapd_clear_hopeless(pgdat); +} + #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of -- cgit v1.2.3 From a45088376d8a847a5e3b1982fcfceb41644e3b1d Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 20 Jan 2026 10:43:49 +0800 Subject: mm/vmscan: add tracepoint and reason for kswapd_failures reset Currently, kswapd_failures is reset in multiple places (kswapd, direct reclaim, PCP freeing, memory-tiers), but there's no way to trace when and why it was reset, making it difficult to debug memory reclaim issues. This patch: 1. Introduce kswapd_clear_hopeless() as a wrapper function to centralize kswapd_failures reset logic. 2. Introduce kswapd_test_hopeless() to encapsulate hopeless node checks, replacing all open-coded kswapd_failures comparisons. 3. Add kswapd_clear_hopeless_reason enum to distinguish reset sources: - KSWAPD_CLEAR_HOPELESS_KSWAPD: reset from kswapd context - KSWAPD_CLEAR_HOPELESS_DIRECT: reset from direct reclaim - KSWAPD_CLEAR_HOPELESS_PCP: reset from PCP page freeing - KSWAPD_CLEAR_HOPELESS_OTHER: reset from other paths 4. Add tracepoints for better observability: - mm_vmscan_kswapd_clear_hopeless: traces each reset with reason - mm_vmscan_kswapd_reclaim_fail: traces each kswapd reclaim failure Test results: $ trace-cmd record -e vmscan:mm_vmscan_kswapd_clear_hopeless -e vmscan:mm_vmscan_kswapd_reclaim_fail $ # generate memory pressure $ trace-cmd report cpus=4 kswapd0-71 [000] 27.216563: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1 kswapd0-71 [000] 27.217169: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2 kswapd0-71 [000] 27.217764: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3 kswapd0-71 [000] 27.218353: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4 kswapd0-71 [000] 27.218993: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5 kswapd0-71 [000] 27.219744: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6 kswapd0-71 [000] 27.220488: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7 kswapd0-71 [000] 27.221206: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8 kswapd0-71 [000] 27.221806: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9 kswapd0-71 [000] 27.222634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10 kswapd0-71 [000] 27.223286: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11 kswapd0-71 [000] 27.223894: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12 kswapd0-71 [000] 27.224712: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13 kswapd0-71 [000] 27.225424: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14 kswapd0-71 [000] 27.226082: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15 kswapd0-71 [000] 27.226810: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16 kswapd1-72 [002] 27.386869: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=1 kswapd1-72 [002] 27.387435: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=2 kswapd1-72 [002] 27.388016: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=3 kswapd1-72 [002] 27.388586: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=4 kswapd1-72 [002] 27.389155: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=5 kswapd1-72 [002] 27.389723: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=6 kswapd1-72 [002] 27.390292: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=7 kswapd1-72 [002] 27.392364: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=8 kswapd1-72 [002] 27.392934: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=9 kswapd1-72 [002] 
27.393504: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=10 kswapd1-72 [002] 27.394073: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=11 kswapd1-72 [002] 27.394899: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=12 kswapd1-72 [002] 27.395472: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=13 kswapd1-72 [002] 27.396055: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=14 kswapd1-72 [002] 27.396628: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=15 kswapd1-72 [002] 27.397199: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=16 kworker/u18:0-40 [002] 27.410151: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=DIRECT kswapd0-71 [000] 27.439454: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1 kswapd0-71 [000] 27.440048: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2 kswapd0-71 [000] 27.440634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3 kswapd0-71 [000] 27.441211: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4 kswapd0-71 [000] 27.441787: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5 kswapd0-71 [000] 27.442363: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6 kswapd0-71 [000] 27.443030: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7 kswapd0-71 [000] 27.443725: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8 kswapd0-71 [000] 27.444315: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9 kswapd0-71 [000] 27.444898: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10 kswapd0-71 [000] 27.445476: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11 kswapd0-71 [000] 27.446053: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12 kswapd0-71 [000] 27.446646: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13 kswapd0-71 [000] 27.447230: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14 kswapd0-71 [000] 27.447812: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15 kswapd0-71 [000] 27.448391: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16 ann-423 [003] 28.028285: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=PCP Link: https://lkml.kernel.org/r/20260120024402.387576-3-jiayuan.chen@linux.dev Signed-off-by: Jiayuan Chen Signed-off-by: Jiayuan Chen Acked-by: Shakeel Butt Suggested-by: Johannes Weiner Reviewed-by: Steven Rostedt (Google) [tracing] Cc: Axel Rasmussen Cc: Brendan Jackman Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 19 ++++++++++++---- include/trace/events/vmscan.h | 51 +++++++++++++++++++++++++++++++++++++++++++ mm/memory-tiers.c | 2 +- mm/page_alloc.c | 4 ++-- mm/show_mem.c | 3 +-- mm/vmscan.c | 29 ++++++++++++++++-------- mm/vmstat.c | 2 +- 7 files changed, 91 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8881198e85c6..3e51190a55e4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1534,16 +1534,27 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) #include void build_all_zonelists(pg_data_t *pgdat); -void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, - enum zone_type highest_zoneidx); -void kswapd_try_clear_hopeless(struct pglist_data *pgdat, - unsigned int order, int highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int 
alloc_flags); + +enum kswapd_clear_hopeless_reason { + KSWAPD_CLEAR_HOPELESS_OTHER = 0, + KSWAPD_CLEAR_HOPELESS_KSWAPD, + KSWAPD_CLEAR_HOPELESS_DIRECT, + KSWAPD_CLEAR_HOPELESS_PCP, +}; + +void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, + enum zone_type highest_zoneidx); +void kswapd_try_clear_hopeless(struct pglist_data *pgdat, + unsigned int order, int highest_zoneidx); +void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason); +bool kswapd_test_hopeless(pg_data_t *pgdat); + /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 490958fa10de..ea58e4656abf 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -40,6 +40,16 @@ {_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \ ) : "VMSCAN_THROTTLE_NONE" +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP); + +#define kswapd_clear_hopeless_reason_ops \ + {KSWAPD_CLEAR_HOPELESS_KSWAPD, "KSWAPD"}, \ + {KSWAPD_CLEAR_HOPELESS_DIRECT, "DIRECT"}, \ + {KSWAPD_CLEAR_HOPELESS_PCP, "PCP"}, \ + {KSWAPD_CLEAR_HOPELESS_OTHER, "OTHER"} #define trace_reclaim_flags(file) ( \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ @@ -535,6 +545,47 @@ TRACE_EVENT(mm_vmscan_throttled, __entry->usec_delayed, show_throttle_flags(__entry->reason)) ); + +TRACE_EVENT(mm_vmscan_kswapd_reclaim_fail, + + TP_PROTO(int nid, int failures), + + TP_ARGS(nid, failures), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, failures) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->failures = failures; + ), + + TP_printk("nid=%d failures=%d", + __entry->nid, __entry->failures) +); + +TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless, + + TP_PROTO(int nid, int reason), + + TP_ARGS(nid, reason), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, reason) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->reason = reason; + ), + + TP_printk("nid=%d reason=%s", + __entry->nid, + __print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops)) +); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 7ec442776574..0ae8bec86346 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -955,7 +955,7 @@ static ssize_t demotion_enabled_store(struct kobject *kobj, struct pglist_data *pgdat; for_each_online_pgdat(pgdat) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER); } return count; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e779b18168de..2c70ba9d5cc6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2945,9 +2945,9 @@ static bool free_frozen_page_commit(struct zone *zone, * 'hopeless node' to stay in that state for a while. Let * kswapd work again by resetting kswapd_failures. 
*/ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES && + if (kswapd_test_hopeless(pgdat) && next_memory_node(pgdat->node_id) < MAX_NUMNODES) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP); } return ret; } diff --git a/mm/show_mem.c b/mm/show_mem.c index 3a4b5207635d..24078ac3e6bc 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -278,8 +278,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - str_yes_no(atomic_read(&pgdat->kswapd_failures) >= - MAX_RECLAIM_RETRIES), + str_yes_no(kswapd_test_hopeless(pgdat)), K(node_page_state(pgdat, NR_BALLOON_PAGES))); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 5d9b1bce6f01..1d281174164e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -506,7 +506,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; /* @@ -6437,7 +6437,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) int i; bool wmark_ok; - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { @@ -6846,7 +6846,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { @@ -7111,8 +7111,11 @@ restart: * watermark_high at this point. We need to avoid increasing the * failure count to prevent the kswapd thread from stopping. */ - if (!sc.nr_reclaimed && !boosted) - atomic_inc(&pgdat->kswapd_failures); + if (!sc.nr_reclaimed && !boosted) { + int fail_cnt = atomic_inc_return(&pgdat->kswapd_failures); + /* kswapd context, low overhead to trace every failure */ + trace_mm_vmscan_kswapd_reclaim_fail(pgdat->node_id, fail_cnt); + } out: clear_reclaim_active(pgdat, highest_zoneidx); @@ -7371,7 +7374,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; /* Hopeless node, leave it to direct reclaim if possible */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || + if (kswapd_test_hopeless(pgdat) || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* @@ -7391,9 +7394,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, wake_up_interruptible(&pgdat->kswapd_wait); } -static void kswapd_clear_hopeless(pg_data_t *pgdat) +void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason) { - atomic_set(&pgdat->kswapd_failures, 0); + /* Only trace actual resets, not redundant zero-to-zero */ + if (atomic_xchg(&pgdat->kswapd_failures, 0)) + trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason); } /* @@ -7406,7 +7411,13 @@ void kswapd_try_clear_hopeless(struct pglist_data *pgdat, unsigned int order, int highest_zoneidx) { if (pgdat_balanced(pgdat, order, highest_zoneidx)) - kswapd_clear_hopeless(pgdat); + kswapd_clear_hopeless(pgdat, current_is_kswapd() ? 
+ KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT); +} + +bool kswapd_test_hopeless(pg_data_t *pgdat) +{ + return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES; } #ifdef CONFIG_HIBERNATION diff --git a/mm/vmstat.c b/mm/vmstat.c index 0f64c898f79f..23e176e1d09d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1840,7 +1840,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n start_pfn: %lu" "\n reserved_highatomic: %lu" "\n free_highatomic: %lu", - atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, + kswapd_test_hopeless(pgdat), zone->zone_start_pfn, zone->nr_reserved_highatomic, zone->nr_free_highatomic); -- cgit v1.2.3 From c83109e95c9d78e41b39e65b6490e511f4b8fba2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 12 Jan 2026 23:09:50 +0800 Subject: mm: page_isolation: introduce page_is_unmovable() Patch series "mm: accelerate gigantic folio allocation". Optimize pfn_range_valid_contig() and replace_free_hugepage_folios() in alloc_contig_frozen_pages() to speed up gigantic folio allocation. The allocation time for 120*1G folios drops from 3.605s to 0.431s. This patch (of 5): Factor out the check if a page is unmovable into a new helper, and will be reused in the following patch. No functional change intended, the minor changes are as follows, 1) Avoid unnecessary calls by checking CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION 2) Directly call PageCompound since PageTransCompound may be dropped 3) Using folio_test_hugetlb() Link: https://lkml.kernel.org/r/20260112150954.1802953-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20260112150954.1802953-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 2 + mm/page_isolation.c | 187 ++++++++++++++++++++++------------------- 2 files changed, 101 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 3e2f960e166c..6f8638c9904f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -67,4 +67,6 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn); int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, enum pb_isolate_mode mode); +bool page_is_unmovable(struct zone *zone, struct page *page, + enum pb_isolate_mode mode, unsigned long *step); #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index b5924eff4f8b..c48ff5c00244 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,6 +15,100 @@ #define CREATE_TRACE_POINTS #include +bool page_is_unmovable(struct zone *zone, struct page *page, + enum pb_isolate_mode mode, unsigned long *step) +{ + /* + * Both, bootmem allocations and memory holes are marked + * PG_reserved and are unmovable. We can even have unmovable + * allocations inside ZONE_MOVABLE, for example when + * specifying "movablecore". + */ + if (PageReserved(page)) + return true; + + /* + * If the zone is movable and we have ruled out all reserved + * pages then it should be reasonably safe to assume the rest + * is movable. + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return false; + + /* + * Hugepages are not in LRU lists, but they're movable. + * THPs are on the LRU, but need to be counted as #small pages. 
+ * We need not scan over tail pages because we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page) || PageCompound(page)) { + struct folio *folio = page_folio(page); + + if (folio_test_hugetlb(folio)) { + struct hstate *h; + + if (!IS_ENABLED(CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION)) + return true; + + /* + * The huge page may be freed so can not + * use folio_hstate() directly. + */ + h = size_to_hstate(folio_size(folio)); + if (h && !hugepage_migration_supported(h)) + return true; + + } else if (!folio_test_lru(folio)) { + return true; + } + + *step = folio_nr_pages(folio) - folio_page_idx(folio, page); + return false; + } + + /* + * We can't use page_count without pin a page + * because another CPU can free compound page. + * This check already skips compound tails of THP + * because their page->_refcount is zero at all time. + */ + if (!page_ref_count(page)) { + if (PageBuddy(page)) + *step = (1 << buddy_order(page)); + return false; + } + + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page)) + return false; + + /* + * We treat all PageOffline() pages as movable when offlining + * to give drivers a chance to decrement their reference count + * in MEM_GOING_OFFLINE in order to indicate that these pages + * can be offlined as there are no direct references anymore. + * For actually unmovable PageOffline() where the driver does + * not support this, we will fail later when trying to actually + * move these pages that still have a reference count > 0. + * (false negatives in this function only) + */ + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page)) + return false; + + if (PageLRU(page) || page_has_movable_ops(page)) + return false; + + /* + * If there are RECLAIMABLE pages, we need to check + * it. But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. + */ + return true; +} + /* * This function checks whether the range [start_pfn, end_pfn) includes * unmovable pages or not. The range must fall into a single pageblock and @@ -35,7 +129,6 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e { struct page *page = pfn_to_page(start_pfn); struct zone *zone = page_zone(page); - unsigned long pfn; VM_BUG_ON(pageblock_start_pfn(start_pfn) != pageblock_start_pfn(end_pfn - 1)); @@ -52,96 +145,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e return page; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - page = pfn_to_page(pfn); + while (start_pfn < end_pfn) { + unsigned long step = 1; - /* - * Both, bootmem allocations and memory holes are marked - * PG_reserved and are unmovable. We can even have unmovable - * allocations inside ZONE_MOVABLE, for example when - * specifying "movablecore". - */ - if (PageReserved(page)) + page = pfn_to_page(start_pfn); + if (page_is_unmovable(zone, page, mode, &step)) return page; - /* - * If the zone is movable and we have ruled out all reserved - * pages then it should be reasonably safe to assume the rest - * is movable. - */ - if (zone_idx(zone) == ZONE_MOVABLE) - continue; - - /* - * Hugepages are not in LRU lists, but they're movable. - * THPs are on the LRU, but need to be counted as #small pages. - * We need not scan over tail pages because we don't - * handle each tail page individually in migration. 
- */ - if (PageHuge(page) || PageTransCompound(page)) { - struct folio *folio = page_folio(page); - unsigned int skip_pages; - - if (PageHuge(page)) { - struct hstate *h; - - /* - * The huge page may be freed so can not - * use folio_hstate() directly. - */ - h = size_to_hstate(folio_size(folio)); - if (h && !hugepage_migration_supported(h)) - return page; - } else if (!folio_test_lru(folio)) { - return page; - } - - skip_pages = folio_nr_pages(folio) - folio_page_idx(folio, page); - pfn += skip_pages - 1; - continue; - } - - /* - * We can't use page_count without pin a page - * because another CPU can free compound page. - * This check already skips compound tails of THP - * because their page->_refcount is zero at all time. - */ - if (!page_ref_count(page)) { - if (PageBuddy(page)) - pfn += (1 << buddy_order(page)) - 1; - continue; - } - - /* - * The HWPoisoned page may be not in buddy system, and - * page_count() is not 0. - */ - if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page)) - continue; - - /* - * We treat all PageOffline() pages as movable when offlining - * to give drivers a chance to decrement their reference count - * in MEM_GOING_OFFLINE in order to indicate that these pages - * can be offlined as there are no direct references anymore. - * For actually unmovable PageOffline() where the driver does - * not support this, we will fail later when trying to actually - * move these pages that still have a reference count > 0. - * (false negatives in this function only) - */ - if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page)) - continue; - - if (PageLRU(page) || page_has_movable_ops(page)) - continue; - - /* - * If there are RECLAIMABLE pages, we need to check - * it. But now, memory offline itself doesn't call - * shrink_node_slabs() and it still to be fixed. - */ - return page; + start_pfn += step; } return NULL; } -- cgit v1.2.3 From 50962b16c0d63725fa73f0a5b4b831f740cf7208 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 17 Jan 2026 09:52:48 -0800 Subject: mm/damon: remove damon_operations->cleanup() Patch series "mm/damon: cleanup kdamond, damon_call(), damos filter and DAMON_MIN_REGION". Do miscellaneous code cleanups for improving readability. First three patches cleanup kdamond termination process, by removing unused operation set cleanup callback (patch 1) and moving damon_ctx specific resource cleanups on kdamond termination to synchronization-easy place (patches 2 and 3). Next two patches touch damon_call() infrastructure, by refactoring kdamond_call() function to do less and simpler locking operations (patch 4), and documenting when dealloc_on_free does work (patch 5). Final three patches rename things for clear uses of those. Those rename damos_filter_out() to be more explicit about the fact that it is only for core-handled filters (patch 6), DAMON_MIN_REGION macro to be more explicit it is not about number of regions but size of each region (patch 7), and damon_ctx->min_sz_region to be different from damos_access_patern->min_sz_region (patch 8), so that those are not confusing and easy to grep. This patch (of 8): damon_operations->cleanup() was added for a case that an operation set implementation requires additional cleanups. But no such implementation exists at the moment. Remove it. 
Link: https://lkml.kernel.org/r/20260117175256.82826-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260117175256.82826-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 --- mm/damon/core.c | 2 -- mm/damon/paddr.c | 1 - mm/damon/vaddr.c | 1 - 4 files changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index e6930d8574d3..bd4c76b126bd 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -607,7 +607,6 @@ enum damon_ops_id { * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup_target: Clean up each target before deallocation. - * @cleanup: Clean up the context. * * DAMON can be extended for various address spaces and usages. For this, * users should register the low level operations for their target address @@ -640,7 +639,6 @@ enum damon_ops_id { * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup_target is called before the target will be deallocated. - * @cleanup is called from @kdamond just before its termination. */ struct damon_operations { enum damon_ops_id id; @@ -656,7 +654,6 @@ struct damon_operations { struct damos *scheme, unsigned long *sz_filter_passed); bool (*target_valid)(struct damon_target *t); void (*cleanup_target)(struct damon_target *t); - void (*cleanup)(struct damon_ctx *context); }; /* diff --git a/mm/damon/core.c b/mm/damon/core.c index 81b998d32074..53514cb712cf 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2859,8 +2859,6 @@ done: damon_destroy_region(r, t); } - if (ctx->ops.cleanup) - ctx->ops.cleanup(ctx); kfree(ctx->regions_score_histogram); kdamond_call(ctx, true); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 4c2c935d82d6..9bfe48826840 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -373,7 +373,6 @@ static int __init damon_pa_initcall(void) .prepare_access_checks = damon_pa_prepare_access_checks, .check_accesses = damon_pa_check_accesses, .target_valid = NULL, - .cleanup = NULL, .apply_scheme = damon_pa_apply_scheme, .get_scheme_score = damon_pa_scheme_score, }; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 23ed738a0bd6..40c73adf1946 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -1014,7 +1014,6 @@ static int __init damon_va_initcall(void) .check_accesses = damon_va_check_accesses, .target_valid = damon_va_target_valid, .cleanup_target = damon_va_cleanup_target, - .cleanup = NULL, .apply_scheme = damon_va_apply_scheme, .get_scheme_score = damon_va_scheme_score, }; -- cgit v1.2.3 From 177c8a272968b6bcdbcc8589a72e3eaa32f975d0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 17 Jan 2026 09:52:52 -0800 Subject: mm/damon: document damon_call_control->dealloc_on_cancel repeat behavior damon_call_control->dealloc_on_cancel works only when ->repeat is true. But the behavior is not clearly documented. DAMON API callers can understand the behavior only after reading kdamond_call() code. Document the behavior on the kernel-doc comment of damon_call_control. 
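As a reading aid, here is how a caller would typically combine the two fields being documented; this is a hypothetical fragment, not code from this series. The field names (fn, data, repeat, return_code, dealloc_on_cancel) are the ones in the kernel-doc, while the callback, the kzalloc-based allocation and the assumption that damon_call() takes the context plus the control structure are illustrative.

#include <linux/damon.h>
#include <linux/slab.h>

/* Hypothetical callback, invoked from kdamond context on every repeat. */
static int repeating_fn(void *data)
{
	return 0;
}

/*
 * Because the call repeats, the caller cannot just wait for a single
 * completion and free the control itself, so the control is heap
 * allocated and dealloc_on_cancel is set: if the request is canceled
 * (for example the kdamond terminates), DAMON frees it instead.
 */
static int install_repeating_call(struct damon_ctx *ctx)
{
	struct damon_call_control *control;

	control = kzalloc(sizeof(*control), GFP_KERNEL);
	if (!control)
		return -ENOMEM;

	control->fn = repeating_fn;
	control->data = NULL;
	control->repeat = true;
	/* Honored only because ->repeat is true, as documented above. */
	control->dealloc_on_cancel = true;

	return damon_call(ctx, control);
}

A one-shot call (repeat == false) would typically use an on-stack control that the caller itself waits on, leaving nothing for kdamond to deallocate, which is presumably why the flag only takes effect for repeating calls.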
Link: https://lkml.kernel.org/r/20260117175256.82826-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index bd4c76b126bd..bdca28e15e40 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -663,7 +663,7 @@ struct damon_operations { * @data: Data that will be passed to @fn. * @repeat: Repeat invocations. * @return_code: Return code from @fn invocation. - * @dealloc_on_cancel: De-allocate when canceled. + * @dealloc_on_cancel: If @repeat is true, de-allocate when canceled. * * Control damon_call(), which requests specific kdamond to invoke a given * function. Refer to damon_call() for more details. -- cgit v1.2.3 From dfb1b0c9dc0d61e422905640e1e7334b3cf6f384 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 17 Jan 2026 09:52:54 -0800 Subject: mm/damon: rename DAMON_MIN_REGION to DAMON_MIN_REGION_SZ The macro is for the default minimum size of each DAMON region. There was a case that a reader was confused if it is the minimum number of total DAMON regions, which is set on damon_attrs->min_nr_regions. Make the name more explicit. Link: https://lkml.kernel.org/r/20260117175256.82826-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/core.c | 2 +- mm/damon/lru_sort.c | 2 +- mm/damon/reclaim.c | 2 +- mm/damon/sysfs.c | 2 +- mm/damon/tests/vaddr-kunit.h | 2 +- mm/damon/vaddr.c | 24 ++++++++++++------------ 7 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index bdca28e15e40..5bf8db1d78fe 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -15,7 +15,7 @@ #include /* Minimal region size. Every damon_region is aligned by this. 
*/ -#define DAMON_MIN_REGION PAGE_SIZE +#define DAMON_MIN_REGION_SZ PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) diff --git a/mm/damon/core.c b/mm/damon/core.c index ae5b772ceffb..5508bc794172 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -553,7 +553,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.max_nr_regions = 1000; ctx->addr_unit = 1; - ctx->min_sz_region = DAMON_MIN_REGION; + ctx->min_sz_region = DAMON_MIN_REGION_SZ; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index bedb9134d286..9dde096a9064 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -298,7 +298,7 @@ static int damon_lru_sort_apply_parameters(void) if (!monitor_region_start && !monitor_region_end) addr_unit = 1; param_ctx->addr_unit = addr_unit; - param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1); + param_ctx->min_sz_region = max(DAMON_MIN_REGION_SZ / addr_unit, 1); if (!damon_lru_sort_mon_attrs.sample_interval) { err = -EINVAL; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 55df43e241c5..c343622a2f52 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -208,7 +208,7 @@ static int damon_reclaim_apply_parameters(void) if (!monitor_region_start && !monitor_region_end) addr_unit = 1; param_ctx->addr_unit = addr_unit; - param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1); + param_ctx->min_sz_region = max(DAMON_MIN_REGION_SZ / addr_unit, 1); if (!damon_reclaim_mon_attrs.aggr_interval) { err = -EINVAL; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 4de25708b05a..57d36d60f329 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1470,7 +1470,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, /* addr_unit is respected by only DAMON_OPS_PADDR */ if (sys_ctx->ops_id == DAMON_OPS_PADDR) ctx->min_sz_region = max( - DAMON_MIN_REGION / sys_ctx->addr_unit, 1); + DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 30dc5459f1d2..cfae870178bf 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -147,7 +147,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 40c73adf1946..83ab3d8c3792 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -19,8 +19,8 @@ #include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST -#undef DAMON_MIN_REGION -#define DAMON_MIN_REGION 1 +#undef DAMON_MIN_REGION_SZ +#define DAMON_MIN_REGION_SZ 1 #endif /* @@ -78,7 +78,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, orig_end = r->ar.end; sz_orig = damon_sz_region(r); - sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION_SZ); if (!sz_piece) return -EINVAL; @@ -161,12 +161,12 @@ next: swap(first_gap, second_gap); /* Store the result */ - regions[0].start = ALIGN(start, DAMON_MIN_REGION); - regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); - regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); - regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); - regions[2].start = 
ALIGN(second_gap.end, DAMON_MIN_REGION); - regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION); + regions[0].start = ALIGN(start, DAMON_MIN_REGION_SZ); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION_SZ); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION_SZ); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION_SZ); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION_SZ); + regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION_SZ); return 0; } @@ -259,8 +259,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, sz += regions[i].end - regions[i].start; if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION) - sz = DAMON_MIN_REGION; + if (sz < DAMON_MIN_REGION_SZ) + sz = DAMON_MIN_REGION_SZ; /* Set the initial three regions of the target */ for (i = 0; i < 3; i++) { @@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ); } } -- cgit v1.2.3 From cc1db8dff8e751ec3ab352483de366b7f23aefe2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 17 Jan 2026 09:52:55 -0800 Subject: mm/damon: rename min_sz_region of damon_ctx to min_region_sz 'min_sz_region' field of 'struct damon_ctx' represents the minimum size of each DAMON region for the context. 'struct damos_access_pattern' has a field of the same name. It confuses readers and makes 'grep' less optimal for them. Rename it to 'min_region_sz'. Link: https://lkml.kernel.org/r/20260117175256.82826-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 +++--- mm/damon/core.c | 69 ++++++++++++++++++++++++++------------------------- mm/damon/lru_sort.c | 4 +-- mm/damon/reclaim.c | 4 +-- mm/damon/stat.c | 2 +- mm/damon/sysfs.c | 9 ++++--- 6 files changed, 49 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5bf8db1d78fe..a4fea23da857 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -773,7 +773,7 @@ struct damon_attrs { * * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. - * @min_sz_region: Minimum region size. + * @min_region_sz: Minimum region size. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. 
*/ @@ -818,7 +818,7 @@ struct damon_ctx { /* public: */ struct damon_operations ops; unsigned long addr_unit; - unsigned long min_sz_region; + unsigned long min_region_sz; struct list_head adaptive_targets; struct list_head schemes; @@ -907,7 +907,7 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges, unsigned long min_sz_region); + unsigned int nr_ranges, unsigned long min_region_sz); void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); @@ -975,7 +975,7 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, - unsigned long min_sz_region); + unsigned long min_region_sz); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 5508bc794172..70efbf22a2b4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -203,7 +203,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * @t: the given target. * @ranges: array of new monitoring target ranges. * @nr_ranges: length of @ranges. - * @min_sz_region: minimum region size. + * @min_region_sz: minimum region size. * * This function adds new regions to, or modify existing regions of a * monitoring target to fit in specific ranges. @@ -211,7 +211,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * Return: 0 if success, or negative error code otherwise. */ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges, unsigned long min_sz_region) + unsigned int nr_ranges, unsigned long min_region_sz) { struct damon_region *r, *next; unsigned int i; @@ -248,16 +248,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, /* no region intersects with this range */ newr = damon_new_region( ALIGN_DOWN(range->start, - min_sz_region), - ALIGN(range->end, min_sz_region)); + min_region_sz), + ALIGN(range->end, min_region_sz)); if (!newr) return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, - min_sz_region); - last->ar.end = ALIGN(range->end, min_sz_region); + min_region_sz); + last->ar.end = ALIGN(range->end, min_region_sz); /* fill possible holes in the range */ err = damon_fill_regions_holes(first, last, t); @@ -553,7 +553,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.max_nr_regions = 1000; ctx->addr_unit = 1; - ctx->min_sz_region = DAMON_MIN_REGION_SZ; + ctx->min_region_sz = DAMON_MIN_REGION_SZ; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -1142,7 +1142,7 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx) * If @src has no region, @dst keeps current regions. 
*/ static int damon_commit_target_regions(struct damon_target *dst, - struct damon_target *src, unsigned long src_min_sz_region) + struct damon_target *src, unsigned long src_min_region_sz) { struct damon_region *src_region; struct damon_addr_range *ranges; @@ -1159,7 +1159,7 @@ static int damon_commit_target_regions(struct damon_target *dst, i = 0; damon_for_each_region(src_region, src) ranges[i++] = src_region->ar; - err = damon_set_regions(dst, ranges, i, src_min_sz_region); + err = damon_set_regions(dst, ranges, i, src_min_region_sz); kfree(ranges); return err; } @@ -1167,11 +1167,11 @@ static int damon_commit_target_regions(struct damon_target *dst, static int damon_commit_target( struct damon_target *dst, bool dst_has_pid, struct damon_target *src, bool src_has_pid, - unsigned long src_min_sz_region) + unsigned long src_min_region_sz) { int err; - err = damon_commit_target_regions(dst, src, src_min_sz_region); + err = damon_commit_target_regions(dst, src, src_min_region_sz); if (err) return err; if (dst_has_pid) @@ -1198,7 +1198,7 @@ static int damon_commit_targets( err = damon_commit_target( dst_target, damon_target_has_pid(dst), src_target, damon_target_has_pid(src), - src->min_sz_region); + src->min_region_sz); if (err) return err; } else { @@ -1225,7 +1225,7 @@ static int damon_commit_targets( return -ENOMEM; err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src), - src->min_sz_region); + src->min_region_sz); if (err) { damon_destroy_target(new_target, NULL); return err; @@ -1272,7 +1272,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) } dst->ops = src->ops; dst->addr_unit = src->addr_unit; - dst->min_sz_region = src->min_sz_region; + dst->min_region_sz = src->min_region_sz; return 0; } @@ -1305,8 +1305,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < ctx->min_sz_region) - sz = ctx->min_sz_region; + if (sz < ctx->min_region_sz) + sz = ctx->min_region_sz; return sz; } @@ -1696,7 +1696,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * @t: The target of the region. * @rp: The pointer to the region. * @s: The scheme to be applied. - * @min_sz_region: minimum region size. + * @min_region_sz: minimum region size. * * If a quota of a scheme has exceeded in a quota charge window, the scheme's * action would applied to only a part of the target access pattern fulfilling @@ -1714,7 +1714,8 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * Return: true if the region should be entirely skipped, false otherwise. 
*/ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s, unsigned long min_sz_region) + struct damon_region **rp, struct damos *s, + unsigned long min_region_sz) { struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; @@ -1736,11 +1737,11 @@ static bool damos_skip_charged_region(struct damon_target *t, if (quota->charge_addr_from && r->ar.start < quota->charge_addr_from) { sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, min_sz_region); + r->ar.start, min_region_sz); if (!sz_to_skip) { - if (damon_sz_region(r) <= min_sz_region) + if (damon_sz_region(r) <= min_region_sz) return true; - sz_to_skip = min_sz_region; + sz_to_skip = min_region_sz; } damon_split_region_at(t, r, sz_to_skip); r = damon_next_region(r); @@ -1766,7 +1767,7 @@ static void damos_update_stat(struct damos *s, static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos_filter *filter, - unsigned long min_sz_region) + unsigned long min_region_sz) { bool matched = false; struct damon_target *ti; @@ -1783,8 +1784,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, matched = target_idx == filter->target_idx; break; case DAMOS_FILTER_TYPE_ADDR: - start = ALIGN_DOWN(filter->addr_range.start, min_sz_region); - end = ALIGN_DOWN(filter->addr_range.end, min_sz_region); + start = ALIGN_DOWN(filter->addr_range.start, min_region_sz); + end = ALIGN_DOWN(filter->addr_range.end, min_region_sz); /* inside the range */ if (start <= r->ar.start && r->ar.end <= end) { @@ -1820,7 +1821,7 @@ static bool damos_core_filter_out(struct damon_ctx *ctx, struct damon_target *t, s->core_filters_allowed = false; damos_for_each_core_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { + if (damos_filter_match(ctx, t, r, filter, ctx->min_region_sz)) { if (filter->allow) s->core_filters_allowed = true; return !filter->allow; @@ -1955,7 +1956,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - c->min_sz_region); + c->min_region_sz); if (!sz) goto update_stat; damon_split_region_at(t, r, sz); @@ -2003,7 +2004,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) + if (damos_skip_charged_region(t, &r, s, c->min_region_sz)) continue; if (s->max_nr_snapshots && @@ -2496,7 +2497,7 @@ static void damon_split_region_at(struct damon_target *t, /* Split every region in the given target into 'nr_subs' regions */ static void damon_split_regions_of(struct damon_target *t, int nr_subs, - unsigned long min_sz_region) + unsigned long min_region_sz) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2506,13 +2507,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs, sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && - sz_region > 2 * min_sz_region; i++) { + sz_region > 2 * min_region_sz; i++) { /* * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ sz_sub = ALIGN_DOWN(damon_rand(1, 10) * - sz_region / 10, min_sz_region); + sz_region / 10, min_region_sz); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) continue; @@ -2552,7 +2553,7 @@ static void 
kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions, ctx->min_sz_region); + damon_split_regions_of(t, nr_subregions, ctx->min_region_sz); last_nr_regions = nr_regions; } @@ -2902,7 +2903,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. - * @min_sz_region: Minimum region size. + * @min_region_sz: Minimum region size. * * This function sets the region of @t as requested by @start and @end. If the * values of @start and @end are zero, however, this function finds the biggest @@ -2914,7 +2915,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, */ int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, - unsigned long min_sz_region) + unsigned long min_region_sz) { struct damon_addr_range addr_range; @@ -2927,7 +2928,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, min_sz_region); + return damon_set_regions(t, &addr_range, 1, min_region_sz); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 9dde096a9064..7bc5c0b2aea3 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -298,7 +298,7 @@ static int damon_lru_sort_apply_parameters(void) if (!monitor_region_start && !monitor_region_end) addr_unit = 1; param_ctx->addr_unit = addr_unit; - param_ctx->min_sz_region = max(DAMON_MIN_REGION_SZ / addr_unit, 1); + param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); if (!damon_lru_sort_mon_attrs.sample_interval) { err = -EINVAL; @@ -345,7 +345,7 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, - param_ctx->min_sz_region); + param_ctx->min_region_sz); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index c343622a2f52..43d76f5bed44 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -208,7 +208,7 @@ static int damon_reclaim_apply_parameters(void) if (!monitor_region_start && !monitor_region_end) addr_unit = 1; param_ctx->addr_unit = addr_unit; - param_ctx->min_sz_region = max(DAMON_MIN_REGION_SZ / addr_unit, 1); + param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); if (!damon_reclaim_mon_attrs.aggr_interval) { err = -EINVAL; @@ -251,7 +251,7 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, - param_ctx->min_sz_region); + param_ctx->min_region_sz); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 5e18b164f6d8..536f02bd173e 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -181,7 +181,7 @@ static struct damon_ctx *damon_stat_build_ctx(void) goto free_out; damon_add_target(ctx, target); if (damon_set_region_biggest_system_ram_default(target, &start, &end, - ctx->min_sz_region)) + ctx->min_region_sz)) goto free_out; return ctx; free_out: diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 57d36d60f329..b7f66196bec4 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1365,7 +1365,7 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, static 
int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_regions *sysfs_regions, - unsigned long min_sz_region) + unsigned long min_region_sz) { struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); @@ -1387,7 +1387,7 @@ static int damon_sysfs_set_regions(struct damon_target *t, if (ranges[i - 1].end > ranges[i].start) goto out; } - err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region); + err = damon_set_regions(t, ranges, sysfs_regions->nr, min_region_sz); out: kfree(ranges); return err; @@ -1409,7 +1409,8 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, return -EINVAL; } t->obsolete = sys_target->obsolete; - return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); + return damon_sysfs_set_regions(t, sys_target->regions, + ctx->min_region_sz); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1469,7 +1470,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, ctx->addr_unit = sys_ctx->addr_unit; /* addr_unit is respected by only DAMON_OPS_PADDR */ if (sys_ctx->ops_id == DAMON_OPS_PADDR) - ctx->min_sz_region = max( + ctx->min_region_sz = max( DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) -- cgit v1.2.3 From 25faccd69977d9a72739fd425040c2a1c2d67e46 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:11 +0000 Subject: mm/vma: rename VMA_LOCK_OFFSET to VM_REFCNT_EXCLUDE_READERS_FLAG Patch series "mm: add and use vma_assert_stabilised() helper", v4. This series first introduces a series of refactorings, intended to significantly improve readability and abstraction of the code. Sometimes we wish to assert that a VMA is stable, that is - the VMA cannot be changed underneath us. This will be the case if EITHER the VMA lock or the mmap lock is held. We already open-code this in two places - anon_vma_name() in mm/madvise.c and vma_flag_set_atomic() in include/linux/mm.h. This series adds vma_assert_stablised() which abstract this can be used in these callsites instead. This implementation uses lockdep where possible - that is VMA read locks - which correctly track read lock acquisition/release via: vma_start_read() -> rwsem_acquire_read() vma_start_read_locked() -> vma_start_read_locked_nested() -> rwsem_acquire_read() And: vma_end_read() -> vma_refcount_put() -> rwsem_release() We don't track the VMA locks using lockdep for VMA write locks, however these are predicated upon mmap write locks whose lockdep state we do track, and additionally vma_assert_stabillised() asserts this check if VMA read lock is not held, so we get lockdep coverage in this case also. We also add extensive comments to describe what we're doing. There's some tricky stuff around mmap locking and stabilisation races that we have to be careful of that I describe in the patch introducing vma_assert_stabilised(). This change also lays the foundation for future series to add this assert in further places where we wish to make it clear that we rely upon a stabilised VMA. The motivation for this change was precisely this. This patch (of 10): The VMA_LOCK_OFFSET value encodes a flag which vma->vm_refcnt is set to in order to indicate that a VMA is in the process of having VMA read-locks excluded in __vma_enter_locked() (that is, first checking if there are any VMA read locks held, and if there are, waiting on them to be released). 
This happens when a VMA write lock is being established, or a VMA is being marked detached and discovers that the VMA reference count is elevated due to read-locks temporarily elevating the reference count only to discover a VMA write lock is in place. The naming does not convey any of this, so rename VMA_LOCK_OFFSET to VM_REFCNT_EXCLUDE_READERS_FLAG (with a sensible new prefix to differentiate from the newly introduced VMA_*_BIT flags). Also rename VMA_REF_LIMIT to VM_REFCNT_LIMIT to make this consistent also. Update comments to reflect this. No functional change intended. Link: https://lkml.kernel.org/r/817bd763e5fe35f23e01347996f9007e6eb88460.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Boqun Feng Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 17 +++++++++++++---- include/linux/mmap_lock.h | 14 ++++++++------ mm/mmap_lock.c | 17 ++++++++++------- 3 files changed, 31 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 78950eb8926d..bdbf17c4f26b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -752,8 +752,17 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -#define VMA_LOCK_OFFSET 0x40000000 -#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) +/* + * While __vma_enter_locked() is working to ensure are no read-locks held on a + * VMA (either while acquiring a VMA write lock or marking a VMA detached) we + * set the VM_REFCNT_EXCLUDE_READERS_FLAG in vma->vm_refcnt to indiciate to + * vma_start_read() that the reference count should be left alone. + * + * Once the operation is complete, this value is subtracted from vma->vm_refcnt. + */ +#define VM_REFCNT_EXCLUDE_READERS_BIT (30) +#define VM_REFCNT_EXCLUDE_READERS_FLAG (1U << VM_REFCNT_EXCLUDE_READERS_BIT) +#define VM_REFCNT_LIMIT (VM_REFCNT_EXCLUDE_READERS_FLAG - 1) struct vma_numab_state { /* @@ -935,10 +944,10 @@ struct vm_area_struct { /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set + * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_FLAG is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 + * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_BIT is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index b50416fbba20..5acbd4ba1b52 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -125,12 +125,14 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) static inline bool is_vma_writer_only(int refcnt) { /* - * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma - * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on - * a detached vma happens only in vma_mark_detached() and is a rare - * case, therefore most of the time there will be no unnecessary wakeup. + * With a writer and no readers, refcnt is VM_REFCNT_EXCLUDE_READERS_FLAG + * if the vma is detached and (VM_REFCNT_EXCLUDE_READERS_FLAG + 1) if it is + * attached. 
Waiting on a detached vma happens only in + * vma_mark_detached() and is a rare case, therefore most of the time + * there will be no unnecessary wakeup. */ - return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) && + refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -159,7 +161,7 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int mmap_assert_locked(vma->vm_mm); if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) + VM_REFCNT_LIMIT))) return false; rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 7421b7ea8001..1d23b48552e9 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -54,7 +54,7 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, bool detaching, int state) { int err; - unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + unsigned int tgt_refcnt = VM_REFCNT_EXCLUDE_READERS_FLAG; mmap_assert_write_locked(vma->vm_mm); @@ -66,7 +66,7 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). */ - if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) return 0; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); @@ -74,7 +74,7 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, refcount_read(&vma->vm_refcnt) == tgt_refcnt, state); if (err) { - if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) { + if (refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) { /* * The wait failed, but the last reader went away * as well. Tell the caller the VMA is detached. @@ -92,7 +92,8 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) { - *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); + *detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, + &vma->vm_refcnt); rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } @@ -180,13 +181,15 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, } /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * If VM_REFCNT_EXCLUDE_READERS_FLAG is set, + * __refcount_inc_not_zero_limited_acquire() will fail because + * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG. + * * Acquire fence is required here to avoid reordering against later * vm_lock_seq check and checks inside lock_vma_under_rcu(). */ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { + VM_REFCNT_LIMIT))) { /* return EAGAIN if vma got detached from under us */ vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); goto err; -- cgit v1.2.3 From ef4c0cea1e15dc6b1b5b9bb72fa4605b14f2125e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:12 +0000 Subject: mm/vma: document possible vma->vm_refcnt values and reference comment The possible vma->vm_refcnt values are confusing and vague, explain in detail what these can be in a comment describing the vma->vm_refcnt field and reference this comment in various places that read/write this field. 
No functional change intended. [akpm@linux-foundation.org: fix typo, per Suren] Link: https://lkml.kernel.org/r/d462e7678c6cc7461f94e5b26c776547d80a67e8.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 42 ++++++++++++++++++++++++++++++++++++++++-- include/linux/mmap_lock.h | 7 +++++++ mm/mmap_lock.c | 6 ++++++ 3 files changed, 53 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bdbf17c4f26b..3e608d22cab0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -758,7 +758,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) * set the VM_REFCNT_EXCLUDE_READERS_FLAG in vma->vm_refcnt to indiciate to * vma_start_read() that the reference count should be left alone. * - * Once the operation is complete, this value is subtracted from vma->vm_refcnt. + * See the comment describing vm_refcnt in vm_area_struct for details as to + * which values the VMA reference count can be. */ #define VM_REFCNT_EXCLUDE_READERS_BIT (30) #define VM_REFCNT_EXCLUDE_READERS_FLAG (1U << VM_REFCNT_EXCLUDE_READERS_BIT) @@ -989,7 +990,44 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif #ifdef CONFIG_PER_VMA_LOCK - /* Unstable RCU readers are allowed to read this. */ + /* + * Used to keep track of firstly, whether the VMA is attached, secondly, + * if attached, how many read locks are taken, and thirdly, if the + * VM_REFCNT_EXCLUDE_READERS_FLAG is set, whether any read locks held + * are currently in the process of being excluded. + * + * This value can be equal to: + * + * 0 - Detached. IMPORTANT: when the refcnt is zero, readers cannot + * increment it. + * + * 1 - Attached and either unlocked or write-locked. Write locks are + * identified via __is_vma_write_locked() which checks for equality of + * vma->vm_lock_seq and mm->mm_lock_seq. + * + * >1, < VM_REFCNT_EXCLUDE_READERS_FLAG - Read-locked or (unlikely) + * write-locked with other threads having temporarily incremented the + * reference count prior to determining it is write-locked and + * decrementing it again. + * + * VM_REFCNT_EXCLUDE_READERS_FLAG - Detached, pending + * __vma_exit_locked() completion which will decrement the reference + * count to zero. IMPORTANT - at this stage no further readers can + * increment the reference count. It can only be reduced. + * + * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either write-locking + * an attached VMA and has yet to invoke __vma_exit_locked(), OR a + * thread is detaching a VMA and is waiting on a single spurious reader + * in order to decrement the reference count. IMPORTANT - as above, no + * further readers can increment the reference count. + * + * > VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either + * write-locking or detaching a VMA is waiting on readers to + * exit. IMPORTANT - as above, no further readers can increment the + * reference count. + * + * NOTE: Unstable RCU readers are allowed to read this. 
+ */ refcount_t vm_refcnt ____cacheline_aligned_in_smp; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map vmlock_dep_map; diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 5acbd4ba1b52..a764439d0276 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,6 +130,9 @@ static inline bool is_vma_writer_only(int refcnt) * attached. Waiting on a detached vma happens only in * vma_mark_detached() and is a rare case, therefore most of the time * there will be no unnecessary wakeup. + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. */ return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) && refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; @@ -249,6 +252,10 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) { unsigned int mm_lock_seq; + /* + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. + */ VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && !__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 1d23b48552e9..75dc098aea14 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -65,6 +65,9 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. */ if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) return 0; @@ -137,6 +140,9 @@ void vma_mark_detached(struct vm_area_struct *vma) * before they check vm_lock_seq, realize the vma is locked and drop * back the vm_refcnt. That is a narrow window for observing a raised * vm_refcnt. + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. */ -- cgit v1.2.3 From 180355d4cfbd25f370e2e0912877a36aa350ff64 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:13 +0000 Subject: mm/vma: rename is_vma_write_only(), separate out shared refcount put The is_vma_writer_only() function is misnamed - this isn't determining if there is only a write lock, as it checks for the presence of the VM_REFCNT_EXCLUDE_READERS_FLAG. Really, it is checking to see whether readers are excluded, with a possibility of a false positive in the case of a detachment (there we expect the vma->vm_refcnt to eventually be set to VM_REFCNT_EXCLUDE_READERS_FLAG, whereas for an attached VMA we expect it to eventually be set to VM_REFCNT_EXCLUDE_READERS_FLAG + 1). Rename the function accordingly. Relatedly, we use a __refcount_dec_and_test() primitive directly in vma_refcount_put(), using the old value to determine what the reference count ought to be after the operation is complete (ignoring racing reference count adjustments). Wrap this into a __vma_refcount_put_return() function, which we can then utilise in vma_mark_detached() and thus keep the refcount primitive usage abstracted. This function, as the name implies, returns the value after the reference count has been updated. This reduces duplication in the two invocations of this function. Also adjust comments, removing duplicative comments covered elsewhere and adding more to aid understanding. No functional change intended. 
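To make the refcount states above concrete, here is an editor-added, standalone illustration (not part of the patch) that mirrors the constants introduced earlier in this series and prints the values that __vma_are_readers_excluded() and the exclude-readers wait are dealing with:

#include <stdio.h>

#define VM_REFCNT_EXCLUDE_READERS_BIT	30
#define VM_REFCNT_EXCLUDE_READERS_FLAG	(1U << VM_REFCNT_EXCLUDE_READERS_BIT)
#define VM_REFCNT_LIMIT			(VM_REFCNT_EXCLUDE_READERS_FLAG - 1)

int main(void)
{
	printf("flag (old VMA_LOCK_OFFSET):       %#010x\n", VM_REFCNT_EXCLUDE_READERS_FLAG);
	printf("reader limit:                     %#010x\n", VM_REFCNT_LIMIT);
	printf("detached, no readers:             %#010x\n", 0u);
	printf("attached, unlocked/write-locked:  %#010x\n", 1u);
	printf("attached, two readers:            %#010x\n", 1u + 2u);
	printf("exclude-readers target, detach:   %#010x\n", VM_REFCNT_EXCLUDE_READERS_FLAG);
	printf("exclude-readers target, attached: %#010x\n", VM_REFCNT_EXCLUDE_READERS_FLAG + 1u);
	return 0;
}

The two targets differ by exactly one: a VMA being write-locked keeps its own attach reference (hence FLAG + 1), while a VMA being detached waits for the count to fall to FLAG so that the final subtraction drops it to zero.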
Link: https://lkml.kernel.org/r/32053580bff460eb1092ef780b526cefeb748bad.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 66 +++++++++++++++++++++++++++++++++++++---------- mm/mmap_lock.c | 17 +++++++----- 2 files changed, 63 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index a764439d0276..294fb282052d 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -122,15 +122,22 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) vma->vm_lock_seq = UINT_MAX; } -static inline bool is_vma_writer_only(int refcnt) +/* + * This function determines whether the input VMA reference count describes a + * VMA which has excluded all VMA read locks. + * + * In the case of a detached VMA, we may incorrectly indicate that readers are + * excluded when one remains, because in that scenario we target a refcount of + * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of + * VM_REFCNT_EXCLUDE_READERS_FLAG + 1. + * + * However, the race window for that is very small so it is unlikely. + * + * Returns: true if readers are excluded, false otherwise. + */ +static inline bool __vma_are_readers_excluded(int refcnt) { /* - * With a writer and no readers, refcnt is VM_REFCNT_EXCLUDE_READERS_FLAG - * if the vma is detached and (VM_REFCNT_EXCLUDE_READERS_FLAG + 1) if it is - * attached. Waiting on a detached vma happens only in - * vma_mark_detached() and is a rare case, therefore most of the time - * there will be no unnecessary wakeup. - * * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ @@ -138,18 +145,51 @@ static inline bool is_vma_writer_only(int refcnt) refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; } +/* + * Actually decrement the VMA reference count. + * + * The function returns the reference count as it was immediately after the + * decrement took place. If it returns zero, the VMA is now detached. + */ +static inline __must_check unsigned int +__vma_refcount_put_return(struct vm_area_struct *vma) +{ + int oldcnt; + + if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) + return 0; + + return oldcnt - 1; +} + +/** + * vma_refcount_put() - Drop reference count in VMA vm_refcnt field due to a + * read-lock being dropped. + * @vma: The VMA whose reference count we wish to decrement. + * + * If we were the last reader, wake up threads waiting to obtain an exclusive + * lock. + */ static inline void vma_refcount_put(struct vm_area_struct *vma) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */ struct mm_struct *mm = vma->vm_mm; - int oldcnt; + int newcnt; rwsem_release(&vma->vmlock_dep_map, _RET_IP_); - if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { - if (is_vma_writer_only(oldcnt - 1)) - rcuwait_wake_up(&mm->vma_writer_wait); - } + newcnt = __vma_refcount_put_return(vma); + /* + * __vma_enter_locked() may be sleeping waiting for readers to drop + * their reference count, so wake it up if we were the last reader + * blocking it from being acquired. 
+ * + * We may be raced by other readers temporarily incrementing the + * reference count, though the race window is very small, this might + * cause spurious wakeups. + */ + if (newcnt && __vma_are_readers_excluded(newcnt)) + rcuwait_wake_up(&mm->vma_writer_wait); } /* diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 75dc098aea14..6be1bbcde09e 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -134,21 +134,24 @@ void vma_mark_detached(struct vm_area_struct *vma) vma_assert_attached(vma); /* - * We are the only writer, so no need to use vma_refcount_put(). - * The condition below is unlikely because the vma has been already - * write-locked and readers can increment vm_refcnt only temporarily - * before they check vm_lock_seq, realize the vma is locked and drop - * back the vm_refcnt. That is a narrow window for observing a raised - * vm_refcnt. + * This condition - that the VMA is still attached (refcnt > 0) - is + * unlikely, because the vma has been already write-locked and readers + * can increment vm_refcnt only temporarily before they check + * vm_lock_seq, realize the vma is locked and drop back the + * vm_refcnt. That is a narrow window for observing a raised vm_refcnt. * * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + if (unlikely(__vma_refcount_put_return(vma))) { /* Wait until vma is detached with no readers. */ if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { bool detached; + /* + * Once this is complete, no readers can increment the + * reference count, and the VMA is marked detached. + */ __vma_exit_locked(vma, &detached); WARN_ON_ONCE(!detached); } -- cgit v1.2.3 From 1f2e7efc3ee9b32095d5a331d1f8672623f311bf Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:14 +0000 Subject: mm/vma: add+use vma lockdep acquire/release defines The code is littered with inscrutable and duplicative lockdep incantations, replace these with defines which explain what is going on and add commentary to explain what we're doing. If lockdep is disabled these become no-ops. We must use defines so _RET_IP_ remains meaningful. These are self-documenting and aid readability of the code. Additionally, instead of using the confusing rwsem_*() form for something that is emphatically not an rwsem, we instead explicitly use lock_[acquired, release]_shared/exclusive() lockdep invocations since we are doing something rather custom here and these make more sense to use. No functional change intended. 
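As a side note on the design (editor's sketch, not part of this patch): write locks cannot be lockdep-tracked per VMA because they have no per-VMA release point, only a collective one. The function names below are from the existing mmap/VMA locking API; the sketch assumes kernel context and is for reading only.

#include <linux/mm.h>
#include <linux/mmap_lock.h>

static void write_lock_lifecycle(struct mm_struct *mm,
				 struct vm_area_struct *a,
				 struct vm_area_struct *b)
{
	mmap_write_lock(mm);	/* lockdep tracks the mmap rwsem as usual */

	vma_start_write(a);	/* exclusive per-VMA acquisition... */
	vma_start_write(b);	/* ...possibly on many VMAs under one mmap lock */

	/*
	 * There is no per-VMA write unlock; both write locks are dropped
	 * together when the mmap lock is released, via vma_end_write_all()
	 * updating the mmap's seqcount.
	 */
	mmap_write_unlock(mm);
}

This is why the defines added here annotate only the per-VMA read (shared) side and the temporary exclusive state used while excluding readers, relying on the mmap lock's own lockdep coverage for the write side.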
Link: https://lkml.kernel.org/r/fdae72441949ecf3b4a0ed3510da803e881bb153.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Sebastian Andrzej Siewior Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Waiman Long Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 37 ++++++++++++++++++++++++++++++++++--- mm/mmap_lock.c | 10 +++++----- 2 files changed, 39 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 294fb282052d..1887ca55ead7 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -78,6 +78,37 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) #ifdef CONFIG_PER_VMA_LOCK +/* + * VMA locks do not behave like most ordinary locks found in the kernel, so we + * cannot quite have full lockdep tracking in the way we would ideally prefer. + * + * Read locks act as shared locks which exclude an exclusive lock being + * taken. We therefore mark these accordingly on read lock acquire/release. + * + * Write locks are acquired exclusively per-VMA, but released in a shared + * fashion, that is upon vma_end_write_all(), we update the mmap's seqcount such + * that write lock is released. + * + * We therefore cannot track write locks per-VMA, nor do we try. Mitigating this + * is the fact that, of course, we do lockdep-track the mmap lock rwsem which + * must be held when taking a VMA write lock. + * + * We do, however, want to indicate that during either acquisition of a VMA + * write lock or detachment of a VMA that we require the lock held be exclusive, + * so we utilise lockdep to do so. + */ +#define __vma_lockdep_acquire_read(vma) \ + lock_acquire_shared(&vma->vmlock_dep_map, 0, 1, NULL, _RET_IP_) +#define __vma_lockdep_release_read(vma) \ + lock_release(&vma->vmlock_dep_map, _RET_IP_) +#define __vma_lockdep_acquire_exclusive(vma) \ + lock_acquire_exclusive(&vma->vmlock_dep_map, 0, 0, NULL, _RET_IP_) +#define __vma_lockdep_release_exclusive(vma) \ + lock_release(&vma->vmlock_dep_map, _RET_IP_) +/* Only meaningful if CONFIG_LOCK_STAT is defined. 
*/ +#define __vma_lockdep_stat_mark_acquired(vma) \ + lock_acquired(&vma->vmlock_dep_map, _RET_IP_) + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -176,9 +207,9 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; int newcnt; - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); - + __vma_lockdep_release_read(vma); newcnt = __vma_refcount_put_return(vma); + /* * __vma_enter_locked() may be sleeping waiting for readers to drop * their reference count, so wake it up if we were the last reader @@ -207,7 +238,7 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int VM_REFCNT_LIMIT))) return false; - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + __vma_lockdep_acquire_read(vma); return true; } diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 6be1bbcde09e..85b2ae1d9720 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -72,7 +72,7 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) return 0; - rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + __vma_lockdep_acquire_exclusive(vma); err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, state); @@ -85,10 +85,10 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, WARN_ON_ONCE(!detaching); err = 0; } - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + __vma_lockdep_release_exclusive(vma); return err; } - lock_acquired(&vma->vmlock_dep_map, _RET_IP_); + __vma_lockdep_stat_mark_acquired(vma); return 1; } @@ -97,7 +97,7 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) { *detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt); - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + __vma_lockdep_release_exclusive(vma); } int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, @@ -204,7 +204,7 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, goto err; } - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + __vma_lockdep_acquire_read(vma); if (unlikely(vma->vm_mm != mm)) goto err_unstable; -- cgit v1.2.3 From 28f590f35da8435f75e2aee51431c6c1b8d91f54 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:16 +0000 Subject: mm/vma: clean up __vma_enter/exit_locked() These functions are very confusing indeed. 'Entering' a lock could be interpreted as acquiring it, but this is not what these functions are interacting with. Equally they don't indicate at all what kind of lock we are 'entering' or 'exiting'. Finally they are misleading as we invoke these functions when we already hold a write lock to detach a VMA. These functions are explicitly simply 'entering' and 'exiting' a state in which we hold the EXCLUSIVE lock in order that we can either mark the VMA as being write-locked, or mark the VMA detached. Rename the functions accordingly, and also update __vma_end_exclude_readers() to return detached state with a __must_check directive, as it is simply clumsy to pass an output pointer here to detached state and inconsistent vs. __vma_start_exclude_readers(). Finally, remove the unnecessary 'inline' directives. No functional change intended. 
Link: https://lkml.kernel.org/r/33273be9389712347d69987c408ca7436f0c1b22.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 4 ++-- mm/mmap_lock.c | 58 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 1887ca55ead7..d6df6aad3e24 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -211,8 +211,8 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) newcnt = __vma_refcount_put_return(vma); /* - * __vma_enter_locked() may be sleeping waiting for readers to drop - * their reference count, so wake it up if we were the last reader + * __vma_start_exclude_readers() may be sleeping waiting for readers to + * drop their reference count, so wake it up if we were the last reader * blocking it from being acquired. * * We may be raced by other readers temporarily incrementing the diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 1fabda07c922..72f15f606093 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -46,19 +46,44 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK -static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +/* + * Now that all readers have been evicted, mark the VMA as being out of the + * 'exclude readers' state. + * + * Returns true if the VMA is now detached, otherwise false. + */ +static bool __must_check __vma_end_exclude_readers(struct vm_area_struct *vma) { - *detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, - &vma->vm_refcnt); + bool detached; + + detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, + &vma->vm_refcnt); __vma_lockdep_release_exclusive(vma); + return detached; } /* - * __vma_enter_locked() returns 0 immediately if the vma is not - * attached, otherwise it waits for any current readers to finish and - * returns 1. Returns -EINTR if a signal is received while waiting. + * Mark the VMA as being in a state of excluding readers, check to see if any + * VMA read locks are indeed held, and if so wait for them to be released. + * + * Note that this function pairs with vma_refcount_put() which will wake up this + * thread when it detects that the last reader has released its lock. + * + * The state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases where we + * wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal signal + * is permitted to kill it. + * + * The function will return 0 immediately if the VMA is detached, or wait for + * readers and return 1 once they have all exited, leaving the VMA exclusively + * locked. + * + * If the function returns 1, the caller is required to invoke + * __vma_end_exclude_readers() once the exclusive state is no longer required. + * + * If state is set to something other than TASK_UNINTERRUPTIBLE, the function + * may also return -EINTR to indicate a fatal signal was received while waiting. 
*/ -static inline int __vma_enter_locked(struct vm_area_struct *vma, +static int __vma_start_exclude_readers(struct vm_area_struct *vma, bool detaching, int state) { int err; @@ -85,13 +110,10 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, refcount_read(&vma->vm_refcnt) == tgt_refcnt, state); if (err) { - bool detached; - - __vma_exit_locked(vma, &detached); - if (detached) { + if (__vma_end_exclude_readers(vma)) { /* * The wait failed, but the last reader went away - * as well. Tell the caller the VMA is detached. + * as well. Tell the caller the VMA is detached. */ WARN_ON_ONCE(!detaching); err = 0; @@ -108,7 +130,7 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, { int locked; - locked = __vma_enter_locked(vma, false, state); + locked = __vma_start_exclude_readers(vma, false, state); if (locked < 0) return locked; @@ -121,10 +143,10 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); if (locked) { - bool detached; + bool detached = __vma_end_exclude_readers(vma); - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(detached); /* vma should remain attached */ + /* The VMA should remain attached. */ + WARN_ON_ONCE(detached); } return 0; @@ -148,14 +170,14 @@ void vma_mark_detached(struct vm_area_struct *vma) */ if (unlikely(__vma_refcount_put_return(vma))) { /* Wait until vma is detached with no readers. */ - if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { + if (__vma_start_exclude_readers(vma, true, TASK_UNINTERRUPTIBLE)) { bool detached; /* * Once this is complete, no readers can increment the * reference count, and the VMA is marked detached. */ - __vma_exit_locked(vma, &detached); + detached = __vma_end_exclude_readers(vma); WARN_ON_ONCE(!detached); } } -- cgit v1.2.3 From e28e575af956c4c3089b443e87be91a6ff7af355 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:17 +0000 Subject: mm/vma: introduce helper struct + thread through exclusive lock fns It is confusing to have __vma_start_exclude_readers() return 0, 1 or an error (but only when waiting for readers in TASK_KILLABLE state), and having the return value be stored in a stack variable called 'locked' is further confusion. More generally, we are doing a lot of rather finnicky things during the acquisition of a state in which readers are excluded and moving out of this state, including tracking whether we are detached or not or whether an error occurred. We are implementing logic in __vma_start_exclude_readers() that effectively acts as if 'if one caller calls us do X, if another then do Y', which is very confusing from a control flow perspective. Introducing the shared helper object state helps us avoid this, as we can now handle the 'an error arose but we're detached' condition correctly in both callers - a warning if not detaching, and treating the situation as if no error arose in the case of a VMA detaching. This also acts to help document what's going on and allows us to add some more logical debug asserts. Also update vma_mark_detached() to add a guard clause for the likely 'already detached' state (given we hold the mmap write lock), and add a comment about ephemeral VMA read lock reference count increments to clarify why we are entering/exiting an exclusive locked state here. Finally, separate vma_mark_detached() into its fast-path component and make it inline, then place the slow path for excluding readers in mmap_lock.c. No functional change intended. 
[akpm@linux-foundation.org: fix function naming in comments, add comment per Vlastimil per Lorenzo] Link: https://lkml.kernel.org/r/7d3084d596c84da10dd374130a5055deba6439c0.1769198904.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/7d3084d596c84da10dd374130a5055deba6439c0.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 ++--- include/linux/mmap_lock.h | 23 ++++++- mm/mmap_lock.c | 152 +++++++++++++++++++++++++--------------------- 3 files changed, 112 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e608d22cab0..8731606d8d36 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1011,15 +1011,15 @@ struct vm_area_struct { * decrementing it again. * * VM_REFCNT_EXCLUDE_READERS_FLAG - Detached, pending - * __vma_exit_locked() completion which will decrement the reference - * count to zero. IMPORTANT - at this stage no further readers can - * increment the reference count. It can only be reduced. + * __vma_end_exclude_readers() completion which will decrement the + * reference count to zero. IMPORTANT - at this stage no further readers + * can increment the reference count. It can only be reduced. * * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either write-locking - * an attached VMA and has yet to invoke __vma_exit_locked(), OR a - * thread is detaching a VMA and is waiting on a single spurious reader - * in order to decrement the reference count. IMPORTANT - as above, no - * further readers can increment the reference count. + * an attached VMA and has yet to invoke __vma_end_exclude_readers(), + * OR a thread is detaching a VMA and is waiting on a single spurious + * reader in order to decrement the reference count. IMPORTANT - as + * above, no further readers can increment the reference count. * * > VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either * write-locking or detaching a VMA is waiting on readers to diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index d6df6aad3e24..678f90080fa6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -358,7 +358,28 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) refcount_set_release(&vma->vm_refcnt, 1); } -void vma_mark_detached(struct vm_area_struct *vma); +void __vma_exclude_readers_for_detach(struct vm_area_struct *vma); + +static inline void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * The VMA still being attached (refcnt > 0) - is unlikely, because the + * vma has been already write-locked and readers can increment vm_refcnt + * only temporarily before they check vm_lock_seq, realize the vma is + * locked and drop back the vm_refcnt. That is a narrow window for + * observing a raised vm_refcnt. + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. 
+ */ + if (likely(!__vma_refcount_put_return(vma))) + return; + + __vma_exclude_readers_for_detach(vma); +} struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 72f15f606093..490793ac88ed 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -46,20 +46,38 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK +/* State shared across __vma_[start, end]_exclude_readers. */ +struct vma_exclude_readers_state { + /* Input parameters. */ + struct vm_area_struct *vma; + int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */ + bool detaching; + + /* Output parameters. */ + bool detached; + bool exclusive; /* Are we exclusively locked? */ +}; + /* * Now that all readers have been evicted, mark the VMA as being out of the * 'exclude readers' state. - * - * Returns true if the VMA is now detached, otherwise false. */ -static bool __must_check __vma_end_exclude_readers(struct vm_area_struct *vma) +static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves) { - bool detached; + struct vm_area_struct *vma = ves->vma; - detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, - &vma->vm_refcnt); + VM_WARN_ON_ONCE(ves->detached); + + ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, + &vma->vm_refcnt); __vma_lockdep_release_exclusive(vma); - return detached; +} + +static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves) +{ + const unsigned int tgt = ves->detaching ? 0 : 1; + + return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG; } /* @@ -69,32 +87,29 @@ static bool __must_check __vma_end_exclude_readers(struct vm_area_struct *vma) * Note that this function pairs with vma_refcount_put() which will wake up this * thread when it detects that the last reader has released its lock. * - * The state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases where we - * wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal signal - * is permitted to kill it. + * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases + * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal + * signal is permitted to kill it. * - * The function will return 0 immediately if the VMA is detached, or wait for - * readers and return 1 once they have all exited, leaving the VMA exclusively - * locked. + * The function sets the ves->exclusive parameter to true if readers were + * excluded, or false if the VMA was detached or an error arose on wait. * - * If the function returns 1, the caller is required to invoke - * __vma_end_exclude_readers() once the exclusive state is no longer required. + * If the function indicates an exclusive lock was acquired via ves->exclusive + * the caller is required to invoke __vma_end_exclude_readers() once the + * exclusive state is no longer required. * - * If state is set to something other than TASK_UNINTERRUPTIBLE, the function - * may also return -EINTR to indicate a fatal signal was received while waiting. + * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the + * function may also return -EINTR to indicate a fatal signal was received while + * waiting. Otherwise, the function returns 0. 
*/ -static int __vma_start_exclude_readers(struct vm_area_struct *vma, - bool detaching, int state) +static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves) { - int err; - unsigned int tgt_refcnt = VM_REFCNT_EXCLUDE_READERS_FLAG; + struct vm_area_struct *vma = ves->vma; + unsigned int tgt_refcnt = get_target_refcnt(ves); + int err = 0; mmap_assert_write_locked(vma->vm_mm); - /* Additional refcnt if the vma is attached. */ - if (!detaching) - tgt_refcnt++; - /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). @@ -102,37 +117,39 @@ static int __vma_start_exclude_readers(struct vm_area_struct *vma, * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ - if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) + if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) { + ves->detached = true; return 0; + } __vma_lockdep_acquire_exclusive(vma); err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, - state); + ves->state); if (err) { - if (__vma_end_exclude_readers(vma)) { - /* - * The wait failed, but the last reader went away - * as well. Tell the caller the VMA is detached. - */ - WARN_ON_ONCE(!detaching); - err = 0; - } + __vma_end_exclude_readers(ves); return err; } - __vma_lockdep_stat_mark_acquired(vma); - return 1; + __vma_lockdep_stat_mark_acquired(vma); + ves->exclusive = true; + return 0; } int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, int state) { - int locked; + int err; + struct vma_exclude_readers_state ves = { + .vma = vma, + .state = state, + }; - locked = __vma_start_exclude_readers(vma, false, state); - if (locked < 0) - return locked; + err = __vma_start_exclude_readers(&ves); + if (err) { + WARN_ON_ONCE(ves.detached); + return err; + } /* * We should use WRITE_ONCE() here because we can have concurrent reads @@ -142,45 +159,42 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - if (locked) { - bool detached = __vma_end_exclude_readers(vma); - - /* The VMA should remain attached. */ - WARN_ON_ONCE(detached); + if (ves.exclusive) { + __vma_end_exclude_readers(&ves); + /* VMA should remain attached. */ + WARN_ON_ONCE(ves.detached); } return 0; } EXPORT_SYMBOL_GPL(__vma_start_write); -void vma_mark_detached(struct vm_area_struct *vma) +void __vma_exclude_readers_for_detach(struct vm_area_struct *vma) { - vma_assert_write_locked(vma); - vma_assert_attached(vma); + struct vma_exclude_readers_state ves = { + .vma = vma, + .state = TASK_UNINTERRUPTIBLE, + .detaching = true, + }; + int err; /* - * This condition - that the VMA is still attached (refcnt > 0) - is - * unlikely, because the vma has been already write-locked and readers - * can increment vm_refcnt only temporarily before they check - * vm_lock_seq, realize the vma is locked and drop back the - * vm_refcnt. That is a narrow window for observing a raised vm_refcnt. - * - * See the comment describing the vm_area_struct->vm_refcnt field for - * details of possible refcnt values. + * Wait until the VMA is detached with no readers. Since we hold the VMA + * write lock, the only read locks that might be present are those from + * threads trying to acquire the read lock and incrementing the + * reference count before realising the write lock is held and + * decrementing it. 
*/ - if (unlikely(__vma_refcount_put_return(vma))) { - /* Wait until vma is detached with no readers. */ - if (__vma_start_exclude_readers(vma, true, TASK_UNINTERRUPTIBLE)) { - bool detached; - - /* - * Once this is complete, no readers can increment the - * reference count, and the VMA is marked detached. - */ - detached = __vma_end_exclude_readers(vma); - WARN_ON_ONCE(!detached); - } + err = __vma_start_exclude_readers(&ves); + if (!err && ves.exclusive) { + /* + * Once this is complete, no readers can increment the + * reference count, and the VMA is marked detached. + */ + __vma_end_exclude_readers(&ves); } + /* If an error arose but we were detached anyway, we don't care. */ + WARN_ON_ONCE(!ves.detached); } /* -- cgit v1.2.3 From 22f7639f2f030e58cb55ad8438c77dfcea951fc3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:18 +0000 Subject: mm/vma: improve and document __is_vma_write_locked() We don't actually need to return an output parameter providing mm sequence number, rather we can separate that out into another function - __vma_raw_mm_seqnum() - and have any callers which need to obtain that invoke that instead. The access to the raw sequence number requires that we hold the exclusive mmap lock such that we know we can't race vma_end_write_all(), so move the assert to __vma_raw_mm_seqnum() to make this requirement clear. Also while we're here, convert all of the VM_BUG_ON_VMA()'s to VM_WARN_ON_ONCE_VMA()'s in line with the convention that we do not invoke oopses when we can avoid it. [lorenzo.stoakes@oracle.com: minor tweaks, per Vlastimil] Link: https://lkml.kernel.org/r/3fa89c13-232d-4eee-86cc-96caa75c2c67@lucifer.local Link: https://lkml.kernel.org/r/ef6c415c2d2c03f529dca124ccaed66bc2f60edc.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 45 ++++++++++++++++++++++++--------------------- mm/mmap_lock.c | 6 +++--- 2 files changed, 27 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 678f90080fa6..1746a172a81c 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -258,21 +258,31 @@ static inline void vma_end_read(struct vm_area_struct *vma) vma_refcount_put(vma); } -/* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline unsigned int __vma_raw_mm_seqnum(struct vm_area_struct *vma) { + const struct mm_struct *mm = vma->vm_mm; + + /* We must hold an exclusive write lock for this access to be valid. */ mmap_assert_write_locked(vma->vm_mm); + return mm->mm_lock_seq.sequence; +} +/* + * Determine whether a VMA is write-locked. Must be invoked ONLY if the mmap + * write lock is held. + * + * Returns true if write-locked, otherwise false. + */ +static inline bool __is_vma_write_locked(struct vm_area_struct *vma) +{ /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. 
*/ - *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; - return (vma->vm_lock_seq == *mm_lock_seq); + return vma->vm_lock_seq == __vma_raw_mm_seqnum(vma); } -int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, - int state); +int __vma_start_write(struct vm_area_struct *vma, int state); /* * Begin writing to a VMA. @@ -281,12 +291,10 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, */ static inline void vma_start_write(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) + if (__is_vma_write_locked(vma)) return; - __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE); + __vma_start_write(vma, TASK_UNINTERRUPTIBLE); } /** @@ -305,30 +313,25 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline __must_check int vma_start_write_killable(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) + if (__is_vma_write_locked(vma)) return 0; - return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE); + + return __vma_start_write(vma, TASK_KILLABLE); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); + VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - /* * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ - VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && - !__is_vma_write_locked(vma, &mm_lock_seq), vma); + VM_WARN_ON_ONCE_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma), vma); } static inline bool vma_is_attached(struct vm_area_struct *vma) diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 490793ac88ed..898c2ef1e958 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -136,14 +136,14 @@ static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves) return 0; } -int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, - int state) +int __vma_start_write(struct vm_area_struct *vma, int state) { - int err; + const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma); struct vma_exclude_readers_state ves = { .vma = vma, .state = state, }; + int err; err = __vma_start_exclude_readers(&ves); if (err) { -- cgit v1.2.3 From 256c11937de0039253ee36ed7d1cabc852beae54 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:19 +0000 Subject: mm/vma: update vma_assert_locked() to use lockdep We can use lockdep to avoid unnecessary work here, otherwise update the code to logically evaluate all pertinent cases and share code with vma_assert_write_locked(). Make it clear here that we treat the VMA being detached at this point as a bug, this was only implicit before. Additionally, abstract references to vma->vmlock_dep_map by introducing a macro helper __vma_lockdep_map() which accesses this field if lockdep is enabled. Since lock_is_held() is specified as an extern function if lockdep is disabled, we can simply have __vma_lockdep_map() defined as NULL in this case, and then use IS_ENABLED(CONFIG_LOCKDEP) to avoid ugly ifdeffery. 
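In isolation the trick looks like the sketch below (illustrative only - 'foo' and its helpers are made-up stand-ins, not kernel API): the map accessor degrades to NULL when lockdep is out, and because lock_is_held() remains declared in that configuration, the IS_ENABLED() branch still compiles and is then discarded as dead code:

#ifdef CONFIG_LOCKDEP
#define foo_lockdep_map(foo)	(&(foo)->dep_map)
#else
#define foo_lockdep_map(foo)	NULL
#endif

static inline void foo_assert_locked(struct foo *foo)
{
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		/* Branch never emitted without lockdep, so NULL is never passed at runtime. */
		if (!lock_is_held(foo_lockdep_map(foo)))
			foo_assert_write_locked(foo);
		return;
	}

	/* Cheaper, less precise fallback. */
	foo_assert_write_locked(foo);
}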
[lorenzo.stoakes@oracle.com: add helper macro __vma_lockdep_map(), per Vlastimil] Link: https://lkml.kernel.org/r/7c4b722e-604b-4b20-8e33-03d2f8d55407@lucifer.local Link: https://lkml.kernel.org/r/538762f079cc4fa76ff8bf30a8a9525a09961451.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 56 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 1746a172a81c..90fc32b683dd 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -78,6 +78,12 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) #ifdef CONFIG_PER_VMA_LOCK +#ifdef CONFIG_LOCKDEP +#define __vma_lockdep_map(vma) (&vma->vmlock_dep_map) +#else +#define __vma_lockdep_map(vma) NULL +#endif + /* * VMA locks do not behave like most ordinary locks found in the kernel, so we * cannot quite have full lockdep tracking in the way we would ideally prefer. @@ -98,16 +104,16 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) * so we utilise lockdep to do so. */ #define __vma_lockdep_acquire_read(vma) \ - lock_acquire_shared(&vma->vmlock_dep_map, 0, 1, NULL, _RET_IP_) + lock_acquire_shared(__vma_lockdep_map(vma), 0, 1, NULL, _RET_IP_) #define __vma_lockdep_release_read(vma) \ - lock_release(&vma->vmlock_dep_map, _RET_IP_) + lock_release(__vma_lockdep_map(vma), _RET_IP_) #define __vma_lockdep_acquire_exclusive(vma) \ - lock_acquire_exclusive(&vma->vmlock_dep_map, 0, 0, NULL, _RET_IP_) + lock_acquire_exclusive(__vma_lockdep_map(vma), 0, 0, NULL, _RET_IP_) #define __vma_lockdep_release_exclusive(vma) \ - lock_release(&vma->vmlock_dep_map, _RET_IP_) + lock_release(__vma_lockdep_map(vma), _RET_IP_) /* Only meaningful if CONFIG_LOCK_STAT is defined. */ #define __vma_lockdep_stat_mark_acquired(vma) \ - lock_acquired(&vma->vmlock_dep_map, _RET_IP_) + lock_acquired(__vma_lockdep_map(vma), _RET_IP_) static inline void mm_lock_seqcount_init(struct mm_struct *mm) { @@ -146,7 +152,7 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key lockdep_key; - lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); + lockdep_init_map(__vma_lockdep_map(vma), "vm_lock", &lockdep_key, 0); #endif if (reset_refcnt) refcount_set(&vma->vm_refcnt, 0); @@ -319,19 +325,53 @@ int vma_start_write_killable(struct vm_area_struct *vma) return __vma_start_write(vma, TASK_KILLABLE); } +/** + * vma_assert_write_locked() - assert that @vma holds a VMA write lock. + * @vma: The VMA to assert. + */ static inline void vma_assert_write_locked(struct vm_area_struct *vma) { VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma); } +/** + * vma_assert_locked() - assert that @vma holds either a VMA read or a VMA write + * lock and is not detached. + * @vma: The VMA to assert. + */ static inline void vma_assert_locked(struct vm_area_struct *vma) { + unsigned int refcnt; + + if (IS_ENABLED(CONFIG_LOCKDEP)) { + if (!lock_is_held(__vma_lockdep_map(vma))) + vma_assert_write_locked(vma); + return; + } + /* * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. 
*/ - VM_WARN_ON_ONCE_VMA(refcount_read(&vma->vm_refcnt) <= 1 && - !__is_vma_write_locked(vma), vma); + refcnt = refcount_read(&vma->vm_refcnt); + + /* + * In this case we're either read-locked, write-locked with temporary + * readers, or in the midst of excluding readers, all of which means + * we're locked. + */ + if (refcnt > 1) + return; + + /* It is a bug for the VMA to be detached here. */ + VM_WARN_ON_ONCE_VMA(!refcnt, vma); + + /* + * OK, the VMA has a reference count of 1 which means it is either + * unlocked and attached or write-locked, so assert that it is + * write-locked. + */ + vma_assert_write_locked(vma); } static inline bool vma_is_attached(struct vm_area_struct *vma) -- cgit v1.2.3 From 17fd82c3abe03c1e202959bb1a7c4ab448b36bef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:20 +0000 Subject: mm/vma: add and use vma_assert_stabilised() Sometimes we wish to assert that a VMA is stable, that is - the VMA cannot be changed underneath us. This will be the case if EITHER the VMA lock or the mmap lock is held. In order to do so, we introduce a new assert vma_assert_stabilised() - this will make a lockdep assert if lockdep is enabled AND the VMA is read-locked. Currently lockdep tracking for VMA write locks is not implemented, so it suffices to check in this case that we have either an mmap read or write semaphore held. Note that because the VMA lock uses the non-standard vmlock_dep_map naming convention, we cannot use lockdep_assert_is_write_held() so have to open code this ourselves via lockdep-asserting that lock_is_held_type(&vma->vmlock_dep_map, 0). We have to be careful here - for instance when merging a VMA, we use the mmap write lock to stabilise the examination of adjacent VMAs which might be simultaneously VMA read-locked whilst being faulted in. If we were to assert VMA read lock using lockdep we would encounter an incorrect lockdep assert. Also, we have to be careful about asserting mmap locks are held - if we try to address the above issue by first checking whether mmap lock is held and if so asserting it via lockdep, we may find that we were raced by another thread acquiring an mmap read lock simultaneously that either we don't own (and thus can be released any time - so we are not stable) or was indeed released since we last checked. So to deal with these complexities we end up with either a precise (if lockdep is enabled) or imprecise (if not) approach - in the first instance we assert the lock is held using lockdep and thus whether we own it. If we do own it, then the check is complete, otherwise we must check for the VMA read lock being held (VMA write lock implies mmap write lock so the mmap lock suffices for this). If lockdep is not enabled we simply check if the mmap lock is held and risk a false negative (i.e. not asserting when we should do). There are a couple places in the kernel where we already do this stabliisation check - the anon_vma_name() helper in mm/madvise.c and vma_flag_set_atomic() in include/linux/mm.h, which we update to use vma_assert_stabilised(). This change abstracts these into vma_assert_stabilised(), uses lockdep if possible, and avoids a duplicate check of whether the mmap lock is held. This is also self-documenting and lays the foundations for further VMA stability checks in the code. The only functional change here is adding the lockdep check. 
Link: https://lkml.kernel.org/r/6c9e64bb2b56ddb6f806fde9237f8a00cb3a776b.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +---- include/linux/mmap_lock.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++ mm/madvise.c | 4 +--- 3 files changed, 54 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index aa90719234f1..2c6c6d00ed73 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1008,10 +1008,7 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, { unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); - /* mmap read lock/VMA read lock must be held. */ - if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) - vma_assert_locked(vma); - + vma_assert_stabilised(vma); if (__vma_flag_atomic_valid(vma, bit)) set_bit((__force int)bit, bitmap); } diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 90fc32b683dd..93eca48bc443 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -374,6 +374,52 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } +/** + * vma_assert_stabilised() - assert that this VMA cannot be changed from + * underneath us either by having a VMA or mmap lock held. + * @vma: The VMA whose stability we wish to assess. + * + * If lockdep is enabled we can precisely ensure stability via either an mmap + * lock owned by us or a specific VMA lock. + * + * With lockdep disabled we may sometimes race with other threads acquiring the + * mmap read lock simultaneous with our VMA read lock. + */ +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* + * If another thread owns an mmap lock, it may go away at any time, and + * thus is no guarantee of stability. + * + * If lockdep is enabled we can accurately determine if an mmap lock is + * held and owned by us. Otherwise we must approximate. + * + * It doesn't necessarily mean we are not stabilised however, as we may + * hold a VMA read lock (not a write lock as this would require an owned + * mmap lock). + * + * If (assuming lockdep is not enabled) we were to assert a VMA read + * lock first we may also run into issues, as other threads can hold VMA + * read locks simlutaneous to us. + * + * Therefore if lockdep is not enabled we risk a false negative (i.e. no + * assert fired). If accurate checking is required, enable lockdep. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + if (lockdep_is_held(&vma->vm_mm->mmap_lock)) + return; + } else { + if (rwsem_is_locked(&vma->vm_mm->mmap_lock)) + return; + } + + /* + * We're not stabilised by the mmap lock, so assert that we're + * stabilised by a VMA lock. + */ + vma_assert_locked(vma); +} + static inline bool vma_is_attached(struct vm_area_struct *vma) { return refcount_read(&vma->vm_refcnt); @@ -476,6 +522,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); } +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* If no VMA locks, then either mmap lock suffices to stabilise. 
*/ + mmap_assert_locked(vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_write_lock(struct mm_struct *mm) diff --git a/mm/madvise.c b/mm/madvise.c index 863d55b8a658..19cf480eed49 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -109,9 +109,7 @@ void anon_vma_name_free(struct kref *kref) struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) - vma_assert_locked(vma); - + vma_assert_stabilised(vma); return vma->anon_name; } -- cgit v1.2.3 From bc617c990eae4259cd5014d596477cbe0d596417 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Sat, 20 Dec 2025 03:43:37 +0800 Subject: mm/shmem, swap: remove SWAP_MAP_SHMEM The SWAP_MAP_SHMEM state was introduced in the commit aaa468653b4a ("swap_info: note SWAP_MAP_SHMEM"), to quickly determine if a swap entry belongs to shmem during swapoff. However, swapoff has since been rewritten in the commit b56a2d8af914 ("mm: rid swapoff of quadratic complexity"). Now having swap count == SWAP_MAP_SHMEM value is basically the same as having swap count == 1, and swap_shmem_alloc() behaves analogously to swap_duplicate(). The only difference of note is that swap_shmem_alloc() does not check for -ENOMEM returned from __swap_duplicate(), but it is OK because shmem never re-duplicates any swap entry it owns. This will stil be safe if we use (batched) swap_duplicate() instead. This commit adds swap_duplicate_nr(), the batched variant of swap_duplicate(), and removes the SWAP_MAP_SHMEM state and the associated swap_shmem_alloc() helper to simplify the state machine (both mentally and in terms of actual code). We will also have an extra state/special value that can be repurposed (for swap entries that never gets re-duplicated). Link: https://lkml.kernel.org/r/20251220-swap-table-p2-v5-8-8862a265a033@tencent.com Signed-off-by: Kairui Song Signed-off-by: Nhat Pham Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Rafael J. 
Wysocki (Intel) Cc: Yosry Ahmed Cc: Deepanshu Kartikey Cc: Johannes Weiner Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 15 +++++++-------- mm/shmem.c | 2 +- mm/swapfile.c | 42 +++++++++++++++++------------------------- 3 files changed, 25 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 38ca3df68716..bf72b548a96d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -230,7 +230,6 @@ enum { /* Special value in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max count */ #define SWAP_MAP_BAD 0x3f /* Note page is bad */ -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ @@ -458,8 +457,7 @@ bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t, int); -extern int swap_duplicate(swp_entry_t); +extern int swap_duplicate_nr(swp_entry_t entry, int nr); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -514,11 +512,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp, int nr) -{ -} - -static inline int swap_duplicate(swp_entry_t swp) +static inline int swap_duplicate_nr(swp_entry_t swp, int nr_pages) { return 0; } @@ -569,6 +563,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis, } #endif /* CONFIG_SWAP */ +static inline int swap_duplicate(swp_entry_t entry) +{ + return swap_duplicate_nr(entry, 1); +} + static inline void free_swap_and_cache(swp_entry_t entry) { free_swap_and_cache_nr(entry, 1); diff --git a/mm/shmem.c b/mm/shmem.c index c60392d054e2..dd4951d6f891 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1690,7 +1690,7 @@ try_split: spin_unlock(&shmem_swaplist_lock); } - swap_shmem_alloc(folio->swap, nr_pages); + swap_duplicate_nr(folio->swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); diff --git a/mm/swapfile.c b/mm/swapfile.c index ea02d9795126..eb394f30181a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -201,7 +201,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; unsigned char count = *map; - if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) + if (swap_count(count) != 1) return false; while (++map < map_end) { @@ -1523,12 +1523,6 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); has_cache = 0; - } else if (count == SWAP_MAP_SHMEM) { - /* - * Or we could insist on shmem.c using a special - * swap_shmem_free() and free_shmem_swap_and_cache()... 
- */ - count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(si, offset, count)) @@ -1626,7 +1620,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si, if (nr <= 1) goto fallback; count = swap_count(data_race(si->swap_map[offset])); - if (count != 1 && count != SWAP_MAP_SHMEM) + if (count != 1) goto fallback; ci = swap_cluster_lock(si, offset); @@ -1680,12 +1674,10 @@ static bool swap_entries_put_map_nr(struct swap_info_struct *si, /* * Check if it's the last ref of swap entry in the freeing path. - * Qualified value includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. */ static inline bool __maybe_unused swap_is_last_ref(unsigned char count) { - return (count == SWAP_HAS_CACHE) || (count == 1) || - (count == SWAP_MAP_SHMEM); + return (count == SWAP_HAS_CACHE) || (count == 1); } /* @@ -3678,7 +3670,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - VM_WARN_ON(usage == 1 && nr > 1); ci = swap_cluster_lock(si, offset); err = 0; @@ -3738,27 +3729,28 @@ unlock_out: return err; } -/* - * Help swapoff by noting that swap entry belongs to shmem/tmpfs - * (in which case its reference count is never incremented). - */ -void swap_shmem_alloc(swp_entry_t entry, int nr) -{ - __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); -} - -/* - * Increase reference count of swap entry by 1. +/** + * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries + * by 1. + * + * @entry: first swap entry from which we want to increase the refcount. + * @nr: Number of entries in range. + * * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. + * + * Note that we are currently not handling the case where nr > 1 and we need to + * add swap count continuation. This is OK, because no such user exists - shmem + * is the only user that can pass nr > 1, and it never re-duplicates any swap + * entry it owns. */ -int swap_duplicate(swp_entry_t entry) +int swap_duplicate_nr(swp_entry_t entry, int nr) { int err = 0; - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) + while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } -- cgit v1.2.3 From 2732acda82c93475c5986e1a5640004a5d4f9c3e Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sat, 20 Dec 2025 03:43:41 +0800 Subject: mm, swap: use swap cache as the swap in synchronize layer Current swap in synchronization mostly uses the swap_map's SWAP_HAS_CACHE bit. Whoever sets the bit first does the actual work to swap in a folio. This has been causing many issues as it's just a poor implementation of a bit lock. Raced users have no idea what is pinning a slot, so it has to loop with a schedule_timeout_uninterruptible(1), which is ugly and causes long-tailing or other performance issues. Besides, the abuse of SWAP_HAS_CACHE has been causing many other troubles for synchronization or maintenance. This is the first step to remove this bit completely. Now all swap in paths are using the swap cache, and both the swap cache and swap map are protected by the cluster lock. 
So we can just resolve the swap synchronization with the swap cache layer directly using the cluster lock and folio lock. Whoever inserts a folio in the swap cache first does the swap in work. And because folios are locked during swap operations, other raced swap operations will just wait on the folio lock. The SWAP_HAS_CACHE will be removed in later commit. For now, we still set it for some remaining users. But now we do the bit setting and swap cache folio adding in the same critical section, after swap cache is ready. No one will have to spin on the SWAP_HAS_CACHE bit anymore. This both simplifies the logic and should improve the performance, eliminating issues like the one solved in commit 01626a1823024 ("mm: avoid unconditional one-tick sleep when swapcache_prepare fails"), or the "skip_if_exists" from commit a65b0e7607ccb ("zswap: make shrinking memcg-aware"), which will be removed very soon. [kasong@tencent.com: fix cgroup v1 accounting issue] Link: https://lkml.kernel.org/r/CAMgjq7CGUnzOVG7uSaYjzw9wD7w2dSKOHprJfaEp4CcGLgE3iw@mail.gmail.com Link: https://lkml.kernel.org/r/20251220-swap-table-p2-v5-12-8862a265a033@tencent.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: Nhat Pham Cc: Rafael J. Wysocki (Intel) Cc: Yosry Ahmed Cc: Deepanshu Kartikey Cc: Johannes Weiner Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 --- mm/swap.h | 15 +++++++- mm/swap_state.c | 105 ++++++++++++++++++++++++++++----------------------- mm/swapfile.c | 39 ++++++++++++------- mm/vmscan.c | 3 +- 5 files changed, 97 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index bf72b548a96d..74df3004c850 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -458,7 +458,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern int swap_duplicate_nr(swp_entry_t entry, int nr); -extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -517,11 +516,6 @@ static inline int swap_duplicate_nr(swp_entry_t swp, int nr_pages) return 0; } -static inline int swapcache_prepare(swp_entry_t swp, int nr) -{ - return 0; -} - static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } diff --git a/mm/swap.h b/mm/swap.h index 2f79458b37f3..e427240073e9 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -234,6 +234,14 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, return folio_entry.val == round_down(entry.val, nr_pages); } +/* Temporary internal helpers */ +void __swapcache_set_cached(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry); +void __swapcache_clear_cached(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr); + /* * All swap cache helpers below require the caller to ensure the swap entries * used are valid and stabilize the device by any of the following ways: @@ -247,7 +255,8 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, */ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); +int swap_cache_add_folio(struct folio *folio, 
swp_entry_t entry, + void **shadow, bool alloc); void swap_cache_del_folio(struct folio *folio); struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, @@ -413,8 +422,10 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) +static inline int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + void **shadow, bool alloc) { + return -ENOENT; } static inline void swap_cache_del_folio(struct folio *folio) diff --git a/mm/swap_state.c b/mm/swap_state.c index d58bce532d95..22990c5259cc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -127,34 +127,64 @@ void *swap_cache_get_shadow(swp_entry_t entry) * @entry: The swap entry corresponding to the folio. * @gfp: gfp_mask for XArray node allocation. * @shadowp: If a shadow is found, return the shadow. + * @alloc: If it's the allocator that is trying to insert a folio. Allocator + * sets SWAP_HAS_CACHE to pin slots before insert so skip map update. * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. - * The caller also needs to update the corresponding swap_map slots with - * SWAP_HAS_CACHE bit to avoid race or conflict. */ -void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) +int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + void **shadowp, bool alloc) { + int err; void *shadow = NULL; + struct swap_info_struct *si; unsigned long old_tb, new_tb; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_start, ci_off, ci_end, offset; unsigned long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + si = __swap_entry_to_info(entry); new_tb = folio_to_swp_tb(folio); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + offset = swp_offset(entry); + ci = swap_cluster_lock(si, swp_offset(entry)); + if (unlikely(!ci->table)) { + err = -ENOENT; + goto failed; + } do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); - WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + old_tb = __swap_table_get(ci, ci_off); + if (unlikely(swp_tb_is_folio(old_tb))) { + err = -EEXIST; + goto failed; + } + if (!alloc && unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + err = -ENOENT; + goto failed; + } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); + offset++; + } while (++ci_off < ci_end); + + ci_off = ci_start; + offset = swp_offset(entry); + do { + /* + * Still need to pin the slots with SWAP_HAS_CACHE since + * swap allocator depends on that. 
+ */ + if (!alloc) + __swapcache_set_cached(si, ci, swp_entry(swp_type(entry), offset)); + __swap_table_set(ci, ci_off, new_tb); + offset++; } while (++ci_off < ci_end); folio_ref_add(folio, nr_pages); @@ -167,6 +197,11 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp if (shadowp) *shadowp = shadow; + return 0; + +failed: + swap_cluster_unlock(ci); + return err; } /** @@ -185,6 +220,7 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + struct swap_info_struct *si; unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); @@ -194,6 +230,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + si = __swap_entry_to_info(entry); new_tb = shadow_swp_to_tb(shadow); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; @@ -209,6 +246,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, folio_clear_swapcache(folio); node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); + __swapcache_clear_cached(si, ci, entry, nr_pages); } /** @@ -230,7 +268,6 @@ void swap_cache_del_folio(struct folio *folio) __swap_cache_del_folio(ci, folio, entry, NULL); swap_cluster_unlock(ci); - put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } @@ -422,67 +459,37 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, gfp_t gfp, bool charged, bool skip_if_exists) { - struct folio *swapcache; + struct folio *swapcache = NULL; void *shadow; int ret; - /* - * Check and pin the swap map with SWAP_HAS_CACHE, then add the folio - * into the swap cache. Loop with a schedule delay if raced with - * another process setting SWAP_HAS_CACHE. This hackish loop will - * be fixed very soon. - */ + __folio_set_locked(folio); + __folio_set_swapbacked(folio); for (;;) { - ret = swapcache_prepare(entry, folio_nr_pages(folio)); + ret = swap_cache_add_folio(folio, entry, &shadow, false); if (!ret) break; /* - * The skip_if_exists is for protecting against a recursive - * call to this helper on the same entry waiting forever - * here because SWAP_HAS_CACHE is set but the folio is not - * in the swap cache yet. This can happen today if - * mem_cgroup_swapin_charge_folio() below triggers reclaim - * through zswap, which may call this helper again in the - * writeback path. - * - * Large order allocation also needs special handling on + * Large order allocation needs special handling on * race: if a smaller folio exists in cache, swapin needs * to fallback to order 0, and doing a swap cache lookup * might return a folio that is irrelevant to the faulting * entry because @entry is aligned down. Just return NULL. */ if (ret != -EEXIST || skip_if_exists || folio_test_large(folio)) - return NULL; + goto failed; - /* - * Check the swap cache again, we can only arrive - * here because swapcache_prepare returns -EEXIST. - */ swapcache = swap_cache_get_folio(entry); if (swapcache) - return swapcache; - - /* - * We might race against __swap_cache_del_folio(), and - * stumble across a swap_map entry whose SWAP_HAS_CACHE - * has not yet been cleared. 
Or race against another - * swap_cache_alloc_folio(), which has set SWAP_HAS_CACHE - * in swap_map, but not yet added its folio to swap cache. - */ - schedule_timeout_uninterruptible(1); + goto failed; } - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { - put_swap_folio(folio, entry); - folio_unlock(folio); - return NULL; + swap_cache_del_folio(folio); + goto failed; } - swap_cache_add_folio(folio, entry, &shadow); memcg1_swapin(entry, folio_nr_pages(folio)); if (shadow) workingset_refault(folio, shadow); @@ -490,6 +497,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, /* Caller will initiate read into locked folio */ folio_add_lru(folio); return folio; + +failed: + folio_unlock(folio); + return swapcache; } /** diff --git a/mm/swapfile.c b/mm/swapfile.c index ced53aba3f4c..64970ee11fcf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1476,7 +1476,11 @@ again: if (!entry.val) return -ENOMEM; - swap_cache_add_folio(folio, entry, NULL); + /* + * Allocator has pinned the slots with SWAP_HAS_CACHE + * so it should never fail + */ + WARN_ON_ONCE(swap_cache_add_folio(folio, entry, NULL, true)); return 0; @@ -1582,9 +1586,8 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, * do_swap_page() * ... swapoff+swapon * swap_cache_alloc_folio() - * swapcache_prepare() - * __swap_duplicate() - * // check swap_map + * swap_cache_add_folio() + * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before @@ -3769,17 +3772,25 @@ int swap_duplicate_nr(swp_entry_t entry, int nr) return err; } -/* - * @entry: first swap entry from which we allocate nr swap cache. - * - * Called when allocating swap cache for existing swap entries, - * This can return error codes. Returns 0 at success. - * -EEXIST means there is a swap cache. - * Note: return code is different from swap_duplicate(). 
- */ -int swapcache_prepare(swp_entry_t entry, int nr) +/* Mark the swap map as HAS_CACHE, caller need to hold the cluster lock */ +void __swapcache_set_cached(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry) +{ + WARN_ON(swap_dup_entries(si, ci, swp_offset(entry), SWAP_HAS_CACHE, 1)); +} + +/* Clear the swap map as !HAS_CACHE, caller need to hold the cluster lock */ +void __swapcache_clear_cached(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr) { - return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); + if (swap_only_has_cache(si, swp_offset(entry), nr)) { + swap_entries_free(si, ci, entry, nr); + } else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 1d281174164e..973ffb9813ea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -757,10 +757,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __swap_cache_del_folio(ci, folio, swap, shadow); memcg1_swapout(folio, swap); + __swap_cache_del_folio(ci, folio, swap, shadow); swap_cluster_unlock_irq(ci); - put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); -- cgit v1.2.3 From 36976159140bc288c3752a9b799090a49f1a8b62 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sat, 20 Dec 2025 03:43:43 +0800 Subject: mm, swap: cleanup swap entry management workflow The current swap entry allocation/freeing workflow has never had a clear definition. This makes it hard to debug or add new optimizations. This commit introduces a proper definition of how swap entries would be allocated and freed. Now, most operations are folio based, so they will never exceed one swap cluster, and we now have a cleaner border between swap and the rest of mm, making it much easier to follow and debug, especially with new added sanity checks. Also making more optimization possible. Swap entry will be mostly freed and free with a folio bound. The folio lock will be useful for resolving many swap related races. Now swap allocation (except hibernation) always starts with a folio in the swap cache, and gets duped/freed protected by the folio lock: - folio_alloc_swap() - The only allocation entry point now. Context: The folio must be locked. This allocates one or a set of continuous swap slots for a folio and binds them to the folio by adding the folio to the swap cache. The swap slots' swap count start with zero value. - folio_dup_swap() - Increase the swap count of one or more entries. Context: The folio must be locked and in the swap cache. For now, the caller still has to lock the new swap entry owner (e.g., PTL). This increases the ref count of swap entries allocated to a folio. Newly allocated swap slots' count has to be increased by this helper as the folio got unmapped (and swap entries got installed). - folio_put_swap() - Decrease the swap count of one or more entries. Context: The folio must be locked and in the swap cache. For now, the caller still has to lock the new swap entry owner (e.g., PTL). This decreases the ref count of swap entries allocated to a folio. Typically, swapin will decrease the swap count as the folio got installed back and the swap entry got uninstalled This won't remove the folio from the swap cache and free the slot. Lazy freeing of swap cache is helpful for reducing IO. 
There is already a folio_free_swap() for immediate cache reclaim. This part could be further optimized later. The above locking constraints could be further relaxed when the swap table is fully implemented. Currently dup still needs the caller to lock the swap entry container (e.g. PTL), or a concurrent zap may underflow the swap count. Some swap users need to interact with swap count without involving folio (e.g. forking/zapping the page table or mapping truncate without swapin). In such cases, the caller has to ensure there is no race condition on whatever owns the swap count and use the below helpers: - swap_put_entries_direct() - Decrease the swap count directly. Context: The caller must lock whatever is referencing the slots to avoid a race. Typically the page table zapping or shmem mapping truncate will need to free swap slots directly. If a slot is cached (has a folio bound), this will also try to release the swap cache. - swap_dup_entry_direct() - Increase the swap count directly. Context: The caller must lock whatever is referencing the entries to avoid race, and the entries must already have a swap count > 1. Typically, forking will need to copy the page table and hence needs to increase the swap count of the entries in the table. The page table is locked while referencing the swap entries, so the entries all have a swap count > 1 and can't be freed. Hibernation subsystem is a bit different, so two special wrappers are here: - swap_alloc_hibernation_slot() - Allocate one entry from one device. - swap_free_hibernation_slot() - Free one entry allocated by the above helper. All hibernation entries are exclusive to the hibernation subsystem and should not interact with ordinary swap routines. By separating the workflows, it will be possible to bind folio more tightly with swap cache and get rid of the SWAP_HAS_CACHE as a temporary pin. This commit should not introduce any behavior change [kasong@tencent.com: fix leak, per Chris Mason. Remove WARN_ON, per Lai Yi] Link: https://lkml.kernel.org/r/CAMgjq7AUz10uETVm8ozDWcB3XohkOqf0i33KGrAquvEVvfp5cg@mail.gmail.com [ryncsn@gmail.com: fix KSM copy pages for swapoff, per Chris] Link: https://lkml.kernel.org/r/aXxkANcET3l2Xu6J@KASONG-MC4 Link: https://lkml.kernel.org/r/20251220-swap-table-p2-v5-14-8862a265a033@tencent.com Signed-off-by: Kairui Song Signed-off-by: Kairui Song Acked-by: Rafael J. 
Wysocki (Intel) Reviewed-by: Baoquan He Cc: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: Nhat Pham Cc: Yosry Ahmed Cc: Deepanshu Kartikey Cc: Johannes Weiner Cc: Kairui Song Cc: Chris Mason Cc: Chris Mason Cc: Lai Yi Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 2 +- arch/s390/mm/pgtable.c | 2 +- include/linux/swap.h | 58 ++++++++--------- kernel/power/swap.c | 10 +-- mm/madvise.c | 2 +- mm/memory.c | 15 +++-- mm/rmap.c | 7 +- mm/shmem.c | 10 +-- mm/swap.h | 37 +++++++++++ mm/swapfile.c | 151 +++++++++++++++++++++++++++++++------------- 10 files changed, 196 insertions(+), 98 deletions(-) (limited to 'include') diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index d41b19925a5a..dd89fce28531 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -32,7 +32,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) dec_mm_counter(mm, MM_SWAPENTS); else if (softleaf_is_migration(entry)) dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); - free_swap_and_cache(entry); + swap_put_entries_direct(entry, 1); } /** diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 666adcd681ab..b22181e1079e 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -682,7 +682,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) dec_mm_counter(mm, mm_counter(folio)); } - free_swap_and_cache(entry); + swap_put_entries_direct(entry, 1); } void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/swap.h b/include/linux/swap.h index 74df3004c850..aaa868f60b9c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -452,14 +452,8 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio); -bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); -extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern int swap_duplicate_nr(swp_entry_t entry, int nr); -extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -471,6 +465,29 @@ struct backing_dev_info; extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); +/* + * If there is an existing swap slot reference (swap entry) and the caller + * guarantees that there is no race modification of it (e.g., PTL + * protecting the swap entry in page table; shmem's cmpxchg protects t + * he swap entry in shmem mapping), these two helpers below can be used + * to put/dup the entries directly. + * + * All entries must be allocated by folio_alloc_swap(). And they must have + * a swap count > 1. See comments of folio_*_swap helpers for more info. + */ +int swap_dup_entry_direct(swp_entry_t entry); +void swap_put_entries_direct(swp_entry_t entry, int nr); + +/* + * folio_free_swap tries to free the swap entries pinned by a swap cache + * folio, it has to be here to be called by other components. 
+ */ +bool folio_free_swap(struct folio *folio); + +/* Allocate / free (hibernation) exclusive entries */ +swp_entry_t swap_alloc_hibernation_slot(int type); +void swap_free_hibernation_slot(swp_entry_t entry); + static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); @@ -498,10 +515,6 @@ static inline void put_swap_device(struct swap_info_struct *si) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) -{ -} - static inline void free_swap_cache(struct folio *folio) { } @@ -511,12 +524,12 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline int swap_duplicate_nr(swp_entry_t swp, int nr_pages) +static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; } -static inline void swap_free_nr(swp_entry_t entry, int nr_pages) +static inline void swap_put_entries_direct(swp_entry_t ent, int nr) { } @@ -539,11 +552,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio) -{ - return -EINVAL; -} - static inline bool folio_free_swap(struct folio *folio) { return false; @@ -556,22 +564,6 @@ static inline int add_swap_extent(struct swap_info_struct *sis, return -EINVAL; } #endif /* CONFIG_SWAP */ - -static inline int swap_duplicate(swp_entry_t entry) -{ - return swap_duplicate_nr(entry, 1); -} - -static inline void free_swap_and_cache(swp_entry_t entry) -{ - free_swap_and_cache_nr(entry, 1); -} - -static inline void swap_free(swp_entry_t entry) -{ - swap_free_nr(entry, 1); -} - #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8050e5182835..19ed7bd2adcc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -174,10 +174,10 @@ sector_t alloc_swapdev_block(int swap) * Allocate a swap page and register that it has been allocated, so that * it can be freed in case of an error. 
*/ - offset = swp_offset(get_swap_page_of_type(swap)); + offset = swp_offset(swap_alloc_hibernation_slot(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); + swap_free_hibernation_slot(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -186,6 +186,7 @@ sector_t alloc_swapdev_block(int swap) void free_all_swap_pages(int swap) { + unsigned long offset; struct rb_node *node; /* @@ -197,8 +198,9 @@ void free_all_swap_pages(int swap) ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - swap_free_nr(swp_entry(swap, ext->start), - ext->end - ext->start + 1); + + for (offset = ext->start; offset <= ext->end; offset++) + swap_free_hibernation_slot(swp_entry(swap, offset)); kfree(ext); } diff --git a/mm/madvise.c b/mm/madvise.c index 19cf480eed49..1f3040688f04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -692,7 +692,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; - free_swap_and_cache_nr(entry, nr); + swap_put_entries_direct(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (softleaf_is_hwpoison(entry) || softleaf_is_poison_marker(entry)) { diff --git a/mm/memory.c b/mm/memory.c index 60258033103e..187f16b7e996 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct page *page; if (likely(softleaf_is_swap(entry))) { - if (swap_duplicate(entry) < 0) + if (swap_dup_entry_direct(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ @@ -1748,7 +1748,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); + swap_put_entries_direct(entry, nr); } else if (softleaf_is_migration(entry)) { struct folio *folio = softleaf_to_folio(entry); @@ -4936,7 +4936,7 @@ check_folio: /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry - * so this must be called before swap_free(). + * so this must be called before folio_put_swap(). */ arch_swap_restore(folio_swap(entry, folio), folio); @@ -4974,6 +4974,7 @@ check_folio: if (unlikely(folio != swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); + folio_put_swap(swapcache, NULL); } else if (!folio_test_anon(folio)) { /* * We currently only expect !anon folios that are fully @@ -4982,9 +4983,12 @@ check_folio: VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio); VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); + folio_put_swap(folio, NULL); } else { + VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio)); folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, - rmap_flags); + rmap_flags); + folio_put_swap(folio, nr_pages == 1 ? page : NULL); } VM_BUG_ON(!folio_test_anon(folio) || @@ -4998,7 +5002,6 @@ check_folio: * Do it after mapping, so raced page faults will likely see the folio * in swap cache and wait on the folio lock. 
*/ - swap_free_nr(entry, nr_pages); if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags)) folio_free_swap(folio); @@ -5008,7 +5011,7 @@ check_folio: * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For - * further safety release the lock after the swap_free + * further safety release the lock after the folio_put_swap * so that the swap count won't change under a * parallel locked swapcache. */ diff --git a/mm/rmap.c b/mm/rmap.c index 6ddbf58111ff..c1ba88763102 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -82,6 +82,7 @@ #include #include "internal.h" +#include "swap.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; @@ -2232,7 +2233,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, goto discard; } - if (swap_duplicate(entry) < 0) { + if (folio_dup_swap(folio, subpage) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } @@ -2243,7 +2244,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { - swap_free(entry); + folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } @@ -2251,7 +2252,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* See folio_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { - swap_free(entry); + folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } diff --git a/mm/shmem.c b/mm/shmem.c index dd4951d6f891..0adde3f4df27 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -982,7 +982,7 @@ static long shmem_free_swap(struct address_space *mapping, xas_unlock_irq(&xas); if (nr_pages) - free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_pages); + swap_put_entries_direct(radix_to_swp_entry(radswap), nr_pages); return nr_pages; } @@ -1690,7 +1690,7 @@ try_split: spin_unlock(&shmem_swaplist_lock); } - swap_duplicate_nr(folio->swap, nr_pages); + folio_dup_swap(folio, NULL); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); @@ -1711,7 +1711,7 @@ try_split: /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); - swap_free_nr(folio->swap, nr_pages); + folio_put_swap(folio, NULL); } /* @@ -2197,6 +2197,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); + folio_put_swap(folio, NULL); swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks @@ -2204,7 +2205,6 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, * in shmem_evict_inode(). 
*/ shmem_recalc_inode(inode, -nr_pages, -nr_pages); - swap_free_nr(swap, nr_pages); } static int shmem_split_large_entry(struct inode *inode, pgoff_t index, @@ -2427,9 +2427,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); + folio_put_swap(folio, NULL); swap_cache_del_folio(folio); folio_mark_dirty(folio); - swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; diff --git a/mm/swap.h b/mm/swap.h index 0801857a0640..da243a1e3e45 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -183,6 +183,28 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +/* + * Below are the core routines for doing swap for a folio. + * All helpers requires the folio to be locked, and a locked folio + * in the swap cache pins the swap entries / slots allocated to the + * folio, swap relies heavily on the swap cache and folio lock for + * synchronization. + * + * folio_alloc_swap(): the entry point for a folio to be swapped + * out. It allocates swap slots and pins the slots with swap cache. + * The slots start with a swap count of zero. + * + * folio_dup_swap(): increases the swap count of a folio, usually + * during it gets unmapped and a swap entry is installed to replace + * it (e.g., swap entry in page table). A swap slot with swap + * count == 0 should only be increasd by this helper. + * + * folio_put_swap(): does the opposite thing of folio_dup_swap(). + */ +int folio_alloc_swap(struct folio *folio); +int folio_dup_swap(struct folio *folio, struct page *subpage); +void folio_put_swap(struct folio *folio, struct page *subpage); + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -363,9 +385,24 @@ static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) return NULL; } +static inline int folio_alloc_swap(struct folio *folio) +{ + return -EINVAL; +} + +static inline int folio_dup_swap(struct folio *folio, struct page *page) +{ + return -EINVAL; +} + +static inline void folio_put_swap(struct folio *folio, struct page *page) +{ +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } + static inline void swap_write_unplug(struct swap_iocb *sio) { } diff --git a/mm/swapfile.c b/mm/swapfile.c index 64970ee11fcf..d652486898de 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -58,6 +58,9 @@ static void swap_entries_free(struct swap_info_struct *si, swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -1482,6 +1485,12 @@ again: */ WARN_ON_ONCE(swap_cache_add_folio(folio, entry, NULL, true)); + /* + * Allocator should always allocate aligned entries so folio based + * operations never crossed more than one cluster. + */ + VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio->swap.val, size), folio); + return 0; out_free: @@ -1489,6 +1498,66 @@ out_free: return -ENOMEM; } +/** + * folio_dup_swap() - Increase swap count of swap entries of a folio. + * @folio: folio with swap entries bounded. + * @subpage: if not NULL, only increase the swap count of this subpage. 
+ * + * Typically called when the folio is unmapped and have its swap entry to + * take its palce. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * NOTE: The caller also has to ensure there is no raced call to + * swap_put_entries_direct on its swap entry before this helper returns, or + * the swap map may underflow. Currently, we only accept @subpage == NULL + * for shmem due to the limitation of swap continuation: shmem always + * duplicates the swap entry only once, so there is no such issue for it. + */ +int folio_dup_swap(struct folio *folio, struct page *subpage) +{ + int err = 0; + swp_entry_t entry = folio->swap; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); + + if (subpage) { + entry.val += folio_page_idx(folio, subpage); + nr_pages = 1; + } + + while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); + + return err; +} + +/** + * folio_put_swap() - Decrease swap count of swap entries of a folio. + * @folio: folio with swap entries bounded, must be in swap cache and locked. + * @subpage: if not NULL, only decrease the swap count of this subpage. + * + * This won't free the swap slots even if swap count drops to zero, they are + * still pinned by the swap cache. User may call folio_free_swap to free them. + * Context: Caller must ensure the folio is locked and in the swap cache. + */ +void folio_put_swap(struct folio *folio, struct page *subpage) +{ + swp_entry_t entry = folio->swap; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); + + if (subpage) { + entry.val += folio_page_idx(folio, subpage); + nr_pages = 1; + } + + swap_entries_put_map(__swap_entry_to_info(entry), entry, nr_pages); +} + static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *si; @@ -1729,28 +1798,6 @@ static void swap_entries_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -/* - * Caller has made sure that the swap device corresponding to entry - * is still around or has not been recycled. - */ -void swap_free_nr(swp_entry_t entry, int nr_pages) -{ - int nr; - struct swap_info_struct *sis; - unsigned long offset = swp_offset(entry); - - sis = _swap_info_get(entry); - if (!sis) - return; - - while (nr_pages) { - nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); - offset += nr; - nr_pages -= nr; - } -} - /* * Called after dropping swapcache to decrease refcnt to swap entries. */ @@ -1940,16 +1987,19 @@ bool folio_free_swap(struct folio *folio) } /** - * free_swap_and_cache_nr() - Release reference on range of swap entries and - * reclaim their cache if no more references remain. + * swap_put_entries_direct() - Release reference on range of swap entries and + * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. The * offset range is defined by [entry.offset, entry.offset + nr). + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being released. 
*/ -void free_swap_and_cache_nr(swp_entry_t entry, int nr) +void swap_put_entries_direct(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; @@ -1958,10 +2008,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) unsigned long offset; si = get_swap_device(entry); - if (!si) + if (WARN_ON_ONCE(!si)) return; - - if (WARN_ON(end_offset > si->max)) + if (WARN_ON_ONCE(end_offset > si->max)) goto out; /* @@ -2005,8 +2054,8 @@ out: } #ifdef CONFIG_HIBERNATION - -swp_entry_t get_swap_page_of_type(int type) +/* Allocate a slot for hibernation */ +swp_entry_t swap_alloc_hibernation_slot(int type) { struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; @@ -2034,6 +2083,26 @@ fail: return entry; } +/* Free a slot allocated by swap_alloc_hibernation_slot */ +void swap_free_hibernation_slot(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + pgoff_t offset = swp_offset(entry); + + si = get_swap_device(entry); + if (WARN_ON(!si)) + return; + + ci = swap_cluster_lock(si, offset); + swap_entry_put_locked(si, ci, entry, 1); + swap_cluster_unlock(ci); + + /* In theory readahead might add it to the swap cache by accident */ + __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + put_swap_device(si); +} + /* * Find the swap type that corresponds to given device (if any). * @@ -2195,7 +2264,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry - * so this must be called before swap_free(). + * so this must be called before folio_put_swap(). */ arch_swap_restore(folio_swap(entry, folio), folio); @@ -2236,7 +2305,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); - swap_free(entry); + folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry))); out: if (pte) pte_unmap_unlock(pte, ptl); @@ -3746,28 +3815,22 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) return err; } -/** - * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries - * by 1. - * +/* + * swap_dup_entry_direct() - Increase reference count of a swap entry by one. * @entry: first swap entry from which we want to increase the refcount. - * @nr: Number of entries in range. * * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. * - * Note that we are currently not handling the case where nr > 1 and we need to - * add swap count continuation. This is OK, because no such user exists - shmem - * is the only user that can pass nr > 1, and it never re-duplicates any swap - * entry it owns. + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being increased. 
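Two duplication paths remain after this patch, distinguished only by what stabilizes the entry: folio_dup_swap() relies on the locked folio in the swap cache, while swap_dup_entry_direct() relies on a reference the caller already holds stable, typically a swap PTE under the page table lock as in copy_nonpresent_pte(). A side-by-side sketch (illustrative only; both wrapper names are hypothetical and locking is indicated in comments):

/* Unmap/writeback side: the folio is locked and sits in the swap cache. */
static int dup_for_unmap(struct folio *folio, struct page *subpage)
{
	return folio_dup_swap(folio, subpage);		/* 0 or -ENOMEM */
}

/* Fork side: the swap PTE being copied is kept stable under the PTL. */
static int dup_for_fork(swp_entry_t entry)
{
	return swap_dup_entry_direct(entry);		/* 0 or -ENOMEM */
}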
*/ -int swap_duplicate_nr(swp_entry_t entry, int nr) +int swap_dup_entry_direct(swp_entry_t entry) { int err = 0; - - while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM) + while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } -- cgit v1.2.3 From 270f095179ff15b7c72f25dd6720dcab3d15cc9b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sat, 20 Dec 2025 03:43:44 +0800 Subject: mm, swap: add folio to swap cache directly on allocation The allocator uses SWAP_HAS_CACHE to pin a swap slot upon allocation. SWAP_HAS_CACHE is being deprecated as it caused a lot of confusion. This pinning usage here can be dropped by adding the folio to swap cache directly on allocation. All swap allocations are folio-based now (except for hibernation), so the swap allocator can always take the folio as the parameter. And now both swap cache (swap table) and swap map are protected by the cluster lock, scanning the map and inserting the folio can be done in the same critical section. This eliminates the time window that a slot is pinned by SWAP_HAS_CACHE, but it has no cache, and avoids touching the lock multiple times. This is both a cleanup and an optimization. Link: https://lkml.kernel.org/r/20251220-swap-table-p2-v5-15-8862a265a033@tencent.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: Nhat Pham Cc: Rafael J. Wysocki (Intel) Cc: Yosry Ahmed Cc: Deepanshu Kartikey Cc: Johannes Weiner Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 -- mm/swap.h | 10 +--- mm/swap_state.c | 58 +++++++++++-------- mm/swapfile.c | 161 ++++++++++++++++++++++----------------------------- 4 files changed, 105 insertions(+), 129 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index aaa868f60b9c..517d24e96d8c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -452,7 +452,6 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -void put_swap_folio(struct folio *folio, swp_entry_t entry); extern int add_swap_count_continuation(swp_entry_t, gfp_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); @@ -533,10 +532,6 @@ static inline void swap_put_entries_direct(swp_entry_t ent, int nr) { } -static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) -{ -} - static inline int __swap_count(swp_entry_t entry) { return 0; diff --git a/mm/swap.h b/mm/swap.h index da243a1e3e45..50d904117ef6 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -277,13 +277,13 @@ void __swapcache_clear_cached(struct swap_info_struct *si, */ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - void **shadow, bool alloc); void swap_cache_del_folio(struct folio *folio); struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, bool *alloced); /* Below helpers require the caller to lock and pass in the swap cluster. 
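A practical consequence of this patch is that folio_alloc_swap() no longer hands back a bare swp_entry_t: on success the folio is already in the swap cache with folio->swap set and every slot at count zero. A sketch of what a caller may rely on afterwards (illustrative only, not from the patch; the wrapper name is hypothetical):

/* Post-conditions of folio_alloc_swap() after this patch. */
static int writeout_prepare(struct folio *folio)
{
	int err = folio_alloc_swap(folio);	/* folio must be locked */

	if (err)
		return err;		/* negative errno, no swap assigned */

	/* The allocator inserted the folio into the swap cache itself: */
	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
	VM_WARN_ON_ONCE(!folio->swap.val);

	/* Swap count is still zero; folio_dup_swap() raises it at unmap. */
	return 0;
}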
*/ +void __swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry); void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow); void __swap_cache_replace_folio(struct swap_cluster_info *ci, @@ -459,12 +459,6 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - void **shadow, bool alloc) -{ - return -ENOENT; -} - static inline void swap_cache_del_folio(struct folio *folio) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 9f45563591d6..22fbb2b08a60 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -121,35 +121,56 @@ void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } +void __swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry) +{ + unsigned long new_tb; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + + new_tb = folio_to_swp_tb(folio); + ci_start = swp_cluster_offset(entry); + ci_off = ci_start; + ci_end = ci_start + nr_pages; + do { + VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); + __swap_table_set(ci, ci_off, new_tb); + } while (++ci_off < ci_end); + + folio_ref_add(folio, nr_pages); + folio_set_swapcache(folio); + folio->swap = entry; + + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); +} + /** * swap_cache_add_folio - Add a folio into the swap cache. * @folio: The folio to be added. * @entry: The swap entry corresponding to the folio. * @gfp: gfp_mask for XArray node allocation. * @shadowp: If a shadow is found, return the shadow. - * @alloc: If it's the allocator that is trying to insert a folio. Allocator - * sets SWAP_HAS_CACHE to pin slots before insert so skip map update. * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. */ -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - void **shadowp, bool alloc) +static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + void **shadowp) { int err; void *shadow = NULL; + unsigned long old_tb; struct swap_info_struct *si; - unsigned long old_tb, new_tb; struct swap_cluster_info *ci; unsigned int ci_start, ci_off, ci_end, offset; unsigned long nr_pages = folio_nr_pages(folio); - VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); - VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); - VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - si = __swap_entry_to_info(entry); - new_tb = folio_to_swp_tb(folio); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; @@ -165,7 +186,7 @@ int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, err = -EEXIST; goto failed; } - if (!alloc && unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { err = -ENOENT; goto failed; } @@ -181,20 +202,11 @@ int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, * Still need to pin the slots with SWAP_HAS_CACHE since * swap allocator depends on that. 
*/ - if (!alloc) - __swapcache_set_cached(si, ci, swp_entry(swp_type(entry), offset)); - __swap_table_set(ci, ci_off, new_tb); + __swapcache_set_cached(si, ci, swp_entry(swp_type(entry), offset)); offset++; } while (++ci_off < ci_end); - - folio_ref_add(folio, nr_pages); - folio_set_swapcache(folio); - folio->swap = entry; + __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); - - node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); - lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); - if (shadowp) *shadowp = shadow; return 0; @@ -463,7 +475,7 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, __folio_set_locked(folio); __folio_set_swapbacked(folio); for (;;) { - ret = swap_cache_add_folio(folio, entry, &shadow, false); + ret = swap_cache_add_folio(folio, entry, &shadow); if (!ret) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index d652486898de..8e6bb0774c41 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -884,28 +884,57 @@ static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, } } -static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned int start, unsigned char usage, - unsigned int order) +static bool cluster_alloc_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + struct folio *folio, + unsigned int offset) { - unsigned int nr_pages = 1 << order; + unsigned long nr_pages; + unsigned int order; lockdep_assert_held(&ci->lock); if (!(si->flags & SWP_WRITEOK)) return false; + /* + * All mm swap allocation starts with a folio (folio_alloc_swap), + * it's also the only allocation path for large orders allocation. + * Such swap slots starts with count == 0 and will be increased + * upon folio unmap. + * + * Else, it's a exclusive order 0 allocation for hibernation. + * The slot starts with count == 1 and never increases. + */ + if (likely(folio)) { + order = folio_order(folio); + nr_pages = 1 << order; + /* + * Pin the slot with SWAP_HAS_CACHE to satisfy swap_dup_entries. + * This is the legacy allocation behavior, will drop it very soon. + */ + memset(si->swap_map + offset, SWAP_HAS_CACHE, nr_pages); + __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); + } else if (IS_ENABLED(CONFIG_HIBERNATION)) { + order = 0; + nr_pages = 1; + WARN_ON_ONCE(si->swap_map[offset]); + si->swap_map[offset] = 1; + swap_cluster_assert_table_empty(ci, offset, 1); + } else { + /* Allocation without folio is only possible with hibernation */ + WARN_ON_ONCE(1); + return false; + } + /* * The first allocation in a cluster makes the * cluster exclusive to this order */ if (cluster_is_empty(ci)) ci->order = order; - - memset(si->swap_map + start, usage, nr_pages); - swap_cluster_assert_table_empty(ci, start, nr_pages); - swap_range_alloc(si, nr_pages); ci->count += nr_pages; + swap_range_alloc(si, nr_pages); return true; } @@ -913,13 +942,12 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster /* Try use a new cluster for current CPU and allocate from it. */ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long offset, - unsigned int order, - unsigned char usage) + struct folio *folio, unsigned long offset) { unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); + unsigned int order = likely(folio) ? 
folio_order(folio) : 0; unsigned int nr_pages = 1 << order; bool need_reclaim, ret, usable; @@ -943,7 +971,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!ret) continue; } - if (!cluster_alloc_range(si, ci, offset, usage, order)) + if (!cluster_alloc_range(si, ci, folio, offset)) break; found = offset; offset += nr_pages; @@ -965,8 +993,7 @@ out: static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, struct list_head *list, - unsigned int order, - unsigned char usage, + struct folio *folio, bool scan_all) { unsigned int found = SWAP_ENTRY_INVALID; @@ -978,7 +1005,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, if (!ci) break; offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + found = alloc_swap_scan_cluster(si, ci, folio, offset); if (found) break; } while (scan_all); @@ -1039,10 +1066,11 @@ static void swap_reclaim_work(struct work_struct *work) * Try to allocate swap entries with specified order and try set a new * cluster for current CPU too. */ -static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, - unsigned char usage) +static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, + struct folio *folio) { struct swap_cluster_info *ci; + unsigned int order = likely(folio) ? folio_order(folio) : 0; unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; /* @@ -1064,8 +1092,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, - order, usage); + found = alloc_swap_scan_cluster(si, ci, folio, offset); } else { swap_cluster_unlock(ci); } @@ -1079,22 +1106,19 @@ new_cluster: * to spread out the writes. */ if (si->flags & SWP_PAGE_DISCARD) { - found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, - false); + found = alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) goto done; } if (order < PMD_ORDER) { - found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], - order, usage, true); + found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true); if (found) goto done; } if (!(si->flags & SWP_PAGE_DISCARD)) { - found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, - false); + found = alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) goto done; } @@ -1110,8 +1134,7 @@ new_cluster: * failure is not critical. Scanning one cluster still * keeps the list rotated and reclaimed (for HAS_CACHE). */ - found = alloc_swap_scan_list(si, &si->frag_clusters[order], order, - usage, false); + found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false); if (found) goto done; } @@ -1125,13 +1148,11 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. 
*/ - found = alloc_swap_scan_list(si, &si->frag_clusters[o], - 0, usage, true); + found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true); if (found) goto done; - found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], - 0, usage, true); + found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true); if (found) goto done; } @@ -1322,12 +1343,12 @@ static bool get_swap_device_info(struct swap_info_struct *si) * Fast path try to get swap entries with specified order from current * CPU's swap entry pool (a cluster). */ -static bool swap_alloc_fast(swp_entry_t *entry, - int order) +static bool swap_alloc_fast(struct folio *folio) { + unsigned int order = folio_order(folio); struct swap_cluster_info *ci; struct swap_info_struct *si; - unsigned int offset, found = SWAP_ENTRY_INVALID; + unsigned int offset; /* * Once allocated, swap_info_struct will never be completely freed, @@ -1342,22 +1363,18 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE); - if (found) - *entry = swp_entry(si->type, found); + alloc_swap_scan_cluster(si, ci, folio, offset); } else { swap_cluster_unlock(ci); } put_swap_device(si); - return !!found; + return folio_test_swapcache(folio); } /* Rotate the device and switch to a new cluster */ -static void swap_alloc_slow(swp_entry_t *entry, - int order) +static void swap_alloc_slow(struct folio *folio) { - unsigned long offset; struct swap_info_struct *si, *next; spin_lock(&swap_avail_lock); @@ -1367,13 +1384,11 @@ start_over: plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { - offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); + cluster_alloc_swap_entry(si, folio); put_swap_device(si); - if (offset) { - *entry = swp_entry(si->type, offset); + if (folio_test_swapcache(folio)) return; - } - if (order) + if (folio_test_large(folio)) return; } @@ -1438,7 +1453,6 @@ int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; - swp_entry_t entry = {}; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); @@ -1463,39 +1477,23 @@ int folio_alloc_swap(struct folio *folio) again: local_lock(&percpu_swap_cluster.lock); - if (!swap_alloc_fast(&entry, order)) - swap_alloc_slow(&entry, order); + if (!swap_alloc_fast(folio)) + swap_alloc_slow(folio); local_unlock(&percpu_swap_cluster.lock); - if (unlikely(!order && !entry.val)) { + if (!order && unlikely(!folio_test_swapcache(folio))) { if (swap_sync_discard()) goto again; } /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ - if (mem_cgroup_try_charge_swap(folio, entry)) - goto out_free; + if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap))) + swap_cache_del_folio(folio); - if (!entry.val) + if (unlikely(!folio_test_swapcache(folio))) return -ENOMEM; - /* - * Allocator has pinned the slots with SWAP_HAS_CACHE - * so it should never fail - */ - WARN_ON_ONCE(swap_cache_add_folio(folio, entry, NULL, true)); - - /* - * Allocator should always allocate aligned entries so folio based - * operations never crossed more than one cluster. 
- */ - VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio->swap.val, size), folio); - return 0; - -out_free: - put_swap_folio(folio, entry); - return -ENOMEM; } /** @@ -1798,29 +1796,6 @@ static void swap_entries_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -/* - * Called after dropping swapcache to decrease refcnt to swap entries. - */ -void put_swap_folio(struct folio *folio, swp_entry_t entry) -{ - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - int size = 1 << swap_entry_order(folio_order(folio)); - - si = _swap_info_get(entry); - if (!si) - return; - - ci = swap_cluster_lock(si, offset); - if (swap_only_has_cache(si, offset, size)) - swap_entries_free(si, ci, entry, size); - else - for (int i = 0; i < size; i++, entry.val++) - swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); - swap_cluster_unlock(ci); -} - int __swap_count(swp_entry_t entry) { struct swap_info_struct *si = __swap_entry_to_info(entry); @@ -2072,7 +2047,7 @@ swp_entry_t swap_alloc_hibernation_slot(int type) * with swap table allocation. */ local_lock(&percpu_swap_cluster.lock); - offset = cluster_alloc_swap_entry(si, 0, 1); + offset = cluster_alloc_swap_entry(si, NULL); local_unlock(&percpu_swap_cluster.lock); if (offset) entry = swp_entry(si->type, offset); -- cgit v1.2.3 From d3852f9692b8a6af7566f92f7432ee5067c6be15 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sat, 20 Dec 2025 03:43:47 +0800 Subject: mm, swap: drop the SWAP_HAS_CACHE flag Now, the swap cache is managed by the swap table. All swap cache users are checking the swap table directly to check the swap cache state. SWAP_HAS_CACHE is now just a temporary pin before the first increase from 0 to 1 of a slot's swap count (swap_dup_entries) after swap allocation (folio_alloc_swap), or before the final free of slots pinned by folio in swap cache (put_swap_folio). Drop these two usages. For the first dup, SWAP_HAS_CACHE pinning was hard to kill because it used to have multiple meanings, more than just "a slot is cached". We have just simplified that and defined that the first dup is always done with folio locked in swap cache (folio_dup_swap), so stop checking the SWAP_HAS_CACHE bit and just check the swap cache (swap table) directly, and add a WARN if a swap entry's count is being increased for the first time while the folio is not in swap cache. As for freeing, just let the swap cache free all swap entries of a folio that have a swap count of zero directly upon folio removal. We have also just cleaned up batch freeing to check the swap cache usage using the swap table: a slot with swap cache in the swap table will not be freed until its cache is gone, and no SWAP_HAS_CACHE bit is involved anymore. And besides, the removal of a folio and freeing of the slots are being done in the same critical section now, which should improve the performance. After these two changes, SWAP_HAS_CACHE no longer has any users. Swap cache synchronization is also done by the swap table directly, so using SWAP_HAS_CACHE to pin a slot before adding the cache is also no longer needed. Remove all related logic and helpers. swap_map is now only used for tracking the count, so all swap_map users can just read it directly, ignoring the swap_count helper, which was previously used to filter out the SWAP_HAS_CACHE bit. The idea of dropping SWAP_HAS_CACHE and using the swap table directly was initially from Chris's idea of merging all the metadata usage of all swaps into one place. 
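After this patch the per-slot state is split cleanly: swap_map[] carries nothing but the count, and whether a slot is cached is answered by the swap table alone. A toy model of the resulting ownership rule (user-space style, illustrative only; the kernel's real checks live in swap_only_has_cache() and swap_entries_free() in the hunks below):

/* Toy model, not kernel code: per-slot state once SWAP_HAS_CACHE is gone. */
struct toy_slot {
	unsigned char count;	/* swap_map[]: PTE/shmem references only */
	void *cached_folio;	/* swap table: folio pointer, or NULL */
};

/* A slot goes back to the allocator only when both are clear. */
static int toy_slot_is_free(const struct toy_slot *slot)
{
	return slot->count == 0 && slot->cached_folio == NULL;
}

Removing the folio from the swap cache is therefore the natural point to free any slot whose count has already dropped to zero, which is exactly the job __swap_cache_del_folio() takes over below.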
Link: https://lkml.kernel.org/r/20251220-swap-table-p2-v5-18-8862a265a033@tencent.com Signed-off-by: Kairui Song Suggested-by: Chris Li Reviewed-by: Baoquan He Cc: Baolin Wang Cc: Barry Song Cc: Nhat Pham Cc: Rafael J. Wysocki (Intel) Cc: Yosry Ahmed Cc: Deepanshu Kartikey Cc: Johannes Weiner Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/swap.h | 13 ++-- mm/swap_state.c | 28 +++++---- mm/swapfile.c | 168 +++++++++++++++++---------------------------------- 4 files changed, 78 insertions(+), 132 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 517d24e96d8c..62fc7499b408 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -224,7 +224,6 @@ enum { #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ -#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ /* Special value in first swap_map */ diff --git a/mm/swap.h b/mm/swap.h index 393378ce1687..bfafa637c458 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -205,6 +205,11 @@ int folio_alloc_swap(struct folio *folio); int folio_dup_swap(struct folio *folio, struct page *subpage); void folio_put_swap(struct folio *folio, struct page *subpage); +/* For internal use */ +extern void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, unsigned int nr_pages); + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -256,14 +261,6 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, return folio_entry.val == round_down(entry.val, nr_pages); } -/* Temporary internal helpers */ -void __swapcache_set_cached(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry); -void __swapcache_clear_cached(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr); - /* * All swap cache helpers below require the caller to ensure the swap entries * used are valid and stabilize the device by any of the following ways: diff --git a/mm/swap_state.c b/mm/swap_state.c index 4f59770e5eb7..6d0eef7470be 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -210,17 +210,6 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, shadow = swp_tb_to_shadow(old_tb); offset++; } while (++ci_off < ci_end); - - ci_off = ci_start; - offset = swp_offset(entry); - do { - /* - * Still need to pin the slots with SWAP_HAS_CACHE since - * swap allocator depends on that. 
- */ - __swapcache_set_cached(si, ci, swp_entry(swp_type(entry), offset)); - offset++; - } while (++ci_off < ci_end); __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); if (shadowp) @@ -251,6 +240,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, struct swap_info_struct *si; unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; + bool folio_swapped = false, need_free = false; unsigned long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); @@ -268,13 +258,27 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, old_tb = __swap_table_xchg(ci, ci_off, new_tb); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); + if (__swap_count(swp_entry(si->type, + swp_offset(entry) + ci_off - ci_start))) + folio_swapped = true; + else + need_free = true; } while (++ci_off < ci_end); folio->swap.val = 0; folio_clear_swapcache(folio); node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); - __swapcache_clear_cached(si, ci, entry, nr_pages); + + if (!folio_swapped) { + swap_entries_free(si, ci, swp_offset(entry), nr_pages); + } else if (need_free) { + do { + if (!__swap_count(entry)) + swap_entries_free(si, ci, swp_offset(entry), 1); + entry.val++; + } while (--nr_pages); + } } /** diff --git a/mm/swapfile.c b/mm/swapfile.c index a41632e74787..5721018cb28a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -48,21 +48,18 @@ #include #include "swap_table.h" #include "internal.h" +#include "swap_table.h" #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long start, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); static void swap_put_entry_locked(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long offset, - unsigned char usage); + unsigned long offset); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -149,11 +146,6 @@ static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) return swap_type_to_info(swp_type(entry)); } -static inline unsigned char swap_count(unsigned char ent) -{ - return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ -} - /* * Use the second highest bit of inuse_pages counter as the indicator * if one swap device is on the available plist, so the atomic can @@ -185,15 +177,20 @@ static long swap_usage_in_pages(struct swap_info_struct *si) #define TTRS_FULL 0x4 static bool swap_only_has_cache(struct swap_info_struct *si, - unsigned long offset, int nr_pages) + struct swap_cluster_info *ci, + unsigned long offset, int nr_pages) { + unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; + unsigned long swp_tb; do { - VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); - if (*map != SWAP_HAS_CACHE) + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); + if (*map) return false; + ++ci_off; } while (++map < map_end); return true; @@ -248,12 +245,12 @@ again: goto out_unlock; 
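The reworked swap_only_has_cache() above makes the reclaim rule explicit: a folio sitting in the swap cache may only be dropped to free its slots once no slot still carries a swap count, i.e. no PTE or shmem mapping references it. A trivial restatement of that predicate (illustrative toy only, operating on a plain count array rather than the real swap_map and cluster structures):

/* Toy predicate: may a swap-cache folio be reclaimed to free its slots? */
static int toy_can_reclaim(const unsigned char *counts, unsigned long nr)
{
	while (nr--)
		if (*counts++)
			return 0;	/* still mapped somewhere */
	return 1;			/* only the cache holds it */
}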
/* - * It's safe to delete the folio from swap cache only if the folio's - * swap_map is HAS_CACHE only, which means the slots have no page table + * It's safe to delete the folio from swap cache only if the folio + * is in swap cache with swap count == 0. The slots have no page table * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); - need_reclaim = swap_only_has_cache(si, offset, nr_pages); + need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -779,7 +776,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, spin_unlock(&ci->lock); do { - if (swap_count(READ_ONCE(map[offset]))) + if (READ_ONCE(map[offset])) break; swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); if (swp_tb_is_folio(swp_tb)) { @@ -809,7 +806,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, */ for (offset = start; offset < end; offset++) { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swap_count(map[offset]) || !swp_tb_is_null(swp_tb)) + if (map[offset] || !swp_tb_is_null(swp_tb)) return false; } @@ -829,11 +826,10 @@ static bool cluster_scan_range(struct swap_info_struct *si, return true; do { - if (swap_count(map[offset])) + if (map[offset]) return false; swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); if (swp_tb_is_folio(swp_tb)) { - WARN_ON_ONCE(!(map[offset] & SWAP_HAS_CACHE)); if (!vm_swap_full()) return false; *need_reclaim = true; @@ -891,11 +887,6 @@ static bool cluster_alloc_range(struct swap_info_struct *si, if (likely(folio)) { order = folio_order(folio); nr_pages = 1 << order; - /* - * Pin the slot with SWAP_HAS_CACHE to satisfy swap_dup_entries. - * This is the legacy allocation behavior, will drop it very soon. - */ - memset(si->swap_map + offset, SWAP_HAS_CACHE, nr_pages); __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); } else if (IS_ENABLED(CONFIG_HIBERNATION)) { order = 0; @@ -1012,8 +1003,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) to_scan--; while (offset < end) { - if (!swap_count(READ_ONCE(map[offset])) && - swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { + if (!READ_ONCE(map[offset]) && + swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); @@ -1115,7 +1106,7 @@ new_cluster: * Scan only one fragment cluster is good enough. Order 0 * allocation will surely success, and large allocation * failure is not critical. Scanning one cluster still - * keeps the list rotated and reclaimed (for HAS_CACHE). + * keeps the list rotated and reclaimed (for clean swap cache). */ found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false); if (found) @@ -1450,8 +1441,8 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, do { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); count = si->swap_map[offset]; - VM_WARN_ON(swap_count(count) < 1 || count == SWAP_MAP_BAD); - if (swap_count(count) == 1) { + VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); + if (count == 1) { /* count == 1 and non-cached slots will be batch freed. 
*/ if (!swp_tb_is_folio(swp_tb)) { if (!batch_start) @@ -1459,7 +1450,6 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, continue; } /* count will be 0 after put, slot can be reclaimed */ - VM_WARN_ON(!(count & SWAP_HAS_CACHE)); need_reclaim = true; } /* @@ -1468,7 +1458,7 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, * slots will be freed when folio is removed from swap cache * (__swap_cache_del_folio). */ - swap_put_entry_locked(si, ci, offset, 1); + swap_put_entry_locked(si, ci, offset); if (batch_start) { swap_entries_free(si, ci, batch_start, offset - batch_start); batch_start = SWAP_ENTRY_INVALID; @@ -1625,7 +1615,8 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) offset = swp_offset(entry); if (offset >= si->max) goto bad_offset; - if (data_race(!si->swap_map[swp_offset(entry)])) + if (data_race(!si->swap_map[swp_offset(entry)]) && + !swap_cache_has_folio(entry)) goto bad_free; return si; @@ -1646,21 +1637,12 @@ out: static void swap_put_entry_locked(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long offset, - unsigned char usage) + unsigned long offset) { unsigned char count; - unsigned char has_cache; count = si->swap_map[offset]; - - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (usage == SWAP_HAS_CACHE) { - VM_BUG_ON(!has_cache); - has_cache = 0; - } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { + if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(si, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; @@ -1670,10 +1652,8 @@ static void swap_put_entry_locked(struct swap_info_struct *si, count--; } - usage = count | has_cache; - if (usage) - WRITE_ONCE(si->swap_map[offset], usage); - else + WRITE_ONCE(si->swap_map[offset], count); + if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) swap_entries_free(si, ci, offset, 1); } @@ -1742,21 +1722,13 @@ put_out: return NULL; } -/* - * Check if it's the last ref of swap entry in the freeing path. - */ -static inline bool __maybe_unused swap_is_last_ref(unsigned char count) -{ - return (count == SWAP_HAS_CACHE) || (count == 1); -} - /* * Drop the last ref of swap entries, caller have to ensure all entries * belong to the same cgroup and cluster. 
*/ -static void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages) +void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, unsigned int nr_pages) { swp_entry_t entry = swp_entry(si->type, offset); unsigned char *map = si->swap_map + offset; @@ -1769,7 +1741,7 @@ static void swap_entries_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(!swap_is_last_ref(*map)); + VM_WARN_ON(*map > 1); *map = 0; } while (++map < map_end); @@ -1788,7 +1760,7 @@ int __swap_count(swp_entry_t entry) struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); - return swap_count(si->swap_map[offset]); + return si->swap_map[offset]; } /** @@ -1803,7 +1775,7 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) int count; ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; swap_cluster_unlock(ci); return count && count != SWAP_MAP_BAD; @@ -1830,7 +1802,7 @@ int swp_swapcount(swp_entry_t entry) ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; if (!(count & COUNT_CONTINUED)) goto out; @@ -1868,12 +1840,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { - if (swap_count(map[roffset])) + if (map[roffset]) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { - if (swap_count(map[offset + i])) { + if (map[offset + i]) { ret = true; break; } @@ -2027,7 +1999,7 @@ void swap_free_hibernation_slot(swp_entry_t entry) return; ci = swap_cluster_lock(si, offset); - swap_put_entry_locked(si, ci, offset, 1); + swap_put_entry_locked(si, ci, offset); swap_cluster_unlock(ci); /* In theory readahead might add it to the swap cache by accident */ @@ -2432,6 +2404,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; + unsigned long swp_tb; unsigned char count; /* @@ -2442,7 +2415,11 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, */ for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); - if (count && swap_count(count) != SWAP_MAP_BAD) + swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), + i % SWAPFILE_CLUSTER); + if (count == SWAP_MAP_BAD) + continue; + if (count || swp_tb_is_folio(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); @@ -3667,8 +3644,7 @@ void si_swapinfo(struct sysinfo *val) * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL - * - swap-cache reference is requested but there is already one. -> EEXIST - * - swap-cache reference is requested but the entry is not used. -> ENOENT + * - swap-mapped reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ static int swap_dup_entries(struct swap_info_struct *si, @@ -3677,39 +3653,30 @@ static int swap_dup_entries(struct swap_info_struct *si, unsigned char usage, int nr) { int i; - unsigned char count, has_cache; + unsigned char count; for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; - /* * For swapin out, allocator never allocates bad slots. for * swapin, readahead is guarded by swap_entry_swapped. 
*/ - if (WARN_ON(swap_count(count) == SWAP_MAP_BAD)) + if (WARN_ON(count == SWAP_MAP_BAD)) return -ENOENT; - - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (!count && !has_cache) { + /* + * Swap count duplication must be guarded by either swap cache folio (from + * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct). + */ + if (WARN_ON(!count && + !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) return -ENOENT; - } else if (usage == SWAP_HAS_CACHE) { - if (has_cache) - return -EEXIST; - } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { + if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) return -EINVAL; - } } for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (usage == SWAP_HAS_CACHE) - has_cache = SWAP_HAS_CACHE; - else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; @@ -3721,7 +3688,7 @@ static int swap_dup_entries(struct swap_info_struct *si, return -ENOMEM; } - WRITE_ONCE(si->swap_map[offset + i], count | has_cache); + WRITE_ONCE(si->swap_map[offset + i], count); } return 0; @@ -3767,27 +3734,6 @@ int swap_dup_entry_direct(swp_entry_t entry) return err; } -/* Mark the swap map as HAS_CACHE, caller need to hold the cluster lock */ -void __swapcache_set_cached(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry) -{ - WARN_ON(swap_dup_entries(si, ci, swp_offset(entry), SWAP_HAS_CACHE, 1)); -} - -/* Clear the swap map as !HAS_CACHE, caller need to hold the cluster lock */ -void __swapcache_clear_cached(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr) -{ - if (swap_only_has_cache(si, swp_offset(entry), nr)) { - swap_entries_free(si, ci, swp_offset(entry), nr); - } else { - for (int i = 0; i < nr; i++, entry.val++) - swap_put_entry_locked(si, ci, swp_offset(entry), SWAP_HAS_CACHE); - } -} - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's @@ -3833,7 +3779,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* -- cgit v1.2.3 From 086498aed3f68febb58df7e6141962942abb8944 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 27 Jan 2026 20:13:00 +0800 Subject: mm: convert __HAVE_ARCH_TLB_REMOVE_TABLE to CONFIG_HAVE_ARCH_TLB_REMOVE_TABLE config For architectures that define __HAVE_ARCH_TLB_REMOVE_TABLE, the page tables at the pmd/pud level are generally not of struct ptdesc type, and do not have pt_rcu_head member, thus these architectures cannot support PT_RECLAIM. In preparation for enabling PT_RECLAIM on more architectures, convert __HAVE_ARCH_TLB_REMOVE_TABLE to CONFIG_HAVE_ARCH_TLB_REMOVE_TABLE config, so that we can make conditional judgments in Kconfig. Link: https://lkml.kernel.org/r/5ebfa3d4b56e63c6906bda5eccaa9f7194d3a86b.1769515122.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand (Arm) Tested-by: Andreas Larsson [sparc, UP&SMP] Acked-by: Andreas Larsson [sparc] Cc: "Aneesh Kumar K.V" Cc: Anton Ivanov Cc: Borislav Petkov Cc: Dave Hansen Cc: Dev Jain Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Johannes Berg Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Wei Yang Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/tlb.h | 1 - arch/sparc/Kconfig | 1 + arch/sparc/include/asm/tlb_64.h | 1 - include/asm-generic/tlb.h | 2 +- mm/Kconfig | 3 +++ 6 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 271690445a45..374ee60dcf75 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -305,6 +305,7 @@ config PPC select LOCK_MM_AND_FIND_VMA select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE + select HAVE_ARCH_TLB_REMOVE_TABLE select MMU_GATHER_MERGE_VMAS select MMU_LAZY_TLB_SHOOTDOWN if PPC_BOOK3S_64 select MODULES_USE_ELF_RELA diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 2058e8d3e013..1ca7d4c4b90d 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -37,7 +37,6 @@ extern void tlb_flush(struct mmu_gather *tlb); */ #define tlb_needs_table_invalidate() radix_enabled() -#define __HAVE_ARCH_TLB_REMOVE_TABLE /* Get the generic bits... */ #include diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 2bad14744ca4..61415a9b3e86 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -74,6 +74,7 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select MMU_GATHER_RCU_TABLE_FREE if SMP + select HAVE_ARCH_TLB_REMOVE_TABLE if SMP select MMU_GATHER_MERGE_VMAS select MMU_GATHER_NO_FLUSH_CACHE select HAVE_ARCH_TRANSPARENT_HUGEPAGE diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h index 1a6e694418e3..3037187482db 100644 --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -33,7 +33,6 @@ void flush_tlb_pending(void); #define tlb_needs_table_invalidate() (false) #endif -#define __HAVE_ARCH_TLB_REMOVE_TABLE #include #endif /* _SPARC64_TLB_H */ diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 3975f7d11553..4aeac0c3d3f0 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -213,7 +213,7 @@ struct mmu_table_batch { #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) -#ifndef __HAVE_ARCH_TLB_REMOVE_TABLE +#ifndef CONFIG_HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; diff --git a/mm/Kconfig b/mm/Kconfig index d1d76ce7373e..ec1db8a786af 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1448,6 +1448,9 @@ config ARCH_HAS_USER_SHADOW_STACK The architecture has hardware support for userspace shadow call stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). +config HAVE_ARCH_TLB_REMOVE_TABLE + def_bool n + config ARCH_SUPPORTS_PT_RECLAIM def_bool n -- cgit v1.2.3