From a03ed8f144e2732dbfce05d4ee5576d77aba6957 Mon Sep 17 00:00:00 2001
From: Brendan Jackman
Date: Mon, 15 Dec 2025 10:40:25 +0000
Subject: mm/vmalloc: clarify why vmap_range_noflush() might sleep

The only reason vmap_range_noflush() can sleep is because of pagetable allocations. The actual allocation mechanism is arch-specific so might_alloc() doesn't work here (what GFP flags would be used?). Hence, just add a comment.

Also note that this might do a TLB shootdown. This is not actually sleeping, but it requires IRQs on for x86, and might_sleep() incidentally serves to detect violations of that too.

Link: https://lkml.kernel.org/r/20251215-b4-vmalloc-might_alloc-v3-1-92dd8e406868@google.com
Signed-off-by: Brendan Jackman
Reviewed-by: Uladzislau Rezki (Sony)
Reviewed-by: Anshuman Khandual
Signed-off-by: Andrew Morton
--- mm/vmalloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 628f96e83b11..429a893b0505 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -305,6 +305,11 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end, int err; pgtbl_mod_mask mask = 0; + /* + * Might allocate pagetables (for most archs a more precise annotation + * would be might_alloc(GFP_PGTABLE_KERNEL)). Also might shootdown TLB + * (requires IRQs enabled on x86). + */ might_sleep(); BUG_ON(addr >= end); -- cgit v1.2.3

From 817383b34db1e7d2a74d2d2b51cb0eed1586253b Mon Sep 17 00:00:00 2001
From: Enze Li
Date: Tue, 2 Dec 2025 16:23:40 +0800
Subject: mm/damon/core: fix memory leak of repeat mode damon_call_control objects

A memory leak exists in the handling of repeat mode damon_call_control objects by kdamond_call(). While damon_call() correctly allows multiple repeat mode objects (with ->repeat set to true) to be added to the per-context list, kdamond_call() incorrectly processes them.

The function moves all repeat mode objects from the context's list to a temporary list (repeat_controls). However, it only moves the first object back to the context's list for future calls, leaving the remaining objects on the temporary list where they are abandoned and leaked.

This patch fixes the leak by ensuring all repeat mode objects are properly re-added to the context's list.

Note that the leak does not occur in the real world, and therefore no user is impacted. It is only a potential issue for imaginary damon_call() use cases that do not exist in the tree for now. In more detail, the leak happens only when multiple repeat mode objects are expected to be deallocated by kdamond_call() (damon_call_control->dealloc_on_cancel is set). There are no such damon_call() use cases at the moment.
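For concreteness, the sketch below shows the kind of hypothetical damon_call() usage described above: several repeat-mode controls with dealloc_on_cancel set, registered against one context. No such caller exists in the tree; apart from damon_call(), ->repeat and ->dealloc_on_cancel, which are named above, the field names and helper are assumptions for illustration only.

```c
/*
 * Hypothetical caller, for illustration only: with the old kdamond_call()
 * behaviour, only the first of these controls would be moved back to
 * ctx->call_controls; the rest would remain on the temporary list and,
 * since kdamond is expected to free them (dealloc_on_cancel), be leaked.
 */
#include <linux/damon.h>
#include <linux/slab.h>

static int example_repeat_fn(void *data)
{
	/* periodic work driven by kdamond would go here */
	return 0;
}

static int register_example_repeat_calls(struct damon_ctx *ctx, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct damon_call_control *control;

		control = kzalloc(sizeof(*control), GFP_KERNEL);
		if (!control)
			return -ENOMEM;
		control->fn = example_repeat_fn;	/* assumed callback field */
		control->repeat = true;			/* repeat mode, as above */
		control->dealloc_on_cancel = true;	/* kdamond frees it on cancel */
		if (damon_call(ctx, control)) {
			kfree(control);
			return -EINVAL;
		}
	}
	return 0;
}
```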
Link: https://lkml.kernel.org/r/20251202082340.34178-1-lienze@kylinos.cn Fixes: 43df7676e550 ("mm/damon/core: introduce repeat mode damon_call()") Signed-off-by: Enze Li Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 84f80a20f233..c852cac4f82e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2606,13 +2606,19 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) list_add(&control->list, &repeat_controls); } } - control = list_first_entry_or_null(&repeat_controls, - struct damon_call_control, list); - if (!control || cancel) - return; - mutex_lock(&ctx->call_controls_lock); - list_add_tail(&control->list, &ctx->call_controls); - mutex_unlock(&ctx->call_controls_lock); + while (true) { + control = list_first_entry_or_null(&repeat_controls, + struct damon_call_control, list); + if (!control) + break; + /* Unlink from the repeate_controls list. */ + list_del(&control->list); + if (cancel) + continue; + mutex_lock(&ctx->call_controls_lock); + list_add(&control->list, &ctx->call_controls); + mutex_unlock(&ctx->call_controls_lock); + } } /* Returns negative error code if it's not activated but should return */ -- cgit v1.2.3 From 2a912d440c6024148a25850c7c4066b152ec8750 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 15 Dec 2025 16:47:37 +0100 Subject: alloc_tag: move memory_allocation_profiling_sysctls into .rodata Remove the change in file mode permissions done before initializing the sysctl. It is not necessary as the writing of the kernel variable will be blocked by the proc_mem_profiling_handler when writing is disallowed (also controlled by mem_profiling_support). Link: https://lkml.kernel.org/r/20251215-jag-alloc_tag_const-v1-1-35ea56a1ce13@kernel.org Signed-off-by: Joel Granados Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 27fee57a5c91..846a5b5b44a4 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -783,7 +783,7 @@ static int proc_mem_profiling_handler(const struct ctl_table *table, int write, } -static struct ctl_table memory_allocation_profiling_sysctls[] = { +static const struct ctl_table memory_allocation_profiling_sysctls[] = { { .procname = "mem_profiling", .data = &mem_alloc_profiling_key, @@ -798,9 +798,6 @@ static struct ctl_table memory_allocation_profiling_sysctls[] = { static void __init sysctl_init(void) { - if (!mem_profiling_support) - memory_allocation_profiling_sysctls[0].mode = 0444; - register_sysctl_init("vm", memory_allocation_profiling_sysctls); } #else /* CONFIG_SYSCTL */ -- cgit v1.2.3 From 58852f24f9566602340130804bf7f4474a3f5f2a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 15 Dec 2025 15:03:10 +0000 Subject: powerpc/64s: do not re-activate batched TLB flush Patch series "Nesting support for lazy MMU mode", v6. When the lazy MMU mode was introduced eons ago, it wasn't made clear whether such a sequence was legal: arch_enter_lazy_mmu_mode() ... arch_enter_lazy_mmu_mode() ... arch_leave_lazy_mmu_mode() ... arch_leave_lazy_mmu_mode() It seems fair to say that nested calls to arch_{enter,leave}_lazy_mmu_mode() were not expected, and most architectures never explicitly supported it. Nesting does in fact occur in certain configurations, and avoiding it has proved difficult. 
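To make the problem concrete, here is a simplified sketch of how such nesting can arise in practice, based on the CONFIG_DEBUG_PAGEALLOC scenario discussed later in this series; the function bodies are illustrative only, not actual kernel code.

```c
/*
 * Illustrative only: an outer lazy_mmu section (e.g. a PTE zapping loop)
 * performs a page allocation; with CONFIG_DEBUG_PAGEALLOC this ends up
 * changing linear-map permissions via apply_to_page_range(), which itself
 * enters lazy MMU mode again, i.e. the calls nest.
 */
#include <linux/mm.h>
#include <linux/pgtable.h>

static void outer_pte_walk(void)
{
	arch_enter_lazy_mmu_mode();		/* outer enter */

	/* ... batched PTE updates ... */

	/* allocation below may reach apply_to_page_range() on the linear map */
	(void)alloc_page(GFP_ATOMIC);		/* page not used further in this sketch */

	arch_leave_lazy_mmu_mode();		/* outer leave */
}

/* ... invoked while the outer section is still active: */
static void inner_linear_map_update(void)
{
	arch_enter_lazy_mmu_mode();		/* nested enter */
	/* ... set_pte_at() on linear-map PTEs ... */
	arch_leave_lazy_mmu_mode();		/* nested leave */
}
```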
This series therefore enables lazy_mmu sections to nest, on all architectures. Nesting is handled using a counter in task_struct (patch 8), like other stateless APIs such as pagefault_{disable,enable}(). This is fully handled in a new generic layer in ; the arch_* API remains unchanged. A new pair of calls, lazy_mmu_mode_{pause,resume}(), is also introduced to allow functions that are called with the lazy MMU mode enabled to temporarily pause it, regardless of nesting. An arch now opts in to using the lazy MMU mode by selecting CONFIG_ARCH_LAZY_MMU; this is more appropriate now that we have a generic API, especially with state conditionally added to task_struct. This patch (of 14): Since commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu mode") a task can not be preempted while in lazy MMU mode. Therefore, the batch re-activation code is never called, so remove it. Link: https://lkml.kernel.org/r/20251215150323.2218608-1-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20251215150323.2218608-2-kevin.brodsky@arm.com Signed-off-by: Alexander Gordeev Signed-off-by: Kevin Brodsky Reviewed-by: David Hildenbrand Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Ryan Roberts Tested-by: Venkat Rao Bagalkote Reviewed-by: Yeoreum Yun Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: levi.yun Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Will Deacon Cc: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/thread_info.h | 2 -- arch/powerpc/kernel/process.c | 25 ------------------------- 2 files changed, 27 deletions(-) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index b0f200aba2b3..97f35f9b1a96 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -154,12 +154,10 @@ void arch_setup_new_exec(void); /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ #define TLF_NAPPING 0 /* idle thread enabled NAP mode */ #define TLF_SLEEPING 1 /* suspend code enabled SLEEP mode */ -#define TLF_LAZY_MMU 3 /* tlb_batch is active */ #define TLF_RUNLATCH 4 /* Is the runlatch enabled? 
*/ #define _TLF_NAPPING (1 << TLF_NAPPING) #define _TLF_SLEEPING (1 << TLF_SLEEPING) -#define _TLF_LAZY_MMU (1 << TLF_LAZY_MMU) #define _TLF_RUNLATCH (1 << TLF_RUNLATCH) #ifndef __ASSEMBLER__ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a45fe147868b..a15d0b619b1f 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1281,9 +1281,6 @@ struct task_struct *__switch_to(struct task_struct *prev, { struct thread_struct *new_thread, *old_thread; struct task_struct *last; -#ifdef CONFIG_PPC_64S_HASH_MMU - struct ppc64_tlb_batch *batch; -#endif new_thread = &new->thread; old_thread = ¤t->thread; @@ -1291,14 +1288,6 @@ struct task_struct *__switch_to(struct task_struct *prev, WARN_ON(!irqs_disabled()); #ifdef CONFIG_PPC_64S_HASH_MMU - batch = this_cpu_ptr(&ppc64_tlb_batch); - if (batch->active) { - current_thread_info()->local_flags |= _TLF_LAZY_MMU; - if (batch->index) - __flush_tlb_pending(batch); - batch->active = 0; - } - /* * On POWER9 the copy-paste buffer can only paste into * foreign real addresses, so unprivileged processes can not @@ -1369,20 +1358,6 @@ struct task_struct *__switch_to(struct task_struct *prev, */ #ifdef CONFIG_PPC_BOOK3S_64 -#ifdef CONFIG_PPC_64S_HASH_MMU - /* - * This applies to a process that was context switched while inside - * arch_enter_lazy_mmu_mode(), to re-activate the batch that was - * deactivated above, before _switch(). This will never be the case - * for new tasks. - */ - if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { - current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; - batch = this_cpu_ptr(&ppc64_tlb_batch); - batch->active = 1; - } -#endif - /* * Math facilities are masked out of the child MSR in copy_thread. * A new task does not need to restore_math because it will -- cgit v1.2.3 From 66bdd779d3441329d0c55af1b679d97e10e7dfde Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:11 +0000 Subject: x86/xen: simplify flush_lazy_mmu() arch_flush_lazy_mmu_mode() is called when outstanding batched pgtable operations must be completed immediately. There should however be no need to leave and re-enter lazy MMU completely. The only part of that sequence that we really need is xen_mc_flush(); call it directly. Link: https://lkml.kernel.org/r/20251215150323.2218608-3-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Juergen Gross Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/xen/mmu_pv.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2a4a8deaf612..7a35c3393df4 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2139,10 +2139,8 @@ static void xen_flush_lazy_mmu(void) { preempt_disable(); - if (xen_get_lazy_mode() == XEN_LAZY_MMU) { - arch_leave_lazy_mmu_mode(); - arch_enter_lazy_mmu_mode(); - } + if (xen_get_lazy_mode() == XEN_LAZY_MMU) + xen_mc_flush(); preempt_enable(); } -- cgit v1.2.3 From c3f0778ffeca271b3b221fcdec66784c1eb9440d Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:12 +0000 Subject: powerpc/mm: implement arch_flush_lazy_mmu_mode() Upcoming changes to the lazy_mmu API will cause arch_flush_lazy_mmu_mode() to be called when leaving a nested lazy_mmu section. Move the relevant logic from arch_leave_lazy_mmu_mode() to arch_flush_lazy_mmu_mode() and have the former call the latter. The radix_enabled() check is required in both as arch_flush_lazy_mmu_mode() will be called directly from the generic layer in a subsequent patch. Note: the additional this_cpu_ptr() and radix_enabled() calls on the arch_leave_lazy_mmu_mode() path will be removed in a subsequent patch. Link: https://lkml.kernel.org/r/20251215150323.2218608-4-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Tested-by: Venkat Rao Bagalkote Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 146287d9580f..2d45f57df169 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -41,7 +41,7 @@ static inline void arch_enter_lazy_mmu_mode(void) batch->active = 1; } -static inline void arch_leave_lazy_mmu_mode(void) +static inline void arch_flush_lazy_mmu_mode(void) { struct ppc64_tlb_batch *batch; @@ -51,12 +51,21 @@ static inline void arch_leave_lazy_mmu_mode(void) if (batch->index) __flush_tlb_pending(batch); +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + struct ppc64_tlb_batch *batch; + + if (radix_enabled()) + return; + batch = this_cpu_ptr(&ppc64_tlb_batch); + + arch_flush_lazy_mmu_mode(); batch->active = 0; preempt_enable(); } -#define arch_flush_lazy_mmu_mode() do {} while (0) - extern void hash__tlbiel_all(unsigned int action); extern void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, -- cgit v1.2.3 From 442bf488b9e876712c4c86783c3b6818c4042f26 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:13 +0000 Subject: sparc/mm: implement arch_flush_lazy_mmu_mode() Upcoming changes to the lazy_mmu API will cause arch_flush_lazy_mmu_mode() to be called when leaving a nested lazy_mmu section. Move the relevant logic from arch_leave_lazy_mmu_mode() to arch_flush_lazy_mmu_mode() and have the former call the latter. Note: the additional this_cpu_ptr() call on the arch_leave_lazy_mmu_mode() path will be removed in a subsequent patch. Link: https://lkml.kernel.org/r/20251215150323.2218608-5-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Yeoreum Yun Acked-by: Andreas Larsson Cc: Alexander Gordeev Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sparc/include/asm/tlbflush_64.h | 2 +- arch/sparc/mm/tlb.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index 8b8cdaa69272..925bb5d7a4e1 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -43,8 +43,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end); void flush_tlb_pending(void); void arch_enter_lazy_mmu_mode(void); +void arch_flush_lazy_mmu_mode(void); void arch_leave_lazy_mmu_mode(void); -#define arch_flush_lazy_mmu_mode() do {} while (0) /* Local cpu only. 
*/ void __flush_tlb_all(void); diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index a35ddcca5e76..7b5dfcdb1243 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -59,12 +59,19 @@ void arch_enter_lazy_mmu_mode(void) tb->active = 1; } -void arch_leave_lazy_mmu_mode(void) +void arch_flush_lazy_mmu_mode(void) { struct tlb_batch *tb = this_cpu_ptr(&tlb_batch); if (tb->tlb_nr) flush_tlb_pending(); +} + +void arch_leave_lazy_mmu_mode(void) +{ + struct tlb_batch *tb = this_cpu_ptr(&tlb_batch); + + arch_flush_lazy_mmu_mode(); tb->active = 0; preempt_enable(); } -- cgit v1.2.3 From f2be745071ffd6793c032ca8443348c3ce0e3e18 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:14 +0000 Subject: mm: clarify lazy_mmu sleeping constraints The lazy MMU mode documentation makes clear that an implementation should not assume that preemption is disabled or any lock is held upon entry to the mode; however it says nothing about what code using the lazy MMU interface should expect. In practice sleeping is forbidden (for generic code) while the lazy MMU mode is active: say it explicitly. Link: https://lkml.kernel.org/r/20251215150323.2218608-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Yeoreum Yun Acked-by: David Hildenbrand (Red Hat) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 652f287c1ef6..1abc4a1c3d72 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -225,11 +225,15 @@ static inline int pmd_dirty(pmd_t pmd) * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit - * of the lazy mode. So the implementation must assume preemption may be enabled - * and cpu migration is possible; it must take steps to be robust against this. - * (In practice, for user PTE updates, the appropriate page table lock(s) are - * held, but for kernel PTE updates, no lock is held). Nesting is not permitted - * and the mode cannot be used in interrupt context. + * of the lazy mode. (In practice, for user PTE updates, the appropriate page + * table lock(s) are held, but for kernel PTE updates, no lock is held). + * The implementation must therefore assume preemption may be enabled upon + * entry to the mode and cpu migration is possible; it must take steps to be + * robust against this. An implementation may handle this by disabling + * preemption, as a consequence generic code may not sleep while the lazy MMU + * mode is active. + * + * Nesting is not permitted and the mode cannot be used in interrupt context. 
*/ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} -- cgit v1.2.3 From 7303ecbfe4f46c00191b9b66acaa918784bad210 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:15 +0000 Subject: mm: introduce CONFIG_ARCH_HAS_LAZY_MMU_MODE Architectures currently opt in for implementing lazy_mmu helpers by defining __HAVE_ARCH_ENTER_LAZY_MMU_MODE. In preparation for introducing a generic lazy_mmu layer that will require storage in task_struct, let's switch to a cleaner approach: instead of defining a macro, select a CONFIG option. This patch introduces CONFIG_ARCH_HAS_LAZY_MMU_MODE and has each arch select it when it implements lazy_mmu helpers. __HAVE_ARCH_ENTER_LAZY_MMU_MODE is removed and relies on the new CONFIG instead. On x86, lazy_mmu helpers are only implemented if PARAVIRT_XXL is selected. This creates some complications in arch/x86/boot/, because a few files manually undefine PARAVIRT* options. As a result does not define the lazy_mmu helpers, but this breaks the build as only defines them if !CONFIG_ARCH_HAS_LAZY_MMU_MODE. There does not seem to be a clean way out of this - let's just undefine that new CONFIG too. Link: https://lkml.kernel.org/r/20251215150323.2218608-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Ryan Roberts Reviewed-by: Yeoreum Yun Acked-by: Andreas Larsson [sparc] Cc: Alexander Gordeev Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 2 -- arch/powerpc/platforms/Kconfig.cputype | 1 + arch/sparc/Kconfig | 1 + arch/sparc/include/asm/tlbflush_64.h | 2 -- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/misc.h | 1 + arch/x86/boot/startup/sme.c | 1 + arch/x86/include/asm/paravirt.h | 1 - include/linux/pgtable.h | 2 +- mm/Kconfig | 7 +++++++ 12 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 93173f0a09c7..3fb4603c0e16 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -35,6 +35,7 @@ config ARM64 select ARCH_HAS_KCOV select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON select ARCH_HAS_KEEPINITRD + select ARCH_HAS_LAZY_MMU_MODE select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEM_ENCRYPT select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 64d5f1d9cce9..f7d66c261347 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -80,7 +80,6 @@ static inline void queue_pte_barriers(void) } } -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { /* diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 2d45f57df169..565c1b7c3eae 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -24,8 +24,6 @@ 
DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE - static inline void arch_enter_lazy_mmu_mode(void) { struct ppc64_tlb_batch *batch; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 4c321a8ea896..f399917c17bd 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -93,6 +93,7 @@ config PPC_BOOK3S_64 select IRQ_WORK select PPC_64S_HASH_MMU if !PPC_RADIX_MMU select KASAN_VMALLOC if KASAN + select ARCH_HAS_LAZY_MMU_MODE config PPC_BOOK3E_64 bool "Embedded processors" diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index a630d373e645..2bad14744ca4 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -112,6 +112,7 @@ config SPARC64 select NEED_PER_CPU_PAGE_FIRST_CHUNK select ARCH_SUPPORTS_SCHED_SMT if SMP select ARCH_SUPPORTS_SCHED_MC if SMP + select ARCH_HAS_LAZY_MMU_MODE config ARCH_PROC_KCORE_TEXT def_bool y diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index 925bb5d7a4e1..4e1036728e2f 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -39,8 +39,6 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, void flush_tlb_kernel_range(unsigned long start, unsigned long end); -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE - void flush_tlb_pending(void); void arch_enter_lazy_mmu_mode(void); void arch_flush_lazy_mmu_mode(void); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 80527299f859..2427a66cb0fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -808,6 +808,7 @@ config PARAVIRT config PARAVIRT_XXL bool depends on X86_64 + select ARCH_HAS_LAZY_MMU_MODE config PARAVIRT_DEBUG bool "paravirt-ops debugging" diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index fd855e32c9b9..4f86c5903e03 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -11,6 +11,7 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_ARCH_HAS_LAZY_MMU_MODE #undef CONFIG_KASAN #undef CONFIG_KASAN_GENERIC diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index e7ea65f3f1d6..b76a7c95dfe1 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -24,6 +24,7 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_ARCH_HAS_LAZY_MMU_MODE /* * This code runs before CPU feature bits are set. By default, the diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index b5e59a7ba0d0..13f9cd31c8f8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -526,7 +526,6 @@ static inline void arch_end_context_switch(struct task_struct *next) PVOP_VCALL1(cpu.end_context_switch, next); } -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 1abc4a1c3d72..d46d86959bd6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -235,7 +235,7 @@ static inline int pmd_dirty(pmd_t pmd) * * Nesting is not permitted and the mode cannot be used in interrupt context. 
*/ -#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} static inline void arch_leave_lazy_mmu_mode(void) {} static inline void arch_flush_lazy_mmu_mode(void) {} diff --git a/mm/Kconfig b/mm/Kconfig index a992f2203eb9..7c2520e6a6b3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1468,6 +1468,13 @@ config PT_RECLAIM config FIND_NORMAL_PAGE def_bool n +config ARCH_HAS_LAZY_MMU_MODE + bool + help + The architecture uses the lazy MMU mode. This allows changes to + MMU-related architectural state to be deferred until the mode is + exited. See for details. + source "mm/damon/Kconfig" endmenu -- cgit v1.2.3 From 0a096ab7a3a6e2859c3c88988e548c5c213138bc Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:16 +0000 Subject: mm: introduce generic lazy_mmu helpers The implementation of the lazy MMU mode is currently entirely arch-specific; core code directly calls arch helpers: arch_{enter,leave}_lazy_mmu_mode(). We are about to introduce support for nested lazy MMU sections. As things stand we'd have to duplicate that logic in every arch implementing lazy_mmu - adding to a fair amount of logic already duplicated across lazy_mmu implementations. This patch therefore introduces a new generic layer that calls the existing arch_* helpers. Two pair of calls are introduced: * lazy_mmu_mode_enable() ... lazy_mmu_mode_disable() This is the standard case where the mode is enabled for a given block of code by surrounding it with enable() and disable() calls. * lazy_mmu_mode_pause() ... lazy_mmu_mode_resume() This is for situations where the mode is temporarily disabled by first calling pause() and then resume() (e.g. to prevent any batching from occurring in a critical section). The documentation in will be updated in a subsequent patch. No functional change should be introduced at this stage. The implementation of enable()/resume() and disable()/pause() is currently identical, but nesting support will change that. Most of the call sites have been updated using the following Coccinelle script: @@ @@ { ... - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ... - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); ... } @@ @@ { ... - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); ... - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); ... } A couple of notes regarding x86: * Xen is currently the only case where explicit handling is required for lazy MMU when context-switching. This is purely an implementation detail and using the generic lazy_mmu_mode_* functions would cause trouble when nesting support is introduced, because the generic functions must be called from the current task. For that reason we still use arch_leave() and arch_enter() there. * x86 calls arch_flush_lazy_mmu_mode() unconditionally in a few places, but only defines it if PARAVIRT_XXL is selected, and we are removing the fallback in . Add a new fallback definition to to keep things building. Link: https://lkml.kernel.org/r/20251215150323.2218608-8-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 8 ++++---- arch/arm64/mm/pageattr.c | 4 ++-- arch/powerpc/mm/book3s64/hash_tlb.c | 8 ++++---- arch/powerpc/mm/book3s64/subpage_prot.c | 4 ++-- arch/x86/include/asm/pgtable.h | 1 + fs/proc/task_mmu.c | 4 ++-- include/linux/pgtable.h | 29 +++++++++++++++++++++++++---- mm/kasan/shadow.c | 8 ++++---- mm/madvise.c | 18 +++++++++--------- mm/memory.c | 16 ++++++++-------- mm/migrate_device.c | 8 ++++---- mm/mprotect.c | 4 ++-- mm/mremap.c | 4 ++-- mm/userfaultfd.c | 4 ++-- mm/vmalloc.c | 12 ++++++------ mm/vmscan.c | 12 ++++++------ 16 files changed, 83 insertions(+), 61 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8e1d80a7033e..a6a00accf4f9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -800,7 +800,7 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) return -EINVAL; mutex_lock(&pgtable_split_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); /* * The split_kernel_leaf_mapping_locked() may sleep, it is not a @@ -822,7 +822,7 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) ret = split_kernel_leaf_mapping_locked(end); } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); mutex_unlock(&pgtable_split_lock); return ret; } @@ -883,10 +883,10 @@ static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp { int ret; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ret = walk_kernel_page_table_range_lockless(start, end, &split_to_ptes_ops, NULL, &gfp); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); return ret; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 7176ff39cb87..358d1dc9a576 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -110,7 +110,7 @@ static int update_range_prot(unsigned long start, unsigned long size, if (WARN_ON_ONCE(ret)) return ret; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); /* * The caller must ensure that the range we are operating on does not @@ -119,7 +119,7 @@ static int update_range_prot(unsigned long start, unsigned long size, */ ret = walk_kernel_page_table_range_lockless(start, start + size, &pageattr_ops, NULL, &data); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); return ret; } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 21fcad97ae80..787f7a0e27f0 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -205,7 +205,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) * way to do things but is fine for our needs here. 
*/ local_irq_save(flags); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; start < end; start += PAGE_SIZE) { pte_t *ptep = find_init_mm_pte(start, &hugepage_shift); unsigned long pte; @@ -217,7 +217,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) continue; hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift); } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); local_irq_restore(flags); } @@ -237,7 +237,7 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long * way to do things but is fine for our needs here. */ local_irq_save(flags); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); start_pte = pte_offset_map(pmd, addr); if (!start_pte) goto out; @@ -249,6 +249,6 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long } pte_unmap(start_pte); out: - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index ec98e526167e..07c47673bba2 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -73,13 +73,13 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; npages > 0; --npages) { pte_update(mm, addr, pte, 0, 0, 0); addr += PAGE_SIZE; ++pte; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e33df3da6980..2842fa1f7a2c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -118,6 +118,7 @@ extern pmdval_t early_pmd_flags; #define __pte(x) native_make_pte(x) #define arch_end_context_switch(prev) do {} while(0) +static inline void arch_flush_lazy_mmu_mode(void) {} #endif /* CONFIG_PARAVIRT_XXL */ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 81dfc26bfae8..480db575553e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2739,7 +2739,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, return 0; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ @@ -2809,7 +2809,7 @@ flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d46d86959bd6..116a18b7916c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -235,10 +235,31 @@ static inline int pmd_dirty(pmd_t pmd) * * Nesting is not permitted and the mode cannot be used in interrupt context. 
*/ -#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE -static inline void arch_enter_lazy_mmu_mode(void) {} -static inline void arch_leave_lazy_mmu_mode(void) {} -static inline void arch_flush_lazy_mmu_mode(void) {} +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +static inline void lazy_mmu_mode_enable(void) +{ + arch_enter_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_disable(void) +{ + arch_leave_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_pause(void) +{ + arch_leave_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_resume(void) +{ + arch_enter_lazy_mmu_mode(); +} +#else +static inline void lazy_mmu_mode_enable(void) {} +static inline void lazy_mmu_mode_disable(void) {} +static inline void lazy_mmu_mode_pause(void) {} +static inline void lazy_mmu_mode_resume(void) {} #endif #ifndef pte_batch_hint diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 32fbdf759ea2..d286e0a04543 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -305,7 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); index = PFN_DOWN(addr - data->start); page = data->pages[index]; @@ -319,7 +319,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, } spin_unlock(&init_mm.page_table_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } @@ -471,7 +471,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int none; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); spin_lock(&init_mm.page_table_lock); pte = ptep_get(ptep); @@ -483,7 +483,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, if (likely(!none)) __free_page(pfn_to_page(pte_pfn(pte))); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index b617b1be0f53..6bf7009fa5ce 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -453,7 +453,7 @@ restart: if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); @@ -461,7 +461,7 @@ restart: if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; @@ -497,7 +497,7 @@ restart: if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -508,7 +508,7 @@ restart: if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -556,7 +556,7 @@ restart: } if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } if (pageout) @@ -675,7 +675,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); @@ -724,7 +724,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -735,7 +735,7 @@ static int 
madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -775,7 +775,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); diff --git a/mm/memory.c b/mm/memory.c index da360a6eb8a4..e0bce673f053 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1256,7 +1256,7 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr = 1; @@ -1325,7 +1325,7 @@ again: } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); @@ -1846,7 +1846,7 @@ retry: return addr; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { bool any_skipped = false; @@ -1878,7 +1878,7 @@ retry: direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { @@ -2816,7 +2816,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { @@ -2826,7 +2826,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(mapped_pte, ptl); return err; } @@ -3177,7 +3177,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, return -EINVAL; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (fn) { do { @@ -3190,7 +3190,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } *mask |= PGTBL_PTE_MODIFIED; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 23379663b1e1..0346c2d7819f 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -271,7 +271,7 @@ again: ptep = pte_offset_map_lock(mm, pmdp, start, &ptl); if (!ptep) goto again; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ptep += (addr - start) / PAGE_SIZE; for (; addr < end; addr += PAGE_SIZE, ptep++) { @@ -313,7 +313,7 @@ again: if (folio_test_large(folio)) { int ret; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep, ptl); ret = migrate_vma_split_folio(folio, migrate->fault_page); @@ -356,7 +356,7 @@ again: if (folio && folio_test_large(folio)) { int ret; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep, ptl); ret = migrate_vma_split_folio(folio, migrate->fault_page); @@ -485,7 +485,7 @@ next: if (unmapped) flush_tlb_range(walk->vma, start, end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep - 1, ptl); return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index 283889e4f1ce..c0571445bef7 100644 --- 
a/mm/mprotect.c +++ b/mm/mprotect.c @@ -233,7 +233,7 @@ static long change_pte_range(struct mmu_gather *tlb, is_private_single_threaded = vma_is_single_threaded_private(vma); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr_ptes = 1; oldpte = ptep_get(pte); @@ -379,7 +379,7 @@ static long change_pte_range(struct mmu_gather *tlb, } } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); return pages; diff --git a/mm/mremap.c b/mm/mremap.c index 672264807db6..8275b9772ec1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -260,7 +260,7 @@ static int move_ptes(struct pagetable_move_control *pmc, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE, new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) { @@ -305,7 +305,7 @@ static int move_ptes(struct pagetable_move_control *pmc, } } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (force_flush) flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e6dfd5f28acd..b11f81095fa5 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1103,7 +1103,7 @@ static long move_present_ptes(struct mm_struct *mm, /* It's safe to drop the reference now as the page-table is holding one. */ folio_put(*first_src_folio); *first_src_folio = NULL; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); while (true) { orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1140,7 +1140,7 @@ static long move_present_ptes(struct mm_struct *mm, break; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (src_addr > src_start) flush_tlb_range(src_vma, src_start, src_addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 429a893b0505..32d6ee92d4ff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { if (unlikely(!pte_none(ptep_get(pte)))) { @@ -134,7 +134,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -371,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { #ifdef CONFIG_HUGETLB_PAGE @@ -390,7 +390,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; } @@ -538,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { struct page *page = pages[*nr]; @@ -560,7 +560,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= 
PGTBL_PTE_MODIFIED; return err; diff --git a/mm/vmscan.c b/mm/vmscan.c index 614ccf39fe3f..6cf5ee94be7a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3516,7 +3516,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, return false; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -3557,7 +3557,7 @@ restart: if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); @@ -3598,7 +3598,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (!spin_trylock(ptl)) goto done; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { unsigned long pfn; @@ -3645,7 +3645,7 @@ next: walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); spin_unlock(ptl); done: *first = -1; @@ -4244,7 +4244,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; @@ -4278,7 +4278,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) -- cgit v1.2.3 From 9273dfaeaca8ea4d88c7e9fd081922a029984fd4 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:17 +0000 Subject: mm: bail out of lazy_mmu_mode_* in interrupt context The lazy MMU mode cannot be used in interrupt context. This is documented in , but isn't consistently handled across architectures. arm64 ensures that calls to lazy_mmu_mode_* have no effect in interrupt context, because such calls do occur in certain configurations - see commit b81c688426a9 ("arm64/mm: Disable barrier batching in interrupt contexts"). Other architectures do not check this situation, most likely because it hasn't occurred so far. Let's handle this in the new generic lazy_mmu layer, in the same fashion as arm64: bail out of lazy_mmu_mode_* if in_interrupt(). Also remove the arm64 handling that is now redundant. Both arm64 and x86/Xen also ensure that any lazy MMU optimisation is disabled while in interrupt (see queue_pte_barriers() and xen_get_lazy_mode() respectively). This will be handled in the generic layer in a subsequent patch. Link: https://lkml.kernel.org/r/20251215150323.2218608-9-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 9 --------- include/linux/pgtable.h | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f7d66c261347..bf9178902bdb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -94,26 +94,17 @@ static inline void arch_enter_lazy_mmu_mode(void) * keeps tracking simple. */ - if (in_interrupt()) - return; - set_thread_flag(TIF_LAZY_MMU); } static inline void arch_flush_lazy_mmu_mode(void) { - if (in_interrupt()) - return; - if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING)) emit_pte_barriers(); } static inline void arch_leave_lazy_mmu_mode(void) { - if (in_interrupt()) - return; - arch_flush_lazy_mmu_mode(); clear_thread_flag(TIF_LAZY_MMU); } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 116a18b7916c..dddde6873d1e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -233,26 +233,41 @@ static inline int pmd_dirty(pmd_t pmd) * preemption, as a consequence generic code may not sleep while the lazy MMU * mode is active. * - * Nesting is not permitted and the mode cannot be used in interrupt context. + * The mode is disabled in interrupt context and calls to the lazy_mmu API have + * no effect. + * + * Nesting is not permitted. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline void lazy_mmu_mode_enable(void) { + if (in_interrupt()) + return; + arch_enter_lazy_mmu_mode(); } static inline void lazy_mmu_mode_disable(void) { + if (in_interrupt()) + return; + arch_leave_lazy_mmu_mode(); } static inline void lazy_mmu_mode_pause(void) { + if (in_interrupt()) + return; + arch_leave_lazy_mmu_mode(); } static inline void lazy_mmu_mode_resume(void) { + if (in_interrupt()) + return; + arch_enter_lazy_mmu_mode(); } #else -- cgit v1.2.3 From 5ab246749569cff9f815618f02ba0d7cf20e5edd Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:18 +0000 Subject: mm: enable lazy_mmu sections to nest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite recent efforts to prevent lazy_mmu sections from nesting, it remains difficult to ensure that it never occurs - and in fact it does occur on arm64 in certain situations (CONFIG_DEBUG_PAGEALLOC). Commit 1ef3095b1405 ("arm64/mm: Permit lazy_mmu_mode to be nested") made nesting tolerable on arm64, but without truly supporting it: the inner call to leave() disables the batching optimisation before the outer section ends. This patch actually enables lazy_mmu sections to nest by tracking the nesting level in task_struct, in a similar fashion to e.g. pagefault_{enable,disable}(). This is fully handled by the generic lazy_mmu helpers that were recently introduced. lazy_mmu sections were not initially intended to nest, so we need to clarify the semantics w.r.t. the arch_*_lazy_mmu_mode() callbacks. This patch takes the following approach: * The outermost calls to lazy_mmu_mode_{enable,disable}() trigger calls to arch_{enter,leave}_lazy_mmu_mode() - this is unchanged. 
* Nested calls to lazy_mmu_mode_{enable,disable}() are not forwarded to the arch via arch_{enter,leave} - lazy MMU remains enabled so the assumption is that these callbacks are not relevant. However, existing code may rely on a call to disable() to flush any batched state, regardless of nesting. arch_flush_lazy_mmu_mode() is therefore called in that situation. A separate interface was recently introduced to temporarily pause the lazy MMU mode: lazy_mmu_mode_{pause,resume}(). pause() fully exits the mode *regardless of the nesting level*, and resume() restores the mode at the same nesting level. pause()/resume() are themselves allowed to nest, so we actually store two nesting levels in task_struct: enable_count and pause_count. A new helper is_lazy_mmu_mode_active() is introduced to determine whether we are currently in lazy MMU mode; this will be used in subsequent patches to replace the various ways arch's currently track whether the mode is enabled. In summary (enable/pause represent the values *after* the call): lazy_mmu_mode_enable() -> arch_enter() enable=1 pause=0 lazy_mmu_mode_enable() -> ø enable=2 pause=0 lazy_mmu_mode_pause() -> arch_leave() enable=2 pause=1 lazy_mmu_mode_resume() -> arch_enter() enable=2 pause=0 lazy_mmu_mode_disable() -> arch_flush() enable=1 pause=0 lazy_mmu_mode_disable() -> arch_leave() enable=0 pause=0 Note: is_lazy_mmu_mode_active() is added to to allow arch headers included by to use it. Link: https://lkml.kernel.org/r/20251215150323.2218608-10-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 12 ----- include/linux/mm_types_task.h | 5 ++ include/linux/pgtable.h | 114 ++++++++++++++++++++++++++++++++++++--- include/linux/sched.h | 45 ++++++++++++++++ 4 files changed, 157 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bf9178902bdb..7f528c36d53c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -82,18 +82,6 @@ static inline void queue_pte_barriers(void) static inline void arch_enter_lazy_mmu_mode(void) { - /* - * lazy_mmu_mode is not supposed to permit nesting. But in practice this - * does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation - * inside a lazy_mmu_mode section (such as zap_pte_range()) will change - * permissions on the linear map with apply_to_page_range(), which - * re-enters lazy_mmu_mode. So we tolerate nesting in our - * implementation. The first call to arch_leave_lazy_mmu_mode() will - * flush and clear the flag such that the remainder of the work in the - * outer nest behaves as if outside of lazy mmu mode. This is safe and - * keeps tracking simple. 
- */ - set_thread_flag(TIF_LAZY_MMU); } diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index a82aa80c0ba4..11bf319d78ec 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -88,4 +88,9 @@ struct tlbflush_unmap_batch { #endif }; +struct lazy_mmu_state { + u8 enable_count; + u8 pause_count; +}; + #endif /* _LINUX_MM_TYPES_TASK_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index dddde6873d1e..2f0dd3a4ace1 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -236,39 +236,139 @@ static inline int pmd_dirty(pmd_t pmd) * The mode is disabled in interrupt context and calls to the lazy_mmu API have * no effect. * - * Nesting is not permitted. + * The lazy MMU mode is enabled for a given block of code using: + * + * lazy_mmu_mode_enable(); + * + * lazy_mmu_mode_disable(); + * + * Nesting is permitted: may itself use an enable()/disable() pair. + * A nested call to enable() has no functional effect; however disable() causes + * any batched architectural state to be flushed regardless of nesting. After a + * call to disable(), the caller can therefore rely on all previous page table + * modifications to have taken effect, but the lazy MMU mode may still be + * enabled. + * + * In certain cases, it may be desirable to temporarily pause the lazy MMU mode. + * This can be done using: + * + * lazy_mmu_mode_pause(); + * + * lazy_mmu_mode_resume(); + * + * pause() ensures that the mode is exited regardless of the nesting level; + * resume() re-enters the mode at the same nesting level. Any call to the + * lazy_mmu_mode_* API between those two calls has no effect. In particular, + * this means that pause()/resume() pairs may nest. + * + * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is + * currently enabled. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +/** + * lazy_mmu_mode_enable() - Enable the lazy MMU mode. + * + * Enters a new lazy MMU mode section; if the mode was not already enabled, + * enables it and calls arch_enter_lazy_mmu_mode(). + * + * Must be paired with a call to lazy_mmu_mode_disable(). + * + * Has no effect if called: + * - While paused - see lazy_mmu_mode_pause() + * - In interrupt context + */ static inline void lazy_mmu_mode_enable(void) { - if (in_interrupt()) + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + if (in_interrupt() || state->pause_count > 0) return; - arch_enter_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->enable_count == U8_MAX); + + if (state->enable_count++ == 0) + arch_enter_lazy_mmu_mode(); } +/** + * lazy_mmu_mode_disable() - Disable the lazy MMU mode. + * + * Exits the current lazy MMU mode section. If it is the outermost section, + * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested + * section), calls arch_flush_lazy_mmu_mode(). + * + * Must match a call to lazy_mmu_mode_enable(). + * + * Has no effect if called: + * - While paused - see lazy_mmu_mode_pause() + * - In interrupt context + */ static inline void lazy_mmu_mode_disable(void) { - if (in_interrupt()) + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + if (in_interrupt() || state->pause_count > 0) return; - arch_leave_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->enable_count == 0); + + if (--state->enable_count == 0) + arch_leave_lazy_mmu_mode(); + else /* Exiting a nested section */ + arch_flush_lazy_mmu_mode(); + } +/** + * lazy_mmu_mode_pause() - Pause the lazy MMU mode. 
+ * + * Pauses the lazy MMU mode; if it is currently active, disables it and calls + * arch_leave_lazy_mmu_mode(). + * + * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the + * lazy_mmu_mode_* API have no effect until the matching resume() call. + * + * Has no effect if called: + * - While paused (inside another pause()/resume() pair) + * - In interrupt context + */ static inline void lazy_mmu_mode_pause(void) { + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) return; - arch_leave_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->pause_count == U8_MAX); + + if (state->pause_count++ == 0 && state->enable_count > 0) + arch_leave_lazy_mmu_mode(); } +/** + * lazy_mmu_mode_resume() - Resume the lazy MMU mode. + * + * Resumes the lazy MMU mode; if it was active at the point where the matching + * call to lazy_mmu_mode_pause() was made, re-enables it and calls + * arch_enter_lazy_mmu_mode(). + * + * Must match a call to lazy_mmu_mode_pause(). + * + * Has no effect if called: + * - While paused (inside another pause()/resume() pair) + * - In interrupt context + */ static inline void lazy_mmu_mode_resume(void) { + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) return; - arch_enter_lazy_mmu_mode(); + VM_WARN_ON_ONCE(state->pause_count == 0); + + if (--state->pause_count == 0 && state->enable_count > 0) + arch_enter_lazy_mmu_mode(); } #else static inline void lazy_mmu_mode_enable(void) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index da0133524d08..6b563d4e68f6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1419,6 +1419,10 @@ struct task_struct { struct page_frag task_frag; +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE + struct lazy_mmu_state lazy_mmu_state; +#endif + #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif @@ -1702,6 +1706,47 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +/** + * __task_lazy_mmu_mode_active() - Test the lazy MMU mode state for a task. + * @tsk: The task to check. + * + * Test whether @tsk has its lazy MMU mode state set to active (i.e. enabled + * and not paused). + * + * This function only considers the state saved in task_struct; to test whether + * current actually is in lazy MMU mode, is_lazy_mmu_mode_active() should be + * used instead. + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. + */ +static inline bool __task_lazy_mmu_mode_active(struct task_struct *tsk) +{ + struct lazy_mmu_state *state = &tsk->lazy_mmu_state; + + return state->enable_count > 0 && state->pause_count == 0; +} + +/** + * is_lazy_mmu_mode_active() - Test whether we are currently in lazy MMU mode. + * + * Test whether the current context is in lazy MMU mode. This is true if both: + * 1. We are not in interrupt context + * 2. Lazy MMU mode is active for the current task + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. 
+ */ +static inline bool is_lazy_mmu_mode_active(void) +{ + if (in_interrupt()) + return false; + + return __task_lazy_mmu_mode_active(current); +} +#endif + extern struct pid *cad_pid; /* -- cgit v1.2.3 From 4dd9b4d7a8d5537b982a6b35a6309c0517fc3da3 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:19 +0000 Subject: arm64: mm: replace TIF_LAZY_MMU with is_lazy_mmu_mode_active() The generic lazy_mmu layer now tracks whether a task is in lazy MMU mode. As a result we no longer need a TIF flag for that purpose - let's use the new is_lazy_mmu_mode_active() helper instead. The explicit check for in_interrupt() is no longer necessary either as is_lazy_mmu_mode_active() always returns false in interrupt context. Link: https://lkml.kernel.org/r/20251215150323.2218608-11-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 19 +++---------------- arch/arm64/include/asm/thread_info.h | 3 +-- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7f528c36d53c..445e18e92221 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -62,28 +62,16 @@ static inline void emit_pte_barriers(void) static inline void queue_pte_barriers(void) { - unsigned long flags; - - if (in_interrupt()) { - emit_pte_barriers(); - return; - } - - flags = read_thread_flags(); - - if (flags & BIT(TIF_LAZY_MMU)) { + if (is_lazy_mmu_mode_active()) { /* Avoid the atomic op if already set. 
*/ - if (!(flags & BIT(TIF_LAZY_MMU_PENDING))) + if (!test_thread_flag(TIF_LAZY_MMU_PENDING)) set_thread_flag(TIF_LAZY_MMU_PENDING); } else { emit_pte_barriers(); } } -static inline void arch_enter_lazy_mmu_mode(void) -{ - set_thread_flag(TIF_LAZY_MMU); -} +static inline void arch_enter_lazy_mmu_mode(void) {} static inline void arch_flush_lazy_mmu_mode(void) { @@ -94,7 +82,6 @@ static inline void arch_flush_lazy_mmu_mode(void) static inline void arch_leave_lazy_mmu_mode(void) { arch_flush_lazy_mmu_mode(); - clear_thread_flag(TIF_LAZY_MMU); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index a803b887b0b4..e7cd017b07c8 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -84,8 +84,7 @@ void arch_setup_new_exec(void); #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */ #define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */ -#define TIF_LAZY_MMU 31 /* Task in lazy mmu mode */ -#define TIF_LAZY_MMU_PENDING 32 /* Ops pending for lazy mmu mode exit */ +#define TIF_LAZY_MMU_PENDING 31 /* Ops pending for lazy mmu mode exit */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -- cgit v1.2.3 From 313a05a15a1b29c29c7eb4ae8cf44a7cf0fcf419 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:20 +0000 Subject: powerpc/mm: replace batch->active with is_lazy_mmu_mode_active() A per-CPU batch struct is activated when entering lazy MMU mode; its lifetime is the same as the lazy MMU section (it is deactivated when leaving the mode). Preemption is disabled in that interval to ensure that the per-CPU reference remains valid. The generic lazy_mmu layer now tracks whether a task is in lazy MMU mode. We can therefore use the generic helper is_lazy_mmu_mode_active() to tell whether a batch struct is active instead of tracking it explicitly. Link: https://lkml.kernel.org/r/20251215150323.2218608-12-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Ritesh Harjani (IBM) Tested-by: Venkat Rao Bagalkote Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand (Red Hat) Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 9 --------- arch/powerpc/mm/book3s64/hash_tlb.c | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 565c1b7c3eae..6cc9abcd7b3d 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -12,7 +12,6 @@ #define PPC64_TLB_BATCH_NR 192 struct ppc64_tlb_batch { - int active; unsigned long index; struct mm_struct *mm; real_pte_t pte[PPC64_TLB_BATCH_NR]; @@ -26,8 +25,6 @@ extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); static inline void arch_enter_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch; - if (radix_enabled()) return; /* @@ -35,8 +32,6 @@ static inline void arch_enter_lazy_mmu_mode(void) * operating on kernel page tables. */ preempt_disable(); - batch = this_cpu_ptr(&ppc64_tlb_batch); - batch->active = 1; } static inline void arch_flush_lazy_mmu_mode(void) @@ -53,14 +48,10 @@ static inline void arch_flush_lazy_mmu_mode(void) static inline void arch_leave_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch; - if (radix_enabled()) return; - batch = this_cpu_ptr(&ppc64_tlb_batch); arch_flush_lazy_mmu_mode(); - batch->active = 0; preempt_enable(); } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 787f7a0e27f0..fbdeb8981ae7 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -100,7 +100,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, * Check if we have an active batch on this CPU. If not, just * flush now and return. */ - if (!batch->active) { + if (!is_lazy_mmu_mode_active()) { flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm)); put_cpu_var(ppc64_tlb_batch); return; -- cgit v1.2.3 From dacd24ec4965d92cd5ef338c2b86d5e7e3722bed Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:21 +0000 Subject: sparc/mm: replace batch->active with is_lazy_mmu_mode_active() A per-CPU batch struct is activated when entering lazy MMU mode; its lifetime is the same as the lazy MMU section (it is deactivated when leaving the mode). Preemption is disabled in that interval to ensure that the per-CPU reference remains valid. The generic lazy_mmu layer now tracks whether a task is in lazy MMU mode. We can therefore use the generic helper is_lazy_mmu_mode_active() to tell whether a batch struct is active instead of tracking it explicitly. Link: https://lkml.kernel.org/r/20251215150323.2218608-13-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Yeoreum Yun Acked-by: Andreas Larsson Cc: Alexander Gordeev Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sparc/include/asm/tlbflush_64.h | 1 - arch/sparc/mm/tlb.c | 9 +-------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index 4e1036728e2f..6133306ba59a 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -12,7 +12,6 @@ struct tlb_batch { unsigned int hugepage_shift; struct mm_struct *mm; unsigned long tlb_nr; - unsigned long active; unsigned long vaddrs[TLB_BATCH_NR]; }; diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 7b5dfcdb1243..3a852071d260 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -52,11 +52,7 @@ out: void arch_enter_lazy_mmu_mode(void) { - struct tlb_batch *tb; - preempt_disable(); - tb = this_cpu_ptr(&tlb_batch); - tb->active = 1; } void arch_flush_lazy_mmu_mode(void) @@ -69,10 +65,7 @@ void arch_flush_lazy_mmu_mode(void) void arch_leave_lazy_mmu_mode(void) { - struct tlb_batch *tb = this_cpu_ptr(&tlb_batch); - arch_flush_lazy_mmu_mode(); - tb->active = 0; preempt_enable(); } @@ -93,7 +86,7 @@ static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr, nr = 0; } - if (!tb->active) { + if (!is_lazy_mmu_mode_active()) { flush_tsb_user_page(mm, vaddr, hugepage_shift); global_flush_tlb_page(mm, vaddr); goto out; -- cgit v1.2.3 From 291b3abed657cb3c485cce7753e0913dc408cf85 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:22 +0000 Subject: x86/xen: use lazy_mmu_state when context-switching We currently set a TIF flag when scheduling out a task that is in lazy MMU mode, in order to restore it when the task is scheduled again. The generic lazy_mmu layer now tracks whether a task is in lazy MMU mode in task_struct::lazy_mmu_state. We can therefore check that state when switching to the new task, instead of using a separate TIF flag. Link: https://lkml.kernel.org/r/20251215150323.2218608-14-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Juergen Gross Reviewed-by: Yeoreum Yun Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/thread_info.h | 4 +--- arch/x86/xen/enlighten_pv.c | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e71e0e8362ed..0067684afb5b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -100,8 +100,7 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_SINGLESTEP 25 /* reenable singlestep on user return*/ #define TIF_BLOCKSTEP 26 /* set when we want DEBUGCTLMSR_BTF */ -#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */ +#define TIF_ADDR32 27 /* 32-bit address space on 64 bits */ #define _TIF_SSBD BIT(TIF_SSBD) #define _TIF_SPEC_IB BIT(TIF_SPEC_IB) @@ -114,7 +113,6 @@ struct thread_info { #define _TIF_FORCED_TF BIT(TIF_FORCED_TF) #define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP) #define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP) -#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES) #define _TIF_ADDR32 BIT(TIF_ADDR32) /* flags to check in __switch_to() */ diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index b74ff8bc7f2a..ae52436a741f 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -426,7 +426,6 @@ static void xen_start_context_switch(struct task_struct *prev) if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) { arch_leave_lazy_mmu_mode(); - set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } enter_lazy(XEN_LAZY_CPU); } @@ -437,7 +436,7 @@ static void xen_end_context_switch(struct task_struct *next) xen_mc_flush(); leave_lazy(XEN_LAZY_CPU); - if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + if (__task_lazy_mmu_mode_active(next)) arch_enter_lazy_mmu_mode(); } -- cgit v1.2.3 From ee628d9cc8d5b96fdceeb270cf662efc4f85f2b6 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 15 Dec 2025 15:03:23 +0000 Subject: mm: add basic tests for lazy_mmu Add basic KUnit tests for the generic aspects of the lazy MMU mode: ensure that it appears active when it should, depending on how enable/disable and pause/resume pairs are nested. [akpm@linux-foundation.org: export ppc64_tlb_batch and __flush_tlb_pending to modules] [ritesh.list@gmail.com: use EXPORT_SYMBOL_IF_KUNIT()] Link: https://lkml.kernel.org/r/87a4zhkt6h.ritesh.list@gmail.com [kevin.brodsky@arm.com: move MODULE_IMPORT_NS(), add comment] Link: https://lkml.kernel.org/r/20251217163812.2633648-2-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20251215150323.2218608-15-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Yeoreum Yun Signed-off-by: Ritesh Harjani (IBM) Acked-by: David Hildenbrand (Red Hat) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anshuman Khandual Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. Miller Cc: David Woodhouse Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juegren Gross Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Venkat Rao Bagalkote Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/hash_tlb.c | 4 +- mm/Kconfig | 12 ++++++ mm/Makefile | 1 + mm/tests/lazy_mmu_mode_kunit.c | 74 +++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 mm/tests/lazy_mmu_mode_kunit.c diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index fbdeb8981ae7..ec2941cec815 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -25,11 +25,12 @@ #include #include #include - +#include #include DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); +EXPORT_SYMBOL_IF_KUNIT(ppc64_tlb_batch); /* * A linux PTE was changed and the corresponding hash table entry @@ -154,6 +155,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) flush_hash_range(i, local); batch->index = 0; } +EXPORT_SYMBOL_IF_KUNIT(__flush_tlb_pending); void hash__tlb_flush(struct mmu_gather *tlb) { diff --git a/mm/Kconfig b/mm/Kconfig index 7c2520e6a6b3..5f4d6e5b5715 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1475,6 +1475,18 @@ config ARCH_HAS_LAZY_MMU_MODE MMU-related architectural state to be deferred until the mode is exited. See for details. +config LAZY_MMU_MODE_KUNIT_TEST + tristate "KUnit tests for the lazy MMU mode" if !KUNIT_ALL_TESTS + depends on ARCH_HAS_LAZY_MMU_MODE + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to check that the lazy MMU mode interface behaves + as expected. Only tests for the generic interface are included (not + architecture-specific behaviours). + + If unsure, say N. 
+ source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 2d0570a16e5b..9175f8cc6565 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o +obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o diff --git a/mm/tests/lazy_mmu_mode_kunit.c b/mm/tests/lazy_mmu_mode_kunit.c new file mode 100644 index 000000000000..1c23456b467e --- /dev/null +++ b/mm/tests/lazy_mmu_mode_kunit.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +/* For some symbols referenced by arch_{enter,leave}_lazy_mmu_mode on powerpc */ +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); + +static void expect_not_active(struct kunit *test) +{ + KUNIT_EXPECT_FALSE(test, is_lazy_mmu_mode_active()); +} + +static void expect_active(struct kunit *test) +{ + KUNIT_EXPECT_TRUE(test, is_lazy_mmu_mode_active()); +} + +static void lazy_mmu_mode_active(struct kunit *test) +{ + expect_not_active(test); + + lazy_mmu_mode_enable(); + expect_active(test); + + { + /* Nested section */ + lazy_mmu_mode_enable(); + expect_active(test); + + lazy_mmu_mode_disable(); + expect_active(test); + } + + { + /* Paused section */ + lazy_mmu_mode_pause(); + expect_not_active(test); + + { + /* No effect (paused) */ + lazy_mmu_mode_enable(); + expect_not_active(test); + + lazy_mmu_mode_disable(); + expect_not_active(test); + + lazy_mmu_mode_pause(); + expect_not_active(test); + + lazy_mmu_mode_resume(); + expect_not_active(test); + } + + lazy_mmu_mode_resume(); + expect_active(test); + } + + lazy_mmu_mode_disable(); + expect_not_active(test); +} + +static struct kunit_case lazy_mmu_mode_test_cases[] = { + KUNIT_CASE(lazy_mmu_mode_active), + {} +}; + +static struct kunit_suite lazy_mmu_mode_test_suite = { + .name = "lazy_mmu_mode", + .test_cases = lazy_mmu_mode_test_cases, +}; +kunit_test_suite(lazy_mmu_mode_test_suite); + +MODULE_DESCRIPTION("Tests for the lazy MMU mode"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From bf3480d7d0bce40d8687559fd6ff40c233a7052f Mon Sep 17 00:00:00 2001 From: Weilin Tong Date: Mon, 15 Dec 2025 10:46:32 +0800 Subject: mm/shmem: add mTHP swpout fallback statistics in shmem_writeout() Currently, when shmem mTHPs are split and swapped out via shmem_writeout(), there are no unified statistics to trace these mTHP swpout fallback events. This makes it difficult to analyze the prevalence of mTHP splitting and fallback during swap operations, which is important for memory diagnostics. Here we add statistics counting for mTHP fallback to small pages when splitting and swapping out in shmem_writeout(). 
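In essence, the change samples the folio order before the split and bumps the
fallback counters once the split has succeeded. A minimal sketch of that
pattern (the helper name is illustrative only; the real hunk is in the diff
below):

	/* 'order' was sampled before the split: folio_order() is 0 afterwards */
	static void shmem_count_swpout_fallback(struct folio *folio, int order)
	{
	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		/* PMD-sized folios also feed the legacy THP counters */
		if (order >= HPAGE_PMD_ORDER) {
			count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
			count_vm_event(THP_SWPOUT_FALLBACK);
		}
	#endif
		/* per-order mTHP counter */
		count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
	}

The per-order counters can then be read from
/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/swpout_fallback,
and the PMD-level event from /proc/vmstat (thp_swpout_fallback).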
Link: https://lkml.kernel.org/r/20251215024632.250149-1-tongweilin@linux.alibaba.com Signed-off-by: Weilin Tong Reviewed-by: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index ec6c01378e9d..15c2943140ca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1593,11 +1593,23 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, } if (split) { + int order; + try_split: + order = folio_order(folio); /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order >= HPAGE_PMD_ORDER) { + count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } +#endif + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); + folio_clear_dirty(folio); } -- cgit v1.2.3 From 7adc97bc93946e55fc6af30a03d296fb833a28df Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 15 Dec 2025 11:05:56 -0800 Subject: mm/vmscan.c:shrink_folio_list(): save a tabstop We have some needlessly deep indentation in this huge function due to if (expr1) { if (expr2) { ... } } Convert this to if (expr1 && expr2) { ... } Also, reflow that big block comment to fit in 80 cols. Cc: Johannes Weiner Cc: David Hildenbrand Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Lorenzo Stoakes Cc: Axel Rasmussen Cc: Yuanchu Xie Cc: Wei Xu Signed-off-by: Andrew Morton --- mm/vmscan.c | 98 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6cf5ee94be7a..67234613fbff 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1276,58 +1276,58 @@ retry: * Try to allocate it some swap space here. * Lazyfree folio could be freed directly */ - if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { - if (!folio_test_swapcache(folio)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (folio_maybe_dma_pinned(folio)) - goto keep_locked; - if (folio_test_large(folio)) { - /* cannot split folio, skip it */ - if (folio_expected_ref_count(folio) != - folio_ref_count(folio) - 1) - goto activate_locked; - /* - * Split partially mapped folios right away. - * We can free the unmapped pages without IO. - */ - if (data_race(!list_empty(&folio->_deferred_list) && - folio_test_partially_mapped(folio)) && - split_folio_to_list(folio, folio_list)) - goto activate_locked; - } - if (folio_alloc_swap(folio)) { - int __maybe_unused order = folio_order(folio); - - if (!folio_test_large(folio)) - goto activate_locked_split; - /* Fallback to swap normal pages */ - if (split_folio_to_list(folio, folio_list)) - goto activate_locked; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (nr_pages >= HPAGE_PMD_NR) { - count_memcg_folio_events(folio, - THP_SWPOUT_FALLBACK, 1); - count_vm_event(THP_SWPOUT_FALLBACK); - } -#endif - count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (folio_alloc_swap(folio)) - goto activate_locked_split; - } + if (folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (folio_maybe_dma_pinned(folio)) + goto keep_locked; + if (folio_test_large(folio)) { + /* cannot split folio, skip it */ + if (folio_expected_ref_count(folio) != + folio_ref_count(folio) - 1) + goto activate_locked; /* - * Normally the folio will be dirtied in unmap because its - * pte should be dirty. 
A special case is MADV_FREE page. The - * page's pte could have dirty bit cleared but the folio's - * SwapBacked flag is still set because clearing the dirty bit - * and SwapBacked flag has no lock protected. For such folio, - * unmap will not set dirty bit for it, so folio reclaim will - * not write the folio out. This can cause data corruption when - * the folio is swapped in later. Always setting the dirty flag - * for the folio solves the problem. + * Split partially mapped folios right away. + * We can free the unmapped pages without IO. */ - folio_mark_dirty(folio); + if (data_race(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) && + split_folio_to_list(folio, folio_list)) + goto activate_locked; + } + if (folio_alloc_swap(folio)) { + int __maybe_unused order = folio_order(folio); + + if (!folio_test_large(folio)) + goto activate_locked_split; + /* Fallback to swap normal pages */ + if (split_folio_to_list(folio, folio_list)) + goto activate_locked; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages >= HPAGE_PMD_NR) { + count_memcg_folio_events(folio, + THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } +#endif + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); + if (folio_alloc_swap(folio)) + goto activate_locked_split; } + /* + * Normally the folio will be dirtied in unmap because + * its pte should be dirty. A special case is MADV_FREE + * page. The page's pte could have dirty bit cleared but + * the folio's SwapBacked flag is still set because + * clearing the dirty bit and SwapBacked flag has no + * lock protected. For such folio, unmap will not set + * dirty bit for it, so folio reclaim will not write the + * folio out. This can cause data corruption when the + * folio is swapped in later. Always setting the dirty + * flag for the folio solves the problem. + */ + folio_mark_dirty(folio); } /* -- cgit v1.2.3 From d38fab605c66778a8ddfbe2ac66c3a3eb7b2295a Mon Sep 17 00:00:00 2001 From: Richard Chang Date: Mon, 1 Dec 2025 18:47:48 +0900 Subject: zram: introduce compressed data writeback Patch series "zram: introduce compressed data writeback", v2. As writeback becomes more common there is another shortcoming that needs to be addressed - compressed data writeback. Currently zram does uncompressed data writeback which is not optimal due to potential CPU and battery wastage. This series changes suboptimal uncompressed writeback to a more optimal compressed data writeback. This patch (of 7): zram stores all written back slots raw, which implies that during writeback zram first has to decompress slots (except for ZRAM_HUGE slots, which are raw already). The problem with this approach is that not every written back page gets read back (either via read() or via page-fault), which means that zram basically wastes CPU cycles and battery decompressing such slots. This changes with introduction of decompression on demand, in other words decompression on read()/page-fault. One caveat of decompression on demand is that async read is completed in IRQ context, while zram decompression is sleepable. To workaround this, read-back decompression is offloaded to a preemptible context - system high-prio work-queue. At this point compressed writeback is still disabled, a follow up patch will introduce a new device attribute which will make it possible to toggle compressed writeback per-device. 
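The hand-off follows the usual endio-to-workqueue pattern; a stripped-down
sketch (structure and function names here are illustrative, the real
implementation is zram_async_read_endio()/zram_deferred_decompress() in the
diff below):

	#include <linux/bio.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct rb_req {
		struct work_struct work;
		struct bio *bio;	/* read bio issued to the backing device */
		struct bio *parent;	/* original bio to complete */
	};

	static void rb_deferred_decompress(struct work_struct *w)
	{
		struct rb_req *req = container_of(w, struct rb_req, work);

		/* preemptible context: the sleepable decompression goes here */

		bio_endio(req->parent);		/* complete the original IO */
		bio_put(req->bio);
		kfree(req);
	}

	static void rb_read_endio(struct bio *bio)
	{
		struct rb_req *req = bio->bi_private;

		/* IRQ context: must not sleep, defer the decompression */
		INIT_WORK(&req->work, rb_deferred_decompress);
		queue_work(system_highpri_wq, &req->work);
	}

Error handling and the raw (uncompressed) writeback case are omitted; see the
full zram_rb_req handling below.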
[senozhatsky@chromium.org: rewrote original implementation] Link: https://lkml.kernel.org/r/20251201094754.4149975-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20251201094754.4149975-2-senozhatsky@chromium.org Signed-off-by: Richard Chang Co-developed-by: Sergey Senozhatsky Suggested-by: Minchan Kim Suggested-by: Brian Geffon Cc: David Stevens Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 279 ++++++++++++++++++++++++++++++++++-------- drivers/block/zram/zram_drv.h | 1 + 2 files changed, 227 insertions(+), 53 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 5759823d6314..6263d300312e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -57,9 +57,6 @@ static size_t huge_class_size; static const struct block_device_operations zram_devops; static void zram_free_page(struct zram *zram, size_t index); -static int zram_read_from_zspool(struct zram *zram, struct page *page, - u32 index); - #define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map) static void zram_slot_lock_init(struct zram *zram, u32 index) @@ -502,6 +499,10 @@ out: #ifdef CONFIG_ZRAM_WRITEBACK #define INVALID_BDEV_BLOCK (~0UL) +static int read_from_zspool_raw(struct zram *zram, struct page *page, + u32 index); +static int read_from_zspool(struct zram *zram, struct page *page, u32 index); + struct zram_wb_ctl { /* idle list is accessed only by the writeback task, no concurency */ struct list_head idle_reqs; @@ -522,6 +523,22 @@ struct zram_wb_req { struct list_head entry; }; +struct zram_rb_req { + struct work_struct work; + struct zram *zram; + struct page *page; + /* The read bio for backing device */ + struct bio *bio; + unsigned long blk_idx; + union { + /* The original bio to complete (async read) */ + struct bio *parent; + /* error status (sync read) */ + int error; + }; + u32 index; +}; + static ssize_t writeback_limit_enable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -780,18 +797,6 @@ static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx) atomic64_dec(&zram->stats.bd_count); } -static void read_from_bdev_async(struct zram *zram, struct page *page, - unsigned long entry, struct bio *parent) -{ - struct bio *bio; - - bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO); - bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); - __bio_add_page(bio, page, PAGE_SIZE, 0); - bio_chain(bio, parent); - submit_bio(bio); -} - static void release_wb_req(struct zram_wb_req *req) { __free_page(req->page); @@ -886,8 +891,9 @@ static void zram_account_writeback_submit(struct zram *zram) static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) { - u32 index = req->pps->index; - int err; + u32 size, index = req->pps->index; + int err, prio; + bool huge; err = blk_status_to_errno(req->bio.bi_status); if (err) { @@ -914,9 +920,27 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) goto out; } + if (zram->wb_compressed) { + /* + * ZRAM_WB slots get freed, we need to preserve data required + * for read decompression. 
+ */ + size = zram_get_obj_size(zram, index); + prio = zram_get_priority(zram, index); + huge = zram_test_flag(zram, index, ZRAM_HUGE); + } + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_WB); zram_set_handle(zram, index, req->blk_idx); + + if (zram->wb_compressed) { + if (huge) + zram_set_flag(zram, index, ZRAM_HUGE); + zram_set_obj_size(zram, index, size); + zram_set_priority(zram, index, prio); + } + atomic64_inc(&zram->stats.pages_stored); out: @@ -1050,7 +1074,11 @@ static int zram_writeback_slots(struct zram *zram, */ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) goto next; - if (zram_read_from_zspool(zram, req->page, index)) + if (zram->wb_compressed) + err = read_from_zspool_raw(zram, req->page, index); + else + err = read_from_zspool(zram, req->page, index); + if (err) goto next; zram_slot_unlock(zram, index); @@ -1313,24 +1341,140 @@ release_init_lock: return ret; } -struct zram_work { - struct work_struct work; - struct zram *zram; - unsigned long entry; - struct page *page; - int error; -}; +static int decompress_bdev_page(struct zram *zram, struct page *page, u32 index) +{ + struct zcomp_strm *zstrm; + unsigned int size; + int ret, prio; + void *src; + + zram_slot_lock(zram, index); + /* Since slot was unlocked we need to make sure it's still ZRAM_WB */ + if (!zram_test_flag(zram, index, ZRAM_WB)) { + zram_slot_unlock(zram, index); + /* We read some stale data, zero it out */ + memset_page(page, 0, 0, PAGE_SIZE); + return -EIO; + } + + if (zram_test_flag(zram, index, ZRAM_HUGE)) { + zram_slot_unlock(zram, index); + return 0; + } + + size = zram_get_obj_size(zram, index); + prio = zram_get_priority(zram, index); -static void zram_sync_read(struct work_struct *work) + zstrm = zcomp_stream_get(zram->comps[prio]); + src = kmap_local_page(page); + ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, + zstrm->local_copy); + if (!ret) + copy_page(src, zstrm->local_copy); + kunmap_local(src); + zcomp_stream_put(zstrm); + zram_slot_unlock(zram, index); + + return ret; +} + +static void zram_deferred_decompress(struct work_struct *w) { - struct zram_work *zw = container_of(work, struct zram_work, work); + struct zram_rb_req *req = container_of(w, struct zram_rb_req, work); + struct page *page = bio_first_page_all(req->bio); + struct zram *zram = req->zram; + u32 index = req->index; + int ret; + + ret = decompress_bdev_page(zram, page, index); + if (ret) + req->parent->bi_status = BLK_STS_IOERR; + + /* Decrement parent's ->remaining */ + bio_endio(req->parent); + bio_put(req->bio); + kfree(req); +} + +static void zram_async_read_endio(struct bio *bio) +{ + struct zram_rb_req *req = bio->bi_private; + struct zram *zram = req->zram; + + if (bio->bi_status) { + req->parent->bi_status = bio->bi_status; + bio_endio(req->parent); + bio_put(bio); + kfree(req); + return; + } + + /* + * NOTE: zram_async_read_endio() is not exactly right place for this. + * Ideally, we need to do it after ZRAM_WB check, but this requires + * us to use wq path even on systems that don't enable compressed + * writeback, because we cannot take slot-lock in the current context. + * + * Keep the existing behavior for now. + */ + if (zram->wb_compressed == false) { + /* No decompression needed, complete the parent IO */ + bio_endio(req->parent); + bio_put(bio); + kfree(req); + return; + } + + /* + * zram decompression is sleepable, so we need to deffer it to + * a preemptible context. 
+ */ + INIT_WORK(&req->work, zram_deferred_decompress); + queue_work(system_highpri_wq, &req->work); +} + +static void read_from_bdev_async(struct zram *zram, struct page *page, + u32 index, unsigned long blk_idx, + struct bio *parent) +{ + struct zram_rb_req *req; + struct bio *bio; + + req = kmalloc(sizeof(*req), GFP_NOIO); + if (!req) + return; + + bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO); + if (!bio) { + kfree(req); + return; + } + + req->zram = zram; + req->index = index; + req->blk_idx = blk_idx; + req->bio = bio; + req->parent = parent; + + bio->bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); + bio->bi_private = req; + bio->bi_end_io = zram_async_read_endio; + + __bio_add_page(bio, page, PAGE_SIZE, 0); + bio_inc_remaining(parent); + submit_bio(bio); +} + +static void zram_sync_read(struct work_struct *w) +{ + struct zram_rb_req *req = container_of(w, struct zram_rb_req, work); struct bio_vec bv; struct bio bio; - bio_init(&bio, zw->zram->bdev, &bv, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = zw->entry * (PAGE_SIZE >> 9); - __bio_add_page(&bio, zw->page, PAGE_SIZE, 0); - zw->error = submit_bio_wait(&bio); + bio_init(&bio, req->zram->bdev, &bv, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9); + __bio_add_page(&bio, req->page, PAGE_SIZE, 0); + req->error = submit_bio_wait(&bio); } /* @@ -1338,39 +1482,42 @@ static void zram_sync_read(struct work_struct *work) * chained IO with parent IO in same context, it's a deadlock. To avoid that, * use a worker thread context. */ -static int read_from_bdev_sync(struct zram *zram, struct page *page, - unsigned long entry) +static int read_from_bdev_sync(struct zram *zram, struct page *page, u32 index, + unsigned long blk_idx) { - struct zram_work work; + struct zram_rb_req req; - work.page = page; - work.zram = zram; - work.entry = entry; + req.page = page; + req.zram = zram; + req.blk_idx = blk_idx; - INIT_WORK_ONSTACK(&work.work, zram_sync_read); - queue_work(system_dfl_wq, &work.work); - flush_work(&work.work); - destroy_work_on_stack(&work.work); + INIT_WORK_ONSTACK(&req.work, zram_sync_read); + queue_work(system_dfl_wq, &req.work); + flush_work(&req.work); + destroy_work_on_stack(&req.work); - return work.error; + if (req.error || zram->wb_compressed == false) + return req.error; + + return decompress_bdev_page(zram, page, index); } -static int read_from_bdev(struct zram *zram, struct page *page, - unsigned long entry, struct bio *parent) +static int read_from_bdev(struct zram *zram, struct page *page, u32 index, + unsigned long blk_idx, struct bio *parent) { atomic64_inc(&zram->stats.bd_reads); if (!parent) { if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO))) return -EIO; - return read_from_bdev_sync(zram, page, entry); + return read_from_bdev_sync(zram, page, index, blk_idx); } - read_from_bdev_async(zram, page, entry, parent); + read_from_bdev_async(zram, page, index, blk_idx, parent); return 0; } #else static inline void reset_bdev(struct zram *zram) {}; -static int read_from_bdev(struct zram *zram, struct page *page, - unsigned long entry, struct bio *parent) +static int read_from_bdev(struct zram *zram, struct page *page, u32 index, + unsigned long blk_idx, struct bio *parent) { return -EIO; } @@ -1977,12 +2124,37 @@ static int read_compressed_page(struct zram *zram, struct page *page, u32 index) return ret; } +#if defined CONFIG_ZRAM_WRITEBACK +static int read_from_zspool_raw(struct zram *zram, struct page *page, u32 index) +{ + struct zcomp_strm *zstrm; + unsigned long handle; + unsigned int size; 
+ void *src; + + handle = zram_get_handle(zram, index); + size = zram_get_obj_size(zram, index); + + /* + * We need to get stream just for ->local_copy buffer, in + * case if object spans two physical pages. No decompression + * takes place here, as we read raw compressed data. + */ + zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); + src = zs_obj_read_begin(zram->mem_pool, handle, zstrm->local_copy); + memcpy_to_page(page, 0, src, size); + zs_obj_read_end(zram->mem_pool, handle, src); + zcomp_stream_put(zstrm); + + return 0; +} +#endif + /* * Reads (decompresses if needed) a page from zspool (zsmalloc). * Corresponding ZRAM slot should be locked. */ -static int zram_read_from_zspool(struct zram *zram, struct page *page, - u32 index) +static int read_from_zspool(struct zram *zram, struct page *page, u32 index) { if (zram_test_flag(zram, index, ZRAM_SAME) || !zram_get_handle(zram, index)) @@ -2002,7 +2174,7 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, zram_slot_lock(zram, index); if (!zram_test_flag(zram, index, ZRAM_WB)) { /* Slot should be locked through out the function call */ - ret = zram_read_from_zspool(zram, page, index); + ret = read_from_zspool(zram, page, index); zram_slot_unlock(zram, index); } else { unsigned long blk_idx = zram_get_handle(zram, index); @@ -2012,7 +2184,7 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, * device. */ zram_slot_unlock(zram, index); - ret = read_from_bdev(zram, page, blk_idx, parent); + ret = read_from_bdev(zram, page, index, blk_idx, parent); } /* Should NEVER happen. Return bio error if it does. */ @@ -2273,7 +2445,7 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, if (comp_len_old < threshold) return 0; - ret = zram_read_from_zspool(zram, page, index); + ret = read_from_zspool(zram, page, index); if (ret) return ret; @@ -2960,6 +3132,7 @@ static int zram_add(void) init_rwsem(&zram->init_lock); #ifdef CONFIG_ZRAM_WRITEBACK zram->wb_batch_size = 32; + zram->wb_compressed = false; #endif /* gendisk structure */ diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c6d94501376c..72fdf66c78ab 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -128,6 +128,7 @@ struct zram { #ifdef CONFIG_ZRAM_WRITEBACK struct file *backing_dev; bool wb_limit_enable; + bool wb_compressed; u32 wb_batch_size; u64 bd_wb_limit; struct block_device *bdev; -- cgit v1.2.3 From 4c1d61389e8e4307449eb2ebad997241cbf08fef Mon Sep 17 00:00:00 2001 From: Richard Chang Date: Mon, 1 Dec 2025 18:47:49 +0900 Subject: zram: introduce writeback_compressed device attribute Introduce witeback_compressed device attribute to toggle compressed writeback (decompression on demand) feature. 
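The toggle is init-time only: once init_done() is true the store handler
rejects the write with -EBUSY, which matches the documentation note below that
the feature has to be configured before the device is initialized. Condensed
from the store handler added in the diff below:

	if (kstrtobool(buf, &val))
		return -EINVAL;

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		/* too late, the device already has a disksize set */
		up_write(&zram->init_lock);
		return -EBUSY;
	}
	zram->wb_compressed = val;
	up_write(&zram->init_lock);

	return len;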
[senozhatsky@chromium.org: rewrote original patch, added documentation] Link: https://lkml.kernel.org/r/20251201094754.4149975-3-senozhatsky@chromium.org Signed-off-by: Richard Chang Co-developed-by: Sergey Senozhatsky Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-block-zram | 7 ++++++ Documentation/admin-guide/blockdev/zram.rst | 13 ++++++++++ drivers/block/zram/zram_drv.c | 38 +++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 36c57de0a10a..ed10c2e4b5c2 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -150,3 +150,10 @@ Contact: Sergey Senozhatsky Description: The algorithm_params file is write-only and is used to setup compression algorithm parameters. + +What: /sys/block/zram/writeback_compressed +Date: Decemeber 2025 +Contact: Richard Chang +Description: + The writeback_compressed device atrribute toggles compressed + writeback feature. diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 3e273c1bb749..9547e4e95979 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -214,6 +214,7 @@ mem_limit WO specifies the maximum amount of memory ZRAM can writeback_limit WO specifies the maximum amount of write IO zram can write out to backing device as 4KB unit writeback_limit_enable RW show and set writeback_limit feature +writeback_compressed RW show and set compressed writeback feature comp_algorithm RW show and change the compression algorithm algorithm_params WO setup compression algorithm parameters compact WO trigger memory compaction @@ -434,6 +435,18 @@ system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of writeback happened until you reset the zram to allocate extra writeback budget in next setting is user's job. +By default zram stores written back pages in decompressed (raw) form, which +means that writeback operation involves decompression of the page before +writing it to the backing device. This behavior can be changed by enabling +`writeback_compressed` feature, which causes zram to write compressed pages +to the backing device, thus avoiding decompression overhead. To enable +this feature, execute:: + + $ echo yes > /sys/block/zramX/writeback_compressed + +Note that this feature should be configured before the `zramX` device is +initialized. + If admin wants to measure writeback count in a certain period, they could know it via /sys/block/zram0/bd_stat's 3rd column. 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6263d300312e..3cc03c3f7389 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -539,6 +539,42 @@ struct zram_rb_req { u32 index; }; +static ssize_t writeback_compressed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + bool val; + + if (kstrtobool(buf, &val)) + return -EINVAL; + + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + return -EBUSY; + } + + zram->wb_compressed = val; + up_write(&zram->init_lock); + + return len; +} + +static ssize_t writeback_compressed_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + bool val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->wb_compressed; + up_read(&zram->init_lock); + + return sysfs_emit(buf, "%d\n", val); +} + static ssize_t writeback_limit_enable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -3048,6 +3084,7 @@ static DEVICE_ATTR_WO(writeback); static DEVICE_ATTR_RW(writeback_limit); static DEVICE_ATTR_RW(writeback_limit_enable); static DEVICE_ATTR_RW(writeback_batch_size); +static DEVICE_ATTR_RW(writeback_compressed); #endif #ifdef CONFIG_ZRAM_MULTI_COMP static DEVICE_ATTR_RW(recomp_algorithm); @@ -3070,6 +3107,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_writeback_limit.attr, &dev_attr_writeback_limit_enable.attr, &dev_attr_writeback_batch_size.attr, + &dev_attr_writeback_compressed.attr, #endif &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, -- cgit v1.2.3 From 2502673aed6c66befc7efc2dc008e2a8a50508cd Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 1 Dec 2025 18:47:50 +0900 Subject: zram: document writeback_batch_size Add missing writeback_batch_size documentation. Link: https://lkml.kernel.org/r/20251201094754.4149975-4-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-block-zram | 7 +++++++ Documentation/admin-guide/blockdev/zram.rst | 11 ++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index ed10c2e4b5c2..e538d4850d61 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -157,3 +157,10 @@ Contact: Richard Chang Description: The writeback_compressed device atrribute toggles compressed writeback feature. + +What: /sys/block/zram/writeback_batch_size +Date: November 2025 +Contact: Sergey Senozhatsky +Description: + The writeback_batch_size device atrribute sets the maximum + number of in-flight writeback operations. 
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 9547e4e95979..94bb7f2245ee 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -214,6 +214,8 @@ mem_limit WO specifies the maximum amount of memory ZRAM can writeback_limit WO specifies the maximum amount of write IO zram can write out to backing device as 4KB unit writeback_limit_enable RW show and set writeback_limit feature +writeback_batch_size RW show and set maximum number of in-flight + writeback operations writeback_compressed RW show and set compressed writeback feature comp_algorithm RW show and change the compression algorithm algorithm_params WO setup compression algorithm parameters @@ -223,7 +225,6 @@ backing_dev RW set up backend storage for zram to write out idle WO mark allocated slot as idle ====================== ====== =============================================== - User space is advised to use the following files to read the device statistics. File /sys/block/zram/stat @@ -447,6 +448,14 @@ this feature, execute:: Note that this feature should be configured before the `zramX` device is initialized. +Depending on backing device storage type, writeback operation may benefit +from a higher number of in-flight write requests (batched writes). The +number of maximum in-flight writeback operations can be configured via +`writeback_batch_size` attribute. To change the default value (which is 32), +execute:: + + $ echo 64 > /sys/block/zramX/writeback_batch_size + If admin wants to measure writeback count in a certain period, they could know it via /sys/block/zram0/bd_stat's 3rd column. -- cgit v1.2.3 From 910bbb441c004050e188dd8da5071054099e592c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 1 Dec 2025 18:47:51 +0900 Subject: zram: move bd_stat to writeback section Move bd_stat function and attribute declaration to existing CONFIG_WRITEBACK ifdef-sections. 
Link: https://lkml.kernel.org/r/20251201094754.4149975-5-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 48 +++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3cc03c3f7389..1a0f550219b1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -539,6 +539,24 @@ struct zram_rb_req { u32 index; }; +#define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12))) +static ssize_t bd_stat_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = sysfs_emit(buf, + "%8llu %8llu %8llu\n", + FOUR_K((u64)atomic64_read(&zram->stats.bd_count)), + FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)), + FOUR_K((u64)atomic64_read(&zram->stats.bd_writes))); + up_read(&zram->init_lock); + + return ret; +} + static ssize_t writeback_compressed_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -1976,28 +1994,8 @@ static ssize_t mm_stat_show(struct device *dev, return ret; } -#ifdef CONFIG_ZRAM_WRITEBACK -#define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12))) -static ssize_t bd_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - ssize_t ret; - - down_read(&zram->init_lock); - ret = sysfs_emit(buf, - "%8llu %8llu %8llu\n", - FOUR_K((u64)atomic64_read(&zram->stats.bd_count)), - FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)), - FOUR_K((u64)atomic64_read(&zram->stats.bd_writes))); - up_read(&zram->init_lock); - - return ret; -} -#endif - static ssize_t debug_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { int version = 1; struct zram *zram = dev_to_zram(dev); @@ -2015,9 +2013,6 @@ static ssize_t debug_stat_show(struct device *dev, static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); -#ifdef CONFIG_ZRAM_WRITEBACK -static DEVICE_ATTR_RO(bd_stat); -#endif static DEVICE_ATTR_RO(debug_stat); static void zram_meta_free(struct zram *zram, u64 disksize) @@ -3079,6 +3074,7 @@ static DEVICE_ATTR_WO(mem_used_max); static DEVICE_ATTR_WO(idle); static DEVICE_ATTR_RW(comp_algorithm); #ifdef CONFIG_ZRAM_WRITEBACK +static DEVICE_ATTR_RO(bd_stat); static DEVICE_ATTR_RW(backing_dev); static DEVICE_ATTR_WO(writeback); static DEVICE_ATTR_RW(writeback_limit); @@ -3102,6 +3098,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_idle.attr, &dev_attr_comp_algorithm.attr, #ifdef CONFIG_ZRAM_WRITEBACK + &dev_attr_bd_stat.attr, &dev_attr_backing_dev.attr, &dev_attr_writeback.attr, &dev_attr_writeback_limit.attr, @@ -3111,9 +3108,6 @@ static struct attribute *zram_disk_attrs[] = { #endif &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, -#ifdef CONFIG_ZRAM_WRITEBACK - &dev_attr_bd_stat.attr, -#endif &dev_attr_debug_stat.attr, #ifdef CONFIG_ZRAM_MULTI_COMP &dev_attr_recomp_algorithm.attr, -- cgit v1.2.3 From 7ad688c0cdc46d01fc46f6d226813715542c531e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 1 Dec 2025 18:47:52 +0900 Subject: zram: rename zram_free_page() We don't free page in zram_free_page(), not all slots even have any memory associated with them (e.g. ZRAM_SAME). We free the slot (or reset it), rename the function accordingly. 
Link: https://lkml.kernel.org/r/20251201094754.4149975-6-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1a0f550219b1..615756d5d05d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -56,7 +56,7 @@ static size_t huge_class_size; static const struct block_device_operations zram_devops; -static void zram_free_page(struct zram *zram, size_t index); +static void zram_slot_free(struct zram *zram, u32 index); #define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map) static void zram_slot_lock_init(struct zram *zram, u32 index) @@ -984,7 +984,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) huge = zram_test_flag(zram, index, ZRAM_HUGE); } - zram_free_page(zram, index); + zram_slot_free(zram, index); zram_set_flag(zram, index, ZRAM_WB); zram_set_handle(zram, index, req->blk_idx); @@ -2025,7 +2025,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) - zram_free_page(zram, index); + zram_slot_free(zram, index); zs_destroy_pool(zram->mem_pool); vfree(zram->table); @@ -2057,7 +2057,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) return true; } -static void zram_free_page(struct zram *zram, size_t index) +static void zram_slot_free(struct zram *zram, u32 index) { unsigned long handle; @@ -2256,7 +2256,7 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill, u32 index) { zram_slot_lock(zram, index); - zram_free_page(zram, index); + zram_slot_free(zram, index); zram_set_flag(zram, index, ZRAM_SAME); zram_set_handle(zram, index, fill); zram_slot_unlock(zram, index); @@ -2294,7 +2294,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, kunmap_local(src); zram_slot_lock(zram, index); - zram_free_page(zram, index); + zram_slot_free(zram, index); zram_set_flag(zram, index, ZRAM_HUGE); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, PAGE_SIZE); @@ -2359,7 +2359,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) zcomp_stream_put(zstrm); zram_slot_lock(zram, index); - zram_free_page(zram, index); + zram_slot_free(zram, index); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); @@ -2581,7 +2581,7 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, zs_obj_write(zram->mem_pool, handle_new, zstrm->buffer, comp_len_new); zcomp_stream_put(zstrm); - zram_free_page(zram, index); + zram_slot_free(zram, index); zram_set_handle(zram, index, handle_new); zram_set_obj_size(zram, index, comp_len_new); zram_set_priority(zram, index, prio); @@ -2784,7 +2784,7 @@ static void zram_bio_discard(struct zram *zram, struct bio *bio) while (n >= PAGE_SIZE) { zram_slot_lock(zram, index); - zram_free_page(zram, index); + zram_slot_free(zram, index); zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); index++; @@ -2892,7 +2892,7 @@ static void zram_slot_free_notify(struct block_device *bdev, return; } - zram_free_page(zram, index); + zram_slot_free(zram, index); zram_slot_unlock(zram, index); } -- cgit v1.2.3 From 
0d38260c2a11de147f0c4701b344fdfa6bcdd04c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 1 Dec 2025 18:47:53 +0900 Subject: zram: switch to guard() for init_lock Use init_lock guard() in sysfs store/show handlers, in order to simplify and, more importantly, to modernize the code. While at it, fix up more coding styles. Link: https://lkml.kernel.org/r/20251201094754.4149975-7-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 211 +++++++++++++++--------------------------- 1 file changed, 77 insertions(+), 134 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 615756d5d05d..4b8a26c60539 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -360,15 +360,14 @@ static bool page_same_filled(void *ptr, unsigned long *element) return true; } -static ssize_t initstate_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, + char *buf) { u32 val; struct zram *zram = dev_to_zram(dev); - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); val = init_done(zram); - up_read(&zram->init_lock); return sysfs_emit(buf, "%u\n", val); } @@ -382,7 +381,8 @@ static ssize_t disksize_show(struct device *dev, } static ssize_t mem_limit_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) + struct device_attribute *attr, const char *buf, + size_t len) { u64 limit; char *tmp; @@ -392,15 +392,15 @@ static ssize_t mem_limit_store(struct device *dev, if (buf == tmp) /* no chars parsed, invalid input */ return -EINVAL; - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; - up_write(&zram->init_lock); return len; } static ssize_t mem_used_max_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) + struct device_attribute *attr, + const char *buf, size_t len) { int err; unsigned long val; @@ -410,12 +410,11 @@ static ssize_t mem_used_max_store(struct device *dev, if (err || val != 0) return -EINVAL; - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); if (init_done(zram)) { atomic_long_set(&zram->stats.max_used_pages, zs_get_total_pages(zram->mem_pool)); } - up_read(&zram->init_lock); return len; } @@ -458,12 +457,11 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) } } -static ssize_t idle_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +static ssize_t idle_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); ktime_t cutoff_time = 0; - ssize_t rv = -EINVAL; if (!sysfs_streq(buf, "all")) { /* @@ -476,24 +474,19 @@ static ssize_t idle_store(struct device *dev, cutoff_time = ktime_sub(ktime_get_boottime(), ns_to_ktime(age_sec * NSEC_PER_SEC)); else - goto out; + return -EINVAL; } - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); if (!init_done(zram)) - goto out_unlock; + return -EINVAL; /* * A cutoff_time of 0 marks everything as idle, this is the * "all" behavior. 
*/ mark_idle(zram, cutoff_time); - rv = len; - -out_unlock: - up_read(&zram->init_lock); -out: - return rv; + return len; } #ifdef CONFIG_ZRAM_WRITEBACK @@ -546,13 +539,12 @@ static ssize_t bd_stat_show(struct device *dev, struct device_attribute *attr, struct zram *zram = dev_to_zram(dev); ssize_t ret; - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); ret = sysfs_emit(buf, "%8llu %8llu %8llu\n", FOUR_K((u64)atomic64_read(&zram->stats.bd_count)), FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)), FOUR_K((u64)atomic64_read(&zram->stats.bd_writes))); - up_read(&zram->init_lock); return ret; } @@ -567,14 +559,12 @@ static ssize_t writeback_compressed_store(struct device *dev, if (kstrtobool(buf, &val)) return -EINVAL; - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); return -EBUSY; } zram->wb_compressed = val; - up_write(&zram->init_lock); return len; } @@ -586,9 +576,8 @@ static ssize_t writeback_compressed_show(struct device *dev, bool val; struct zram *zram = dev_to_zram(dev); - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); val = zram->wb_compressed; - up_read(&zram->init_lock); return sysfs_emit(buf, "%d\n", val); } @@ -599,17 +588,14 @@ static ssize_t writeback_limit_enable_store(struct device *dev, { struct zram *zram = dev_to_zram(dev); u64 val; - ssize_t ret = -EINVAL; if (kstrtoull(buf, 10, &val)) - return ret; + return -EINVAL; - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); zram->wb_limit_enable = val; - up_write(&zram->init_lock); - ret = len; - return ret; + return len; } static ssize_t writeback_limit_enable_show(struct device *dev, @@ -619,9 +605,8 @@ static ssize_t writeback_limit_enable_show(struct device *dev, bool val; struct zram *zram = dev_to_zram(dev); - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); val = zram->wb_limit_enable; - up_read(&zram->init_lock); return sysfs_emit(buf, "%d\n", val); } @@ -632,10 +617,9 @@ static ssize_t writeback_limit_store(struct device *dev, { struct zram *zram = dev_to_zram(dev); u64 val; - ssize_t ret = -EINVAL; if (kstrtoull(buf, 10, &val)) - return ret; + return -EINVAL; /* * When the page size is greater than 4KB, if bd_wb_limit is set to @@ -647,12 +631,10 @@ static ssize_t writeback_limit_store(struct device *dev, */ val = rounddown(val, PAGE_SIZE / 4096); - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); zram->bd_wb_limit = val; - up_write(&zram->init_lock); - ret = len; - return ret; + return len; } static ssize_t writeback_limit_show(struct device *dev, @@ -661,9 +643,8 @@ static ssize_t writeback_limit_show(struct device *dev, u64 val; struct zram *zram = dev_to_zram(dev); - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); val = zram->bd_wb_limit; - up_read(&zram->init_lock); return sysfs_emit(buf, "%llu\n", val); } @@ -681,9 +662,8 @@ static ssize_t writeback_batch_size_store(struct device *dev, if (!val) return -EINVAL; - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); zram->wb_batch_size = val; - up_write(&zram->init_lock); return len; } @@ -695,9 +675,8 @@ static ssize_t writeback_batch_size_show(struct device *dev, u32 val; struct zram *zram = dev_to_zram(dev); - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); val = zram->wb_batch_size; - up_read(&zram->init_lock); return sysfs_emit(buf, "%u\n", val); } @@ -717,37 +696,33 @@ static void reset_bdev(struct zram *zram) } static ssize_t 
backing_dev_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { struct file *file; struct zram *zram = dev_to_zram(dev); char *p; ssize_t ret; - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); file = zram->backing_dev; if (!file) { memcpy(buf, "none\n", 5); - up_read(&zram->init_lock); return 5; } p = file_path(file, buf, PAGE_SIZE - 1); - if (IS_ERR(p)) { - ret = PTR_ERR(p); - goto out; - } + if (IS_ERR(p)) + return PTR_ERR(p); ret = strlen(p); memmove(buf, p, ret); buf[ret++] = '\n'; -out: - up_read(&zram->init_lock); return ret; } static ssize_t backing_dev_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) + struct device_attribute *attr, const char *buf, + size_t len) { char *file_name; size_t sz; @@ -762,7 +737,7 @@ static ssize_t backing_dev_store(struct device *dev, if (!file_name) return -ENOMEM; - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); if (init_done(zram)) { pr_info("Can't setup backing device for initialized device\n"); err = -EBUSY; @@ -810,7 +785,6 @@ static ssize_t backing_dev_store(struct device *dev, zram->backing_dev = backing_dev; zram->bitmap = bitmap; zram->nr_pages = nr_pages; - up_write(&zram->init_lock); pr_info("setup backing device %s\n", file_name); kfree(file_name); @@ -822,8 +796,6 @@ out: if (backing_dev) filp_close(backing_dev, NULL); - up_write(&zram->init_lock); - kfree(file_name); return err; @@ -1291,33 +1263,29 @@ static ssize_t writeback_store(struct device *dev, ssize_t ret = len; int err, mode = 0; - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); + if (!init_done(zram)) return -EINVAL; - } /* Do not permit concurrent post-processing actions. 
*/ - if (atomic_xchg(&zram->pp_in_progress, 1)) { - up_read(&zram->init_lock); + if (atomic_xchg(&zram->pp_in_progress, 1)) return -EAGAIN; - } if (!zram->backing_dev) { ret = -ENODEV; - goto release_init_lock; + goto out; } pp_ctl = init_pp_ctl(); if (!pp_ctl) { ret = -ENOMEM; - goto release_init_lock; + goto out; } wb_ctl = init_wb_ctl(zram); if (!wb_ctl) { ret = -ENOMEM; - goto release_init_lock; + goto out; } args = skip_spaces(buf); @@ -1341,7 +1309,7 @@ static ssize_t writeback_store(struct device *dev, err = parse_mode(param, &mode); if (err) { ret = err; - goto release_init_lock; + goto out; } scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); @@ -1352,7 +1320,7 @@ static ssize_t writeback_store(struct device *dev, err = parse_mode(val, &mode); if (err) { ret = err; - goto release_init_lock; + goto out; } scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); @@ -1363,7 +1331,7 @@ static ssize_t writeback_store(struct device *dev, err = parse_page_index(val, nr_pages, &lo, &hi); if (err) { ret = err; - goto release_init_lock; + goto out; } scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); @@ -1374,7 +1342,7 @@ static ssize_t writeback_store(struct device *dev, err = parse_page_indexes(val, nr_pages, &lo, &hi); if (err) { ret = err; - goto release_init_lock; + goto out; } scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); @@ -1386,11 +1354,10 @@ static ssize_t writeback_store(struct device *dev, if (err) ret = err; -release_init_lock: +out: release_pp_ctl(zram, pp_ctl); release_wb_ctl(wb_ctl); atomic_set(&zram->pp_in_progress, 0); - up_read(&zram->init_lock); return ret; } @@ -1608,9 +1575,8 @@ static ssize_t read_block_state(struct file *file, char __user *buf, if (!kbuf) return -ENOMEM; - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); if (!init_done(zram)) { - up_read(&zram->init_lock); kvfree(kbuf); return -EINVAL; } @@ -1646,7 +1612,6 @@ next: *ppos += 1; } - up_read(&zram->init_lock); if (copy_to_user(buf, kbuf, written)) written = -EFAULT; kvfree(kbuf); @@ -1713,16 +1678,14 @@ static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf) return -EINVAL; } - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); kfree(compressor); pr_info("Can't change algorithm for initialized device\n"); return -EBUSY; } comp_algorithm_set(zram, prio, compressor); - up_write(&zram->init_lock); return 0; } @@ -1843,9 +1806,8 @@ static ssize_t comp_algorithm_show(struct device *dev, struct zram *zram = dev_to_zram(dev); ssize_t sz; - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf, 0); - up_read(&zram->init_lock); return sz; } @@ -1870,7 +1832,7 @@ static ssize_t recomp_algorithm_show(struct device *dev, ssize_t sz = 0; u32 prio; - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { if (!zram->comp_algs[prio]) continue; @@ -1878,7 +1840,6 @@ static ssize_t recomp_algorithm_show(struct device *dev, sz += sysfs_emit_at(buf, sz, "#%d: ", prio); sz += zcomp_available_show(zram->comp_algs[prio], buf, sz); } - up_read(&zram->init_lock); return sz; } @@ -1924,42 +1885,38 @@ static ssize_t recomp_algorithm_store(struct device *dev, } #endif -static ssize_t compact_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +static ssize_t compact_store(struct device *dev, struct device_attribute 
*attr, + const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); + if (!init_done(zram)) return -EINVAL; - } zs_compact(zram->mem_pool); - up_read(&zram->init_lock); return len; } -static ssize_t io_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t io_stat_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct zram *zram = dev_to_zram(dev); ssize_t ret; - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); ret = sysfs_emit(buf, "%8llu %8llu 0 %8llu\n", (u64)atomic64_read(&zram->stats.failed_reads), (u64)atomic64_read(&zram->stats.failed_writes), (u64)atomic64_read(&zram->stats.notify_free)); - up_read(&zram->init_lock); return ret; } -static ssize_t mm_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t mm_stat_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct zram *zram = dev_to_zram(dev); struct zs_pool_stats pool_stats; @@ -1969,7 +1926,7 @@ static ssize_t mm_stat_show(struct device *dev, memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats)); - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); if (init_done(zram)) { mem_used = zs_get_total_pages(zram->mem_pool); zs_pool_stats(zram->mem_pool, &pool_stats); @@ -1989,7 +1946,6 @@ static ssize_t mm_stat_show(struct device *dev, atomic_long_read(&pool_stats.pages_compacted), (u64)atomic64_read(&zram->stats.huge_pages), (u64)atomic64_read(&zram->stats.huge_pages_since)); - up_read(&zram->init_lock); return ret; } @@ -2001,12 +1957,11 @@ static ssize_t debug_stat_show(struct device *dev, struct zram *zram = dev_to_zram(dev); ssize_t ret; - down_read(&zram->init_lock); + guard(rwsem_read)(&zram->init_lock); ret = sysfs_emit(buf, "version: %d\n0 %8llu\n", version, (u64)atomic64_read(&zram->stats.miss_free)); - up_read(&zram->init_lock); return ret; } @@ -2669,17 +2624,13 @@ static ssize_t recompress_store(struct device *dev, if (threshold >= huge_class_size) return -EINVAL; - down_read(&zram->init_lock); - if (!init_done(zram)) { - ret = -EINVAL; - goto release_init_lock; - } + guard(rwsem_read)(&zram->init_lock); + if (!init_done(zram)) + return -EINVAL; /* Do not permit concurrent post-processing actions. 
*/ - if (atomic_xchg(&zram->pp_in_progress, 1)) { - up_read(&zram->init_lock); + if (atomic_xchg(&zram->pp_in_progress, 1)) return -EAGAIN; - } if (algo) { bool found = false; @@ -2697,26 +2648,26 @@ static ssize_t recompress_store(struct device *dev, if (!found) { ret = -EINVAL; - goto release_init_lock; + goto out; } } prio_max = min(prio_max, (u32)zram->num_active_comps); if (prio >= prio_max) { ret = -EINVAL; - goto release_init_lock; + goto out; } page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; - goto release_init_lock; + goto out; } ctl = init_pp_ctl(); if (!ctl) { ret = -ENOMEM; - goto release_init_lock; + goto out; } scan_slots_for_recompress(zram, mode, prio_max, ctl); @@ -2747,12 +2698,11 @@ next: cond_resched(); } -release_init_lock: +out: if (page) __free_page(page); release_pp_ctl(zram, ctl); atomic_set(&zram->pp_in_progress, 0); - up_read(&zram->init_lock); return ret; } #endif @@ -2931,7 +2881,7 @@ static void zram_destroy_comps(struct zram *zram) static void zram_reset_device(struct zram *zram) { - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); zram->limit_pages = 0; @@ -2947,11 +2897,10 @@ static void zram_reset_device(struct zram *zram) reset_bdev(zram); comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); - up_write(&zram->init_lock); } -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) { u64 disksize; struct zcomp *comp; @@ -2963,18 +2912,15 @@ static ssize_t disksize_store(struct device *dev, if (!disksize) return -EINVAL; - down_write(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); if (init_done(zram)) { pr_info("Cannot change disksize for initialized device\n"); - err = -EBUSY; - goto out_unlock; + return -EBUSY; } disksize = PAGE_ALIGN(disksize); - if (!zram_meta_alloc(zram, disksize)) { - err = -ENOMEM; - goto out_unlock; - } + if (!zram_meta_alloc(zram, disksize)) + return -ENOMEM; for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { if (!zram->comp_algs[prio]) @@ -2994,15 +2940,12 @@ static ssize_t disksize_store(struct device *dev, } zram->disksize = disksize; set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); return len; out_free_comps: zram_destroy_comps(zram); zram_meta_free(zram, disksize); -out_unlock: - up_write(&zram->init_lock); return err; } -- cgit v1.2.3 From 0327a862135b0b0b5e67f1434468326b733562bf Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 1 Dec 2025 18:47:54 +0900 Subject: zram: consolidate device-attr declarations Do not spread device attributes declarations across the file, move io_stat, mm_stat, debug_stat to a common device-attr section. 
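The move is purely mechanical because the declarations are just macro expansions. As a hedged illustration of the sysfs pattern involved (editor's sketch with made-up names foo and example_attrs, not taken from the driver): DEVICE_ATTR_RO(foo) only instantiates a struct device_attribute named dev_attr_foo bound to foo_show(), so all such declarations can sit in one common section as long as that section follows the handlers they reference.

static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
                        char *buf)
{
        return sysfs_emit(buf, "%d\n", 42);
}
static DEVICE_ATTR_RO(foo);             /* defines dev_attr_foo */

static struct attribute *example_attrs[] = {
        &dev_attr_foo.attr,
        NULL,
};
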
Link: https://lkml.kernel.org/r/20251201094754.4149975-8-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4b8a26c60539..67a9e7c005c3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1966,10 +1966,6 @@ static ssize_t debug_stat_show(struct device *dev, return ret; } -static DEVICE_ATTR_RO(io_stat); -static DEVICE_ATTR_RO(mm_stat); -static DEVICE_ATTR_RO(debug_stat); - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -3008,6 +3004,9 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; +static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); +static DEVICE_ATTR_RO(debug_stat); static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); -- cgit v1.2.3 From 2e8ff2f51dde73a26b94aed2df4827177bd25e6e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 15 Dec 2025 14:47:11 +0900 Subject: zram: use u32 for entry ac_time tracking We can reduce sizeof(zram_table_entry) on 64-bit systems by converting flags and ac_time to u32. Entry flags fit into u32, and for ac_time u32 gives us over a century of entry lifespan (approx 136 years) which is plenty (zram uses system boot time (seconds)). In struct zram_table_entry we use bytes aliasing, because bit-wait API (for slot lock) requires a whole unsigned long word. Link: https://lkml.kernel.org/r/d7c0b48450c70eeb5fd8acd6ecd23593f30dbf1f.1765775954.git.senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: David Stevens Cc: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 60 +++++++++++++++++++++---------------------- drivers/block/zram/zram_drv.h | 9 +++++-- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 67a9e7c005c3..65f99ff3e2e5 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -81,7 +81,7 @@ static void zram_slot_lock_init(struct zram *zram, u32 index) */ static __must_check bool zram_slot_trylock(struct zram *zram, u32 index) { - unsigned long *lock = &zram->table[index].flags; + unsigned long *lock = &zram->table[index].__lock; if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) { mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_); @@ -94,7 +94,7 @@ static __must_check bool zram_slot_trylock(struct zram *zram, u32 index) static void zram_slot_lock(struct zram *zram, u32 index) { - unsigned long *lock = &zram->table[index].flags; + unsigned long *lock = &zram->table[index].__lock; mutex_acquire(slot_dep_map(zram, index), 0, 0, _RET_IP_); wait_on_bit_lock(lock, ZRAM_ENTRY_LOCK, TASK_UNINTERRUPTIBLE); @@ -103,7 +103,7 @@ static void zram_slot_lock(struct zram *zram, u32 index) static void zram_slot_unlock(struct zram *zram, u32 index) { - unsigned long *lock = &zram->table[index].flags; + unsigned long *lock = &zram->table[index].__lock; mutex_release(slot_dep_map(zram, index), _RET_IP_); clear_and_wake_up_bit(ZRAM_ENTRY_LOCK, lock); @@ -130,34 +130,33 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) } static bool zram_test_flag(struct zram *zram, u32 index, - enum 
zram_pageflags flag) + enum zram_pageflags flag) { - return zram->table[index].flags & BIT(flag); + return zram->table[index].attr.flags & BIT(flag); } static void zram_set_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) + enum zram_pageflags flag) { - zram->table[index].flags |= BIT(flag); + zram->table[index].attr.flags |= BIT(flag); } static void zram_clear_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) + enum zram_pageflags flag) { - zram->table[index].flags &= ~BIT(flag); + zram->table[index].attr.flags &= ~BIT(flag); } static size_t zram_get_obj_size(struct zram *zram, u32 index) { - return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); + return zram->table[index].attr.flags & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static void zram_set_obj_size(struct zram *zram, - u32 index, size_t size) +static void zram_set_obj_size(struct zram *zram, u32 index, size_t size) { - unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; + unsigned long flags = zram->table[index].attr.flags >> ZRAM_FLAG_SHIFT; - zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; + zram->table[index].attr.flags = (flags << ZRAM_FLAG_SHIFT) | size; } static inline bool zram_allocated(struct zram *zram, u32 index) @@ -208,14 +207,14 @@ static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio) * Clear previous priority value first, in case if we recompress * further an already recompressed page */ - zram->table[index].flags &= ~(ZRAM_COMP_PRIORITY_MASK << - ZRAM_COMP_PRIORITY_BIT1); - zram->table[index].flags |= (prio << ZRAM_COMP_PRIORITY_BIT1); + zram->table[index].attr.flags &= ~(ZRAM_COMP_PRIORITY_MASK << + ZRAM_COMP_PRIORITY_BIT1); + zram->table[index].attr.flags |= (prio << ZRAM_COMP_PRIORITY_BIT1); } static inline u32 zram_get_priority(struct zram *zram, u32 index) { - u32 prio = zram->table[index].flags >> ZRAM_COMP_PRIORITY_BIT1; + u32 prio = zram->table[index].attr.flags >> ZRAM_COMP_PRIORITY_BIT1; return prio & ZRAM_COMP_PRIORITY_MASK; } @@ -225,7 +224,7 @@ static void zram_accessed(struct zram *zram, u32 index) zram_clear_flag(zram, index, ZRAM_IDLE); zram_clear_flag(zram, index, ZRAM_PP_SLOT); #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME - zram->table[index].ac_time = ktime_get_boottime(); + zram->table[index].attr.ac_time = (u32)ktime_get_boottime_seconds(); #endif } @@ -447,7 +446,7 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME is_idle = !cutoff || - ktime_after(cutoff, zram->table[index].ac_time); + ktime_after(cutoff, zram->table[index].attr.ac_time); #endif if (is_idle) zram_set_flag(zram, index, ZRAM_IDLE); @@ -461,18 +460,19 @@ static ssize_t idle_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); - ktime_t cutoff_time = 0; + ktime_t cutoff = 0; if (!sysfs_streq(buf, "all")) { /* * If it did not parse as 'all' try to treat it as an integer * when we have memory tracking enabled. 
*/ - u64 age_sec; + u32 age_sec; - if (IS_ENABLED(CONFIG_ZRAM_TRACK_ENTRY_ACTIME) && !kstrtoull(buf, 0, &age_sec)) - cutoff_time = ktime_sub(ktime_get_boottime(), - ns_to_ktime(age_sec * NSEC_PER_SEC)); + if (IS_ENABLED(CONFIG_ZRAM_TRACK_ENTRY_ACTIME) && + !kstrtouint(buf, 0, &age_sec)) + cutoff = ktime_sub((u32)ktime_get_boottime_seconds(), + age_sec); else return -EINVAL; } @@ -482,10 +482,10 @@ static ssize_t idle_store(struct device *dev, struct device_attribute *attr, return -EINVAL; /* - * A cutoff_time of 0 marks everything as idle, this is the + * A cutoff of 0 marks everything as idle, this is the * "all" behavior. */ - mark_idle(zram, cutoff_time); + mark_idle(zram, cutoff); return len; } @@ -1588,7 +1588,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf, if (!zram_allocated(zram, index)) goto next; - ts = ktime_to_timespec64(zram->table[index].ac_time); + ts = ktime_to_timespec64(zram->table[index].attr.ac_time); copied = snprintf(kbuf + written, count, "%12zd %12lld.%06lu %c%c%c%c%c%c\n", index, (s64)ts.tv_sec, @@ -2013,7 +2013,7 @@ static void zram_slot_free(struct zram *zram, u32 index) unsigned long handle; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME - zram->table[index].ac_time = 0; + zram->table[index].attr.ac_time = 0; #endif zram_clear_flag(zram, index, ZRAM_IDLE); @@ -3286,7 +3286,7 @@ static int __init zram_init(void) struct zram_table_entry zram_te; int ret; - BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8); + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.attr.flags) * 8); ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 72fdf66c78ab..48d6861c6647 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -65,10 +65,15 @@ enum zram_pageflags { */ struct zram_table_entry { unsigned long handle; - unsigned long flags; + union { + unsigned long __lock; + struct attr { + u32 flags; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME - ktime_t ac_time; + u32 ac_time; #endif + } attr; + }; struct lockdep_map dep_map; }; -- cgit v1.2.3 From bde60fe747216d3449a1a74f07937a5273717b69 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 15 Dec 2025 14:47:12 +0900 Subject: zram: rename internal slot API We have a somewhat confusing internal API naming. E.g. the following code: zram_slot_lock() if (zram_allocated()) zram_set_flag() zram_slot_unlock() may look like it does something on zram device level, but in fact it tests and sets slot entry flags, not the device ones. Rename API to explicitly distinguish functions that operate on the slot level from functions that operate on the zram device level. While at it, fixup some coding styles. 
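For quick orientation, an editor's summary of the main renames (derived from the diff below, not an addition to it):

        zram_slot_lock()/zram_slot_unlock()  ->  slot_lock()/slot_unlock()
        zram_slot_trylock()                  ->  slot_trylock()
        zram_test/set/clear_flag()           ->  test/set/clear_slot_flag()
        zram_get/set_handle()                ->  get/set_slot_handle()
        zram_get/set_obj_size()              ->  get/set_slot_size()
        zram_get/set_priority()              ->  get/set_slot_comp_priority()
        zram_allocated()                     ->  slot_allocated()
        zram_accessed()                      ->  mark_slot_accessed()
        zram_slot_free()                     ->  slot_free()

so the example above now reads:

        slot_lock()
        if (slot_allocated())
                set_slot_flag()
        slot_unlock()
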
[senozhatsky@chromium.org: fix up mark_slot_accessed()] Link: https://lkml.kernel.org/r/20260115031922.3813659-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/775a0b1a0ace5caf1f05965d8bc637c1192820fa.1765775954.git.senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 363 +++++++++++++++++++++--------------------- 1 file changed, 182 insertions(+), 181 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 65f99ff3e2e5..bd9a37fca675 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -56,10 +56,10 @@ static size_t huge_class_size; static const struct block_device_operations zram_devops; -static void zram_slot_free(struct zram *zram, u32 index); +static void slot_free(struct zram *zram, u32 index); #define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map) -static void zram_slot_lock_init(struct zram *zram, u32 index) +static void slot_lock_init(struct zram *zram, u32 index) { static struct lock_class_key __key; @@ -79,7 +79,7 @@ static void zram_slot_lock_init(struct zram *zram, u32 index) * 4) Use TRY lock variant when in atomic context * - must check return value and handle locking failers */ -static __must_check bool zram_slot_trylock(struct zram *zram, u32 index) +static __must_check bool slot_trylock(struct zram *zram, u32 index) { unsigned long *lock = &zram->table[index].__lock; @@ -92,7 +92,7 @@ static __must_check bool zram_slot_trylock(struct zram *zram, u32 index) return false; } -static void zram_slot_lock(struct zram *zram, u32 index) +static void slot_lock(struct zram *zram, u32 index) { unsigned long *lock = &zram->table[index].__lock; @@ -101,7 +101,7 @@ static void zram_slot_lock(struct zram *zram, u32 index) lock_acquired(slot_dep_map(zram, index), _RET_IP_); } -static void zram_slot_unlock(struct zram *zram, u32 index) +static void slot_unlock(struct zram *zram, u32 index) { unsigned long *lock = &zram->table[index].__lock; @@ -119,51 +119,80 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static unsigned long zram_get_handle(struct zram *zram, u32 index) +static unsigned long get_slot_handle(struct zram *zram, u32 index) { return zram->table[index].handle; } -static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +static void set_slot_handle(struct zram *zram, u32 index, unsigned long handle) { zram->table[index].handle = handle; } -static bool zram_test_flag(struct zram *zram, u32 index, +static bool test_slot_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { return zram->table[index].attr.flags & BIT(flag); } -static void zram_set_flag(struct zram *zram, u32 index, +static void set_slot_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { zram->table[index].attr.flags |= BIT(flag); } -static void zram_clear_flag(struct zram *zram, u32 index, +static void clear_slot_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { zram->table[index].attr.flags &= ~BIT(flag); } -static size_t zram_get_obj_size(struct zram *zram, u32 index) +static size_t get_slot_size(struct zram *zram, u32 index) { return zram->table[index].attr.flags & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +static void set_slot_size(struct zram *zram, u32 index, size_t size) { unsigned long flags = 
zram->table[index].attr.flags >> ZRAM_FLAG_SHIFT; zram->table[index].attr.flags = (flags << ZRAM_FLAG_SHIFT) | size; } -static inline bool zram_allocated(struct zram *zram, u32 index) +static inline bool slot_allocated(struct zram *zram, u32 index) { - return zram_get_obj_size(zram, index) || - zram_test_flag(zram, index, ZRAM_SAME) || - zram_test_flag(zram, index, ZRAM_WB); + return get_slot_size(zram, index) || + test_slot_flag(zram, index, ZRAM_SAME) || + test_slot_flag(zram, index, ZRAM_WB); +} + +static inline void set_slot_comp_priority(struct zram *zram, u32 index, + u32 prio) +{ + prio &= ZRAM_COMP_PRIORITY_MASK; + /* + * Clear previous priority value first, in case if we recompress + * further an already recompressed page + */ + zram->table[index].attr.flags &= ~(ZRAM_COMP_PRIORITY_MASK << + ZRAM_COMP_PRIORITY_BIT1); + zram->table[index].attr.flags |= (prio << ZRAM_COMP_PRIORITY_BIT1); +} + +static inline u32 get_slot_comp_priority(struct zram *zram, u32 index) +{ + u32 prio = zram->table[index].attr.flags >> ZRAM_COMP_PRIORITY_BIT1; + + return prio & ZRAM_COMP_PRIORITY_MASK; +} + +static void mark_slot_accessed(struct zram *zram, u32 index) +{ + clear_slot_flag(zram, index, ZRAM_IDLE); + clear_slot_flag(zram, index, ZRAM_PP_SLOT); +#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME + zram->table[index].attr.ac_time = (u32)ktime_get_boottime_seconds(); +#endif } static inline void update_used_max(struct zram *zram, const unsigned long pages) @@ -200,34 +229,6 @@ static inline bool is_partial_io(struct bio_vec *bvec) } #endif -static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio) -{ - prio &= ZRAM_COMP_PRIORITY_MASK; - /* - * Clear previous priority value first, in case if we recompress - * further an already recompressed page - */ - zram->table[index].attr.flags &= ~(ZRAM_COMP_PRIORITY_MASK << - ZRAM_COMP_PRIORITY_BIT1); - zram->table[index].attr.flags |= (prio << ZRAM_COMP_PRIORITY_BIT1); -} - -static inline u32 zram_get_priority(struct zram *zram, u32 index) -{ - u32 prio = zram->table[index].attr.flags >> ZRAM_COMP_PRIORITY_BIT1; - - return prio & ZRAM_COMP_PRIORITY_MASK; -} - -static void zram_accessed(struct zram *zram, u32 index) -{ - zram_clear_flag(zram, index, ZRAM_IDLE); - zram_clear_flag(zram, index, ZRAM_PP_SLOT); -#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME - zram->table[index].attr.ac_time = (u32)ktime_get_boottime_seconds(); -#endif -} - #if defined CONFIG_ZRAM_WRITEBACK || defined CONFIG_ZRAM_MULTI_COMP struct zram_pp_slot { unsigned long index; @@ -263,9 +264,9 @@ static void release_pp_slot(struct zram *zram, struct zram_pp_slot *pps) { list_del_init(&pps->entry); - zram_slot_lock(zram, pps->index); - zram_clear_flag(zram, pps->index, ZRAM_PP_SLOT); - zram_slot_unlock(zram, pps->index); + slot_lock(zram, pps->index); + clear_slot_flag(zram, pps->index, ZRAM_PP_SLOT); + slot_unlock(zram, pps->index); kfree(pps); } @@ -304,10 +305,10 @@ static bool place_pp_slot(struct zram *zram, struct zram_pp_ctl *ctl, INIT_LIST_HEAD(&pps->entry); pps->index = index; - bid = zram_get_obj_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE; + bid = get_slot_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE; list_add(&pps->entry, &ctl->pp_buckets[bid]); - zram_set_flag(zram, pps->index, ZRAM_PP_SLOT); + set_slot_flag(zram, pps->index, ZRAM_PP_SLOT); return true; } @@ -436,11 +437,11 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) * * And ZRAM_WB slots simply cannot be ZRAM_IDLE. 
*/ - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index) || - zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_SAME)) { - zram_slot_unlock(zram, index); + slot_lock(zram, index); + if (!slot_allocated(zram, index) || + test_slot_flag(zram, index, ZRAM_WB) || + test_slot_flag(zram, index, ZRAM_SAME)) { + slot_unlock(zram, index); continue; } @@ -449,10 +450,10 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) ktime_after(cutoff, zram->table[index].attr.ac_time); #endif if (is_idle) - zram_set_flag(zram, index, ZRAM_IDLE); + set_slot_flag(zram, index, ZRAM_IDLE); else - zram_clear_flag(zram, index, ZRAM_IDLE); - zram_slot_unlock(zram, index); + clear_slot_flag(zram, index, ZRAM_IDLE); + slot_unlock(zram, index); } } @@ -933,7 +934,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) } atomic64_inc(&zram->stats.bd_writes); - zram_slot_lock(zram, index); + slot_lock(zram, index); /* * We release slot lock during writeback so slot can change under us: * slot_free() or slot_free() and zram_write_page(). In both cases @@ -941,7 +942,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) * set ZRAM_PP_SLOT on such slots until current post-processing * finishes. */ - if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) { + if (!test_slot_flag(zram, index, ZRAM_PP_SLOT)) { zram_release_bdev_block(zram, req->blk_idx); goto out; } @@ -951,26 +952,26 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) * ZRAM_WB slots get freed, we need to preserve data required * for read decompression. */ - size = zram_get_obj_size(zram, index); - prio = zram_get_priority(zram, index); - huge = zram_test_flag(zram, index, ZRAM_HUGE); + size = get_slot_size(zram, index); + prio = get_slot_comp_priority(zram, index); + huge = test_slot_flag(zram, index, ZRAM_HUGE); } - zram_slot_free(zram, index); - zram_set_flag(zram, index, ZRAM_WB); - zram_set_handle(zram, index, req->blk_idx); + slot_free(zram, index); + set_slot_flag(zram, index, ZRAM_WB); + set_slot_handle(zram, index, req->blk_idx); if (zram->wb_compressed) { if (huge) - zram_set_flag(zram, index, ZRAM_HUGE); - zram_set_obj_size(zram, index, size); - zram_set_priority(zram, index, prio); + set_slot_flag(zram, index, ZRAM_HUGE); + set_slot_size(zram, index, size); + set_slot_comp_priority(zram, index, prio); } atomic64_inc(&zram->stats.pages_stored); out: - zram_slot_unlock(zram, index); + slot_unlock(zram, index); return 0; } @@ -1091,14 +1092,14 @@ static int zram_writeback_slots(struct zram *zram, } index = pps->index; - zram_slot_lock(zram, index); + slot_lock(zram, index); /* * scan_slots() sets ZRAM_PP_SLOT and releases slot lock, so * slots can change in the meantime. If slots are accessed or * freed they lose ZRAM_PP_SLOT flag and hence we don't * post-process them. 
*/ - if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) + if (!test_slot_flag(zram, index, ZRAM_PP_SLOT)) goto next; if (zram->wb_compressed) err = read_from_zspool_raw(zram, req->page, index); @@ -1106,7 +1107,7 @@ static int zram_writeback_slots(struct zram *zram, err = read_from_zspool(zram, req->page, index); if (err) goto next; - zram_slot_unlock(zram, index); + slot_unlock(zram, index); /* * From now on pp-slot is owned by the req, remove it from @@ -1128,7 +1129,7 @@ static int zram_writeback_slots(struct zram *zram, continue; next: - zram_slot_unlock(zram, index); + slot_unlock(zram, index); release_pp_slot(zram, pps); } @@ -1221,27 +1222,27 @@ static int scan_slots_for_writeback(struct zram *zram, u32 mode, while (index < hi) { bool ok = true; - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) + slot_lock(zram, index); + if (!slot_allocated(zram, index)) goto next; - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_SAME)) + if (test_slot_flag(zram, index, ZRAM_WB) || + test_slot_flag(zram, index, ZRAM_SAME)) goto next; if (mode & IDLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_IDLE)) + !test_slot_flag(zram, index, ZRAM_IDLE)) goto next; if (mode & HUGE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_HUGE)) + !test_slot_flag(zram, index, ZRAM_HUGE)) goto next; if (mode & INCOMPRESSIBLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + !test_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE)) goto next; ok = place_pp_slot(zram, ctl, index); next: - zram_slot_unlock(zram, index); + slot_unlock(zram, index); if (!ok) break; index++; @@ -1369,22 +1370,22 @@ static int decompress_bdev_page(struct zram *zram, struct page *page, u32 index) int ret, prio; void *src; - zram_slot_lock(zram, index); + slot_lock(zram, index); /* Since slot was unlocked we need to make sure it's still ZRAM_WB */ - if (!zram_test_flag(zram, index, ZRAM_WB)) { - zram_slot_unlock(zram, index); + if (!test_slot_flag(zram, index, ZRAM_WB)) { + slot_unlock(zram, index); /* We read some stale data, zero it out */ memset_page(page, 0, 0, PAGE_SIZE); return -EIO; } - if (zram_test_flag(zram, index, ZRAM_HUGE)) { - zram_slot_unlock(zram, index); + if (test_slot_flag(zram, index, ZRAM_HUGE)) { + slot_unlock(zram, index); return 0; } - size = zram_get_obj_size(zram, index); - prio = zram_get_priority(zram, index); + size = get_slot_size(zram, index); + prio = get_slot_comp_priority(zram, index); zstrm = zcomp_stream_get(zram->comps[prio]); src = kmap_local_page(page); @@ -1394,7 +1395,7 @@ static int decompress_bdev_page(struct zram *zram, struct page *page, u32 index) copy_page(src, zstrm->local_copy); kunmap_local(src); zcomp_stream_put(zstrm); - zram_slot_unlock(zram, index); + slot_unlock(zram, index); return ret; } @@ -1584,8 +1585,8 @@ static ssize_t read_block_state(struct file *file, char __user *buf, for (index = *ppos; index < nr_pages; index++) { int copied; - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) + slot_lock(zram, index); + if (!slot_allocated(zram, index)) goto next; ts = ktime_to_timespec64(zram->table[index].attr.ac_time); @@ -1593,22 +1594,22 @@ static ssize_t read_block_state(struct file *file, char __user *buf, "%12zd %12lld.%06lu %c%c%c%c%c%c\n", index, (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC, - zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', - zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', - zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', - zram_test_flag(zram, index, ZRAM_IDLE) ? 
'i' : '.', - zram_get_priority(zram, index) ? 'r' : '.', - zram_test_flag(zram, index, + test_slot_flag(zram, index, ZRAM_SAME) ? 's' : '.', + test_slot_flag(zram, index, ZRAM_WB) ? 'w' : '.', + test_slot_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', + test_slot_flag(zram, index, ZRAM_IDLE) ? 'i' : '.', + get_slot_comp_priority(zram, index) ? 'r' : '.', + test_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE) ? 'n' : '.'); if (count <= copied) { - zram_slot_unlock(zram, index); + slot_unlock(zram, index); break; } written += copied; count -= copied; next: - zram_slot_unlock(zram, index); + slot_unlock(zram, index); *ppos += 1; } @@ -1976,7 +1977,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) - zram_slot_free(zram, index); + slot_free(zram, index); zs_destroy_pool(zram->mem_pool); vfree(zram->table); @@ -2003,12 +2004,12 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) huge_class_size = zs_huge_class_size(zram->mem_pool); for (index = 0; index < num_pages; index++) - zram_slot_lock_init(zram, index); + slot_lock_init(zram, index); return true; } -static void zram_slot_free(struct zram *zram, u32 index) +static void slot_free(struct zram *zram, u32 index) { unsigned long handle; @@ -2016,19 +2017,19 @@ static void zram_slot_free(struct zram *zram, u32 index) zram->table[index].attr.ac_time = 0; #endif - zram_clear_flag(zram, index, ZRAM_IDLE); - zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE); - zram_clear_flag(zram, index, ZRAM_PP_SLOT); - zram_set_priority(zram, index, 0); + clear_slot_flag(zram, index, ZRAM_IDLE); + clear_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE); + clear_slot_flag(zram, index, ZRAM_PP_SLOT); + set_slot_comp_priority(zram, index, 0); - if (zram_test_flag(zram, index, ZRAM_HUGE)) { - zram_clear_flag(zram, index, ZRAM_HUGE); + if (test_slot_flag(zram, index, ZRAM_HUGE)) { + clear_slot_flag(zram, index, ZRAM_HUGE); atomic64_dec(&zram->stats.huge_pages); } - if (zram_test_flag(zram, index, ZRAM_WB)) { - zram_clear_flag(zram, index, ZRAM_WB); - zram_release_bdev_block(zram, zram_get_handle(zram, index)); + if (test_slot_flag(zram, index, ZRAM_WB)) { + clear_slot_flag(zram, index, ZRAM_WB); + zram_release_bdev_block(zram, get_slot_handle(zram, index)); goto out; } @@ -2036,24 +2037,24 @@ static void zram_slot_free(struct zram *zram, u32 index) * No memory is allocated for same element filled pages. * Simply clear same page flag. 
*/ - if (zram_test_flag(zram, index, ZRAM_SAME)) { - zram_clear_flag(zram, index, ZRAM_SAME); + if (test_slot_flag(zram, index, ZRAM_SAME)) { + clear_slot_flag(zram, index, ZRAM_SAME); atomic64_dec(&zram->stats.same_pages); goto out; } - handle = zram_get_handle(zram, index); + handle = get_slot_handle(zram, index); if (!handle) return; zs_free(zram->mem_pool, handle); - atomic64_sub(zram_get_obj_size(zram, index), + atomic64_sub(get_slot_size(zram, index), &zram->stats.compr_data_size); out: atomic64_dec(&zram->stats.pages_stored); - zram_set_handle(zram, index, 0); - zram_set_obj_size(zram, index, 0); + set_slot_handle(zram, index, 0); + set_slot_size(zram, index, 0); } static int read_same_filled_page(struct zram *zram, struct page *page, @@ -2062,7 +2063,7 @@ static int read_same_filled_page(struct zram *zram, struct page *page, void *mem; mem = kmap_local_page(page); - zram_fill_page(mem, PAGE_SIZE, zram_get_handle(zram, index)); + zram_fill_page(mem, PAGE_SIZE, get_slot_handle(zram, index)); kunmap_local(mem); return 0; } @@ -2073,7 +2074,7 @@ static int read_incompressible_page(struct zram *zram, struct page *page, unsigned long handle; void *src, *dst; - handle = zram_get_handle(zram, index); + handle = get_slot_handle(zram, index); src = zs_obj_read_begin(zram->mem_pool, handle, NULL); dst = kmap_local_page(page); copy_page(dst, src); @@ -2091,9 +2092,9 @@ static int read_compressed_page(struct zram *zram, struct page *page, u32 index) void *src, *dst; int ret, prio; - handle = zram_get_handle(zram, index); - size = zram_get_obj_size(zram, index); - prio = zram_get_priority(zram, index); + handle = get_slot_handle(zram, index); + size = get_slot_size(zram, index); + prio = get_slot_comp_priority(zram, index); zstrm = zcomp_stream_get(zram->comps[prio]); src = zs_obj_read_begin(zram->mem_pool, handle, zstrm->local_copy); @@ -2114,8 +2115,8 @@ static int read_from_zspool_raw(struct zram *zram, struct page *page, u32 index) unsigned int size; void *src; - handle = zram_get_handle(zram, index); - size = zram_get_obj_size(zram, index); + handle = get_slot_handle(zram, index); + size = get_slot_size(zram, index); /* * We need to get stream just for ->local_copy buffer, in @@ -2138,11 +2139,11 @@ static int read_from_zspool_raw(struct zram *zram, struct page *page, u32 index) */ static int read_from_zspool(struct zram *zram, struct page *page, u32 index) { - if (zram_test_flag(zram, index, ZRAM_SAME) || - !zram_get_handle(zram, index)) + if (test_slot_flag(zram, index, ZRAM_SAME) || + !get_slot_handle(zram, index)) return read_same_filled_page(zram, page, index); - if (!zram_test_flag(zram, index, ZRAM_HUGE)) + if (!test_slot_flag(zram, index, ZRAM_HUGE)) return read_compressed_page(zram, page, index); else return read_incompressible_page(zram, page, index); @@ -2153,19 +2154,19 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, { int ret; - zram_slot_lock(zram, index); - if (!zram_test_flag(zram, index, ZRAM_WB)) { + slot_lock(zram, index); + if (!test_slot_flag(zram, index, ZRAM_WB)) { /* Slot should be locked through out the function call */ ret = read_from_zspool(zram, page, index); - zram_slot_unlock(zram, index); + slot_unlock(zram, index); } else { - unsigned long blk_idx = zram_get_handle(zram, index); + unsigned long blk_idx = get_slot_handle(zram, index); /* * The slot should be unlocked before reading from the backing * device. 
*/ - zram_slot_unlock(zram, index); + slot_unlock(zram, index); ret = read_from_bdev(zram, page, index, blk_idx, parent); } @@ -2206,11 +2207,11 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, static int write_same_filled_page(struct zram *zram, unsigned long fill, u32 index) { - zram_slot_lock(zram, index); - zram_slot_free(zram, index); - zram_set_flag(zram, index, ZRAM_SAME); - zram_set_handle(zram, index, fill); - zram_slot_unlock(zram, index); + slot_lock(zram, index); + slot_free(zram, index); + set_slot_flag(zram, index, ZRAM_SAME); + set_slot_handle(zram, index, fill); + slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); atomic64_inc(&zram->stats.pages_stored); @@ -2244,12 +2245,12 @@ static int write_incompressible_page(struct zram *zram, struct page *page, zs_obj_write(zram->mem_pool, handle, src, PAGE_SIZE); kunmap_local(src); - zram_slot_lock(zram, index); - zram_slot_free(zram, index); - zram_set_flag(zram, index, ZRAM_HUGE); - zram_set_handle(zram, index, handle); - zram_set_obj_size(zram, index, PAGE_SIZE); - zram_slot_unlock(zram, index); + slot_lock(zram, index); + slot_free(zram, index); + set_slot_flag(zram, index, ZRAM_HUGE); + set_slot_handle(zram, index, handle); + set_slot_size(zram, index, PAGE_SIZE); + slot_unlock(zram, index); atomic64_add(PAGE_SIZE, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.huge_pages); @@ -2309,11 +2310,11 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) zs_obj_write(zram->mem_pool, handle, zstrm->buffer, comp_len); zcomp_stream_put(zstrm); - zram_slot_lock(zram, index); - zram_slot_free(zram, index); - zram_set_handle(zram, index, handle); - zram_set_obj_size(zram, index, comp_len); - zram_slot_unlock(zram, index); + slot_lock(zram, index); + slot_free(zram, index); + set_slot_handle(zram, index, handle); + set_slot_size(zram, index, comp_len); + slot_unlock(zram, index); /* Update stats */ atomic64_inc(&zram->stats.pages_stored); @@ -2364,30 +2365,30 @@ static int scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio_max, for (index = 0; index < nr_pages; index++) { bool ok = true; - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) + slot_lock(zram, index); + if (!slot_allocated(zram, index)) goto next; if (mode & RECOMPRESS_IDLE && - !zram_test_flag(zram, index, ZRAM_IDLE)) + !test_slot_flag(zram, index, ZRAM_IDLE)) goto next; if (mode & RECOMPRESS_HUGE && - !zram_test_flag(zram, index, ZRAM_HUGE)) + !test_slot_flag(zram, index, ZRAM_HUGE)) goto next; - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_SAME) || - zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + if (test_slot_flag(zram, index, ZRAM_WB) || + test_slot_flag(zram, index, ZRAM_SAME) || + test_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE)) goto next; /* Already compressed with same of higher priority */ - if (zram_get_priority(zram, index) + 1 >= prio_max) + if (get_slot_comp_priority(zram, index) + 1 >= prio_max) goto next; ok = place_pp_slot(zram, ctl, index); next: - zram_slot_unlock(zram, index); + slot_unlock(zram, index); if (!ok) break; } @@ -2416,11 +2417,11 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, void *src; int ret = 0; - handle_old = zram_get_handle(zram, index); + handle_old = get_slot_handle(zram, index); if (!handle_old) return -EINVAL; - comp_len_old = zram_get_obj_size(zram, index); + comp_len_old = get_slot_size(zram, index); /* * Do not recompress objects that are already "small enough". 
*/ @@ -2436,11 +2437,11 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, * we don't preserve IDLE flag and don't incorrectly pick this entry * for different post-processing type (e.g. writeback). */ - zram_clear_flag(zram, index, ZRAM_IDLE); + clear_slot_flag(zram, index, ZRAM_IDLE); class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old); - prio = max(prio, zram_get_priority(zram, index) + 1); + prio = max(prio, get_slot_comp_priority(zram, index) + 1); /* * Recompression slots scan should not select slots that are * already compressed with a higher priority algorithm, but @@ -2507,7 +2508,7 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, */ if (prio < zram->num_active_comps) return 0; - zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); + set_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE); return 0; } @@ -2532,10 +2533,10 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, zs_obj_write(zram->mem_pool, handle_new, zstrm->buffer, comp_len_new); zcomp_stream_put(zstrm); - zram_slot_free(zram, index); - zram_set_handle(zram, index, handle_new); - zram_set_obj_size(zram, index, comp_len_new); - zram_set_priority(zram, index, prio); + slot_free(zram, index); + set_slot_handle(zram, index, handle_new); + set_slot_size(zram, index, comp_len_new); + set_slot_comp_priority(zram, index, prio); atomic64_add(comp_len_new, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); @@ -2675,15 +2676,15 @@ static ssize_t recompress_store(struct device *dev, if (!num_recomp_pages) break; - zram_slot_lock(zram, pps->index); - if (!zram_test_flag(zram, pps->index, ZRAM_PP_SLOT)) + slot_lock(zram, pps->index); + if (!test_slot_flag(zram, pps->index, ZRAM_PP_SLOT)) goto next; err = recompress_slot(zram, pps->index, page, &num_recomp_pages, threshold, prio, prio_max); next: - zram_slot_unlock(zram, pps->index); + slot_unlock(zram, pps->index); release_pp_slot(zram, pps); if (err) { @@ -2729,9 +2730,9 @@ static void zram_bio_discard(struct zram *zram, struct bio *bio) } while (n >= PAGE_SIZE) { - zram_slot_lock(zram, index); - zram_slot_free(zram, index); - zram_slot_unlock(zram, index); + slot_lock(zram, index); + slot_free(zram, index); + slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); index++; n -= PAGE_SIZE; @@ -2760,9 +2761,9 @@ static void zram_bio_read(struct zram *zram, struct bio *bio) } flush_dcache_page(bv.bv_page); - zram_slot_lock(zram, index); - zram_accessed(zram, index); - zram_slot_unlock(zram, index); + slot_lock(zram, index); + mark_slot_accessed(zram, index); + slot_unlock(zram, index); bio_advance_iter_single(bio, &iter, bv.bv_len); } while (iter.bi_size); @@ -2790,9 +2791,9 @@ static void zram_bio_write(struct zram *zram, struct bio *bio) break; } - zram_slot_lock(zram, index); - zram_accessed(zram, index); - zram_slot_unlock(zram, index); + slot_lock(zram, index); + mark_slot_accessed(zram, index); + slot_unlock(zram, index); bio_advance_iter_single(bio, &iter, bv.bv_len); } while (iter.bi_size); @@ -2833,13 +2834,13 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; atomic64_inc(&zram->stats.notify_free); - if (!zram_slot_trylock(zram, index)) { + if (!slot_trylock(zram, index)) { atomic64_inc(&zram->stats.miss_free); return; } - zram_slot_free(zram, index); - zram_slot_unlock(zram, index); + slot_free(zram, index); + slot_unlock(zram, index); } static void zram_comp_params_reset(struct zram *zram) -- cgit v1.2.3 
From 4932844eb87076a8c51bc6bcf8bfcf7ad30edd75 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 15 Dec 2025 14:47:13 +0900 Subject: zram: trivial fix of recompress_slot() coding styles A minor fixup of 80-cols breakage in recompress_slot() comment and zs_malloc() call. Link: https://lkml.kernel.org/r/ff3254847dbdc6fbd2e3fed53c572a261d60b7b6.1765775954.git.senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Cc: Richard Chang Cc: Chris Mason Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bd9a37fca675..df30150e6ed8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2517,14 +2517,15 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, * avoid direct reclaim. Allocation error is not fatal since * we still have the old object in the mem_pool. * - * XXX: technically, the node we really want here is the node that holds - * the original compressed data. But that would require us to modify - * zsmalloc API to return this information. For now, we will make do with - * the node of the page allocated for recompression. + * XXX: technically, the node we really want here is the node that + * holds the original compressed data. But that would require us to + * modify zsmalloc API to return this information. For now, we will + * make do with the node of the page allocated for recompression. */ handle_new = zs_malloc(zram->mem_pool, comp_len_new, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); + __GFP_HIGHMEM | __GFP_MOVABLE, + page_to_nid(page)); if (IS_ERR_VALUE(handle_new)) { zcomp_stream_put(zstrm); return PTR_ERR((void *)handle_new); -- cgit v1.2.3 From 8b05d2d8af817c6a1e23032df51e7ad83030d543 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 15 Jan 2026 12:30:06 +0900 Subject: zram: fixup read_block_state() ac_time is now in seconds, do not use ktime_to_timespec64() [akpm@linux-foundation.org: remove now-unused local `ts'] [akpm@linux-foundation.org: fix build] Link: https://lkml.kernel.org/r/20260115033031.3818977-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reported-by: Chris Mason Closes: https://lkml.kernel.org/r/20260114124522.1326519-1-clm@meta.com Cc: Brian Geffon Cc: David Stevens Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index df30150e6ed8..7dcfc71d2cac 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1570,7 +1570,6 @@ static ssize_t read_block_state(struct file *file, char __user *buf, ssize_t index, written = 0; struct zram *zram = file->private_data; unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; - struct timespec64 ts; kbuf = kvmalloc(count, GFP_KERNEL); if (!kbuf) @@ -1589,11 +1588,9 @@ static ssize_t read_block_state(struct file *file, char __user *buf, if (!slot_allocated(zram, index)) goto next; - ts = ktime_to_timespec64(zram->table[index].attr.ac_time); copied = snprintf(kbuf + written, count, - "%12zd %12lld.%06lu %c%c%c%c%c%c\n", - index, (s64)ts.tv_sec, - ts.tv_nsec / NSEC_PER_USEC, + "%12zd %12u.%06d %c%c%c%c%c%c\n", + index, zram->table[index].attr.ac_time, 0, test_slot_flag(zram, index, ZRAM_SAME) ? 
's' : '.', test_slot_flag(zram, index, ZRAM_WB) ? 'w' : '.', test_slot_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', -- cgit v1.2.3 From 8e38607aa4aa8ee7ad4058d183465d248d04dca4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 6 Jan 2026 23:20:02 -0800 Subject: treewide: provide a generic clear_user_page() variant Patch series "mm: folio_zero_user: clear page ranges", v11. This series adds clearing of contiguous page ranges for hugepages. The series improves on the current discontiguous clearing approach in two ways: - clear pages in a contiguous fashion. - use batched clearing via clear_pages() wherever exposed. The first is useful because it allows us to make much better use of hardware prefetchers. The second, enables advertising the real extent to the processor. Where specific instructions support it (ex. string instructions on x86; "mops" on arm64 etc), a processor can optimize based on this because, instead of seeing a sequence of 8-byte stores, or a sequence of 4KB pages, it sees a larger unit being operated on. For instance, AMD Zen uarchs (for extents larger than LLC-size) switch to a mode where they start eliding cacheline allocation. This is helpful not just because it results in higher bandwidth, but also because now the cache is not evicting useful cachelines and replacing them with zeroes. Demand faulting a 64GB region shows performance improvement: $ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5 baseline +series (GBps +- %stdev) (GBps +- %stdev) pg-sz=2MB 11.76 +- 1.10% 25.34 +- 1.18% [*] +115.47% preempt=* pg-sz=1GB 24.85 +- 2.41% 39.22 +- 2.32% + 57.82% preempt=none|voluntary pg-sz=1GB (similar) 52.73 +- 0.20% [#] +112.19% preempt=full|lazy [*] This improvement is because switching to sequential clearing allows the hardware prefetchers to do a much better job. [#] For pg-sz=1GB a large part of the improvement is because of the cacheline elision mentioned above. preempt=full|lazy improves upon that because, not needing explicit invocations of cond_resched() to ensure reasonable preemption latency, it can clear the full extent as a single unit. In comparison the maximum extent used for preempt=none|voluntary is PROCESS_PAGES_NON_PREEMPT_BATCH (32MB). When provided the full extent the processor forgoes allocating cachelines on this path almost entirely. (The hope is that eventually, in the fullness of time, the lazy preemption model will be able to do the same job that none or voluntary models are used for, allowing us to do away with cond_resched().) Raghavendra also tested previous version of the series on AMD Genoa and sees similar improvement [1] with preempt=lazy. $ perf bench mem map -p $page-size -f populate -s 64GB -l 10 base patched change pg-sz=2MB 12.731939 GB/sec 26.304263 GB/sec 106.6% pg-sz=1GB 26.232423 GB/sec 61.174836 GB/sec 133.2% This patch (of 8): Let's drop all variants that effectively map to clear_page() and provide it in a generic variant instead. We'll use the macro clear_user_page to indicate whether an architecture provides it's own variant. Also, clear_user_page() is only called from the generic variant of clear_user_highpage(), so define it only if the architecture does not provide a clear_user_highpage(). And, for simplicity define it in linux/highmem.h. Note that for parisc, clear_page() and clear_user_page() map to clear_page_asm(), so we can just get rid of the custom clear_user_page() implementation. There is a clear_user_page_asm() function on parisc, that seems to be unused. Not sure what's up with that. 
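The resulting header-side pattern is easiest to see as a sketch. The following is an editor's approximation of the generic fallback; the exact guard names in the linux/highmem.h hunk are assumptions since that hunk is not quoted in full here. Architectures that keep their own implementation announce it with "#define clear_user_page clear_user_page" (as the arc, csky and mips hunks below do); everyone else gets a thin wrapper around clear_page(), and the fallback is only needed when the architecture does not already supply clear_user_highpage().

/* include/linux/highmem.h -- sketch, not the verbatim hunk */
#ifndef clear_user_highpage
#ifndef clear_user_page
static inline void clear_user_page(void *addr, unsigned long vaddr,
                                   struct page *page)
{
        clear_page(addr);
}
#endif

static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
        void *addr = kmap_local_page(page);

        clear_user_page(addr, vaddr, page);
        kunmap_local(addr);
}
#endif
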
Link: https://lkml.kernel.org/r/20260107072009.1615991-1-ankur.a.arora@oracle.com Link: https://lkml.kernel.org/r/20260107072009.1615991-2-ankur.a.arora@oracle.com Signed-off-by: David Hildenbrand Co-developed-by: Ankur Arora Signed-off-by: Ankur Arora Cc: Andy Lutomirski Cc: Ankur Arora Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 1 - arch/arc/include/asm/page.h | 2 ++ arch/arm/include/asm/page-nommu.h | 1 - arch/arm64/include/asm/page.h | 1 - arch/csky/abiv1/inc/abi/page.h | 1 + arch/csky/abiv2/inc/abi/page.h | 7 ------- arch/hexagon/include/asm/page.h | 1 - arch/loongarch/include/asm/page.h | 1 - arch/m68k/include/asm/page_no.h | 1 - arch/microblaze/include/asm/page.h | 1 - arch/mips/include/asm/page.h | 1 + arch/nios2/include/asm/page.h | 1 + arch/openrisc/include/asm/page.h | 1 - arch/parisc/include/asm/page.h | 1 - arch/powerpc/include/asm/page.h | 1 + arch/riscv/include/asm/page.h | 1 - arch/s390/include/asm/page.h | 1 - arch/sparc/include/asm/page_64.h | 1 + arch/um/include/asm/page.h | 1 - arch/x86/include/asm/page.h | 6 ------ arch/xtensa/include/asm/page.h | 1 - include/linux/highmem.h | 24 ++++++++++++++++++++++-- 22 files changed, 29 insertions(+), 28 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index d2c6667d73e9..59d01f9b77f6 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -11,7 +11,6 @@ #define STRICT_MM_TYPECHECKS extern void clear_page(void *page); -#define clear_user_page(page, vaddr, pg) clear_page(page) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 9720fe6b2c24..38214e126c6d 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -32,6 +32,8 @@ struct page; void copy_user_highpage(struct page *to, struct page *from, unsigned long u_vaddr, struct vm_area_struct *vma); + +#define clear_user_page clear_user_page void clear_user_page(void *to, unsigned long u_vaddr, struct page *page); typedef struct { diff --git a/arch/arm/include/asm/page-nommu.h b/arch/arm/include/asm/page-nommu.h index 7c2c72323d17..e74415c959be 100644 --- a/arch/arm/include/asm/page-nommu.h +++ b/arch/arm/include/asm/page-nommu.h @@ -11,7 +11,6 @@ #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) /* diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 00f117ff4f7a..b39cc1127e1f 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -36,7 +36,6 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, bool tag_clear_highpages(struct page *to, int numpages); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) typedef struct page *pgtable_t; diff --git a/arch/csky/abiv1/inc/abi/page.h 
b/arch/csky/abiv1/inc/abi/page.h index 2d2159933b76..58307254e7e5 100644 --- a/arch/csky/abiv1/inc/abi/page.h +++ b/arch/csky/abiv1/inc/abi/page.h @@ -10,6 +10,7 @@ static inline unsigned long pages_do_alias(unsigned long addr1, return (addr1 ^ addr2) & (SHMLBA-1); } +#define clear_user_page clear_user_page static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page) { diff --git a/arch/csky/abiv2/inc/abi/page.h b/arch/csky/abiv2/inc/abi/page.h index cf005f13cd15..a5a255013308 100644 --- a/arch/csky/abiv2/inc/abi/page.h +++ b/arch/csky/abiv2/inc/abi/page.h @@ -1,11 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ - -static inline void clear_user_page(void *addr, unsigned long vaddr, - struct page *page) -{ - clear_page(addr); -} - static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *page) { diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index 137ba7c5de48..f0aed3ed812b 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -113,7 +113,6 @@ static inline void clear_page(void *page) /* * Under assumption that kernel always "sees" user map... */ -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) static inline unsigned long virt_to_pfn(const void *kaddr) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 256d1ff7a1e3..327bf0bc92bf 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -30,7 +30,6 @@ extern void clear_page(void *page); extern void copy_page(void *to, void *from); -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) extern unsigned long shm_align_mask; diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 39db2026a4b4..d2532bc407ef 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -10,7 +10,6 @@ extern unsigned long memory_end; #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 90ac9f34b4b4..e1e396367ba7 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -45,7 +45,6 @@ typedef unsigned long pte_basic_t; # define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) # define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) -# define clear_user_page(pgaddr, vaddr, page) memset((pgaddr), 0, PAGE_SIZE) # define copy_user_page(vto, vfrom, vaddr, topg) \ memcpy((vto), (vfrom), PAGE_SIZE) diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index bc3e3484c1bf..5ec428fcc887 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -90,6 +90,7 @@ static inline void clear_user_page(void *addr, unsigned long vaddr, if (pages_do_alias((unsigned long) addr, vaddr & PAGE_MASK)) flush_data_cache_page((unsigned long)addr); } +#define clear_user_page clear_user_page struct vm_area_struct; extern void copy_user_highpage(struct page *to, struct page *from, diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 00a51623d38a..722956ac0bf8 100644 --- 
a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -45,6 +45,7 @@ struct page; +#define clear_user_page clear_user_page extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page); extern void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, struct page *to); diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index 85797f94d1d7..d2cdbf3579bb 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -30,7 +30,6 @@ #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) /* diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 8f4e51071ea1..3630b36d07da 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -21,7 +21,6 @@ struct vm_area_struct; void clear_page_asm(void *page); void copy_page_asm(void *to, void *from); -#define clear_user_page(vto, vaddr, page) clear_page_asm(vto) void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma); #define __HAVE_ARCH_COPY_USER_HIGHPAGE diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index b28fbb1d57eb..f2bb1f98eebe 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -271,6 +271,7 @@ static inline const void *pfn_to_kaddr(unsigned long pfn) struct page; extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); +#define clear_user_page clear_user_page extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); extern int devmem_is_allowed(unsigned long pfn); diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index ffe213ad65a4..061b60b954ec 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -50,7 +50,6 @@ void clear_page(void *page); #endif #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(pgaddr, vaddr, page) clear_page(pgaddr) #define copy_user_page(vto, vfrom, vaddr, topg) \ memcpy((vto), (vfrom), PAGE_SIZE) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index c1d63b613bf9..9c8c5283258e 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -65,7 +65,6 @@ static inline void copy_page(void *to, void *from) : : "memory", "cc"); } -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index d764d8a8586b..fd4dc85fb38b 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -43,6 +43,7 @@ void _clear_page(void *page); #define clear_page(X) _clear_page((void *)(X)) struct page; void clear_user_page(void *addr, unsigned long vaddr, struct page *page); +#define clear_user_page clear_user_page #define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE) void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage); #define __HAVE_ARCH_COPY_USER_HIGHPAGE diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 2d363460d896..e348ff489b89 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -26,7 +26,6 @@ struct page; 
#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) typedef struct { unsigned long pte; } pte_t; diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 9265f2fca99a..416dc88e35c1 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -22,12 +22,6 @@ struct page; extern struct range pfn_mapped[]; extern int nr_pfn_mapped; -static inline void clear_user_page(void *page, unsigned long vaddr, - struct page *pg) -{ - clear_page(page); -} - static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 20655174b111..059493256765 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -126,7 +126,6 @@ void clear_user_highpage(struct page *page, unsigned long vaddr); void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma); #else -# define clear_user_page(page, vaddr, pg) clear_page(page) # define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index abc20f9810fd..393bd51e5a1f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -197,15 +197,35 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) } #endif -/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage +#ifndef clear_user_page +/** + * clear_user_page() - clear a page to be mapped to user space + * @addr: the address of the page + * @vaddr: the address of the user mapping + * @page: the page + * + * We condition the definition of clear_user_page() on the architecture + * not having a custom clear_user_highpage(). That's because if there + * is some special flushing needed for clear_user_highpage() then it + * is likely that clear_user_page() also needs some magic. And, since + * our only caller is the generic clear_user_highpage(), not defining + * is not much of a loss. + */ +static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page) +{ + clear_page(addr); +} +#endif + +/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } -#endif +#endif /* clear_user_highpage */ #ifndef vma_alloc_zeroed_movable_folio /** -- cgit v1.2.3 From 62a9f5a85b98d6d2d9b5e0d67b2d4e5903bc53ec Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:03 -0800 Subject: mm: introduce clear_pages() and clear_user_pages() Introduce clear_pages(), to be overridden by architectures that support more efficient clearing of consecutive pages. Also introduce clear_user_pages(), however, we will not expect this function to be overridden anytime soon. As we do for clear_user_page(), define clear_user_pages() only if the architecture does not define clear_user_highpage(). That is because if the architecture does define clear_user_highpage(), then it likely needs some flushing magic when clearing user pages or highpages. 
This means we can get away without defining clear_user_pages(), since, much like its single page sibling, its only potential user is the generic clear_user_highpages() which should instead be using clear_user_highpage(). Link: https://lkml.kernel.org/r/20260107072009.1615991-3-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/highmem.h | 33 +++++++++++++++++++++++++++++++++ include/linux/mm.h | 20 ++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 393bd51e5a1f..019ab7d8c841 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -218,6 +218,39 @@ static inline void clear_user_page(void *addr, unsigned long vaddr, struct page } #endif +/** + * clear_user_pages() - clear a page range to be mapped to user space + * @addr: start address + * @vaddr: start address of the user mapping + * @page: start page + * @npages: number of pages + * + * Assumes that the region (@addr, +@npages) has been validated + * already so this does no exception handling. + * + * If the architecture provides a clear_user_page(), use that; + * otherwise, we can safely use clear_pages(). + */ +static inline void clear_user_pages(void *addr, unsigned long vaddr, + struct page *page, unsigned int npages) +{ + +#ifdef clear_user_page + do { + clear_user_page(addr, vaddr, page); + addr += PAGE_SIZE; + vaddr += PAGE_SIZE; + page++; + } while (--npages); +#else + /* + * Prefer clear_pages() to allow for architectural optimizations + * when operating on contiguous page ranges. + */ + clear_pages(addr, npages); +#endif +} + /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f0d5be9dc736..d78e294698b0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4198,6 +4198,26 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifndef clear_pages +/** + * clear_pages() - clear a page range for kernel-internal use. + * @addr: start address + * @npages: number of pages + * + * Use clear_user_pages() instead when clearing a page range to be + * mapped to user space. + * + * Does absolutely no exception handling. + */ +static inline void clear_pages(void *addr, unsigned int npages) +{ + do { + clear_page(addr); + addr += PAGE_SIZE; + } while (--npages); +} +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); -- cgit v1.2.3 From 8d846b723e5723d98d859df9feeab89c2c889fb2 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:04 -0800 Subject: highmem: introduce clear_user_highpages() Define clear_user_highpages() which uses the range clearing primitive, clear_user_pages(). We can safely use this when CONFIG_HIGHMEM is disabled and if the architecture does not have clear_user_highpage. 
The first is needed to ensure that contiguous page ranges stay contiguous which precludes intermediate maps via HIGMEM. The second, because if the architecture has clear_user_highpage(), it likely needs flushing magic when clearing the page, magic that we aren't privy to. For both of those cases, just fallback to a loop around clear_user_highpage(). Link: https://lkml.kernel.org/r/20260107072009.1615991-4-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/highmem.h | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 019ab7d8c841..af03db851a1d 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -251,7 +251,14 @@ static inline void clear_user_pages(void *addr, unsigned long vaddr, #endif } -/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ +/** + * clear_user_highpage() - clear a page to be mapped to user space + * @page: start page + * @vaddr: start address of the user mapping + * + * With !CONFIG_HIGHMEM this (and the copy_user_highpage() below) will + * be plain clear_user_page() (and copy_user_page()). + */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); @@ -260,6 +267,42 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif /* clear_user_highpage */ +/** + * clear_user_highpages() - clear a page range to be mapped to user space + * @page: start page + * @vaddr: start address of the user mapping + * @npages: number of pages + * + * Assumes that all the pages in the region (@page, +@npages) are valid + * so this does no exception handling. + */ +static inline void clear_user_highpages(struct page *page, unsigned long vaddr, + unsigned int npages) +{ + +#if defined(clear_user_highpage) || defined(CONFIG_HIGHMEM) + /* + * An architecture defined clear_user_highpage() implies special + * handling is needed. + * + * So we use that or, the generic variant if CONFIG_HIGHMEM is + * enabled. + */ + do { + clear_user_highpage(page, vaddr); + vaddr += PAGE_SIZE; + page++; + } while (--npages); +#else + + /* + * Prefer clear_user_pages() to allow for architectural optimizations + * when operating on contiguous page ranges. + */ + clear_user_pages(page_address(page), vaddr, page, npages); +#endif +} + #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. -- cgit v1.2.3 From 54a6b89a3db2ecb4462abcd6e6e52dfebaa7e6c4 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:05 -0800 Subject: x86/mm: simplify clear_page_* clear_page_rep() and clear_page_erms() are wrappers around "REP; STOS" variations. Inlining gets rid of an unnecessary CALL/RET (which isn't free when using RETHUNK speculative execution mitigations.) Fixup and rename clear_page_orig() to adapt to the changed calling convention. Also add a comment from Dave Hansen detailing various clearing mechanisms used in clear_page(). 
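Conceptually (a sketch, not code taken from this patch), the calling convention changes from an unconditional out-of-line call to a mostly inline sequence, with only the unrolled fallback retaining a CALL:

    /* before: even the REP variants went through a CALL/RET */
    clear_page(page);            /* -> call clear_page_erms (or _rep/_orig)  */

    /* after: alternatives patch in one of three inline bodies */
    clear_page(addr);            /* rcx = 4096, rax = 0, rdi = addr          */
                                 /* ERMS:     rep stosb                      */
                                 /* REP_GOOD: shrq $3, %rcx; rep stosq       */
                                 /* fallback: call __clear_pages_unrolled    */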
Link: https://lkml.kernel.org/r/20260107072009.1615991-5-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Tested-by: Raghavendra K T Reviewed-by: Borislav Petkov (AMD) Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: David Hildenbrand Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/page_32.h | 6 ++++ arch/x86/include/asm/page_64.h | 69 ++++++++++++++++++++++++++++++++---------- arch/x86/lib/clear_page_64.S | 39 +++++------------------- 3 files changed, 67 insertions(+), 47 deletions(-) diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 0c623706cb7e..19fddb002cc9 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -17,6 +17,12 @@ extern unsigned long __phys_addr(unsigned long); #include +/** + * clear_page() - clear a page using a kernel virtual address. + * @page: address of kernel page + * + * Does absolutely no exception handling. + */ static inline void clear_page(void *page) { memset(page, 0, PAGE_SIZE); diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 2f0e47be79a4..ec3307234a17 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -48,26 +48,63 @@ static inline unsigned long __phys_addr_symbol(unsigned long x) #define __phys_reloc_hide(x) (x) -void clear_page_orig(void *page); -void clear_page_rep(void *page); -void clear_page_erms(void *page); -KCFI_REFERENCE(clear_page_orig); -KCFI_REFERENCE(clear_page_rep); -KCFI_REFERENCE(clear_page_erms); - -static inline void clear_page(void *page) +void __clear_pages_unrolled(void *page); +KCFI_REFERENCE(__clear_pages_unrolled); + +/** + * clear_page() - clear a page using a kernel virtual address. + * @addr: address of kernel page + * + * Switch between three implementations of page clearing based on CPU + * capabilities: + * + * - __clear_pages_unrolled(): the oldest, slowest and universally + * supported method. Zeroes via 8-byte MOV instructions unrolled 8x + * to write a 64-byte cacheline in each loop iteration. + * + * - "REP; STOSQ": really old CPUs had crummy REP implementations. + * Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be + * trusted. The instruction writes 8-byte per REP iteration but + * CPUs can internally batch these together and do larger writes. + * + * - "REP; STOSB": used on CPUs with "enhanced REP MOVSB/STOSB", + * which enumerate 'ERMS' and provide an implementation which + * unlike "REP; STOSQ" above wasn't overly picky about alignment. + * The instruction writes 1-byte per REP iteration with CPUs + * internally batching these together into larger writes and is + * generally fastest of the three. + * + * Note that when running as a guest, features exposed by the CPU + * might be mediated by the hypervisor. So, the STOSQ variant might + * be in active use on some systems even when the hardware enumerates + * ERMS. + * + * Does absolutely no exception handling. + */ +static inline void clear_page(void *addr) { + u64 len = PAGE_SIZE; /* * Clean up KMSAN metadata for the page being cleared. The assembly call - * below clobbers @page, so we perform unpoisoning before it. + * below clobbers @addr, so perform unpoisoning before it. 
+ */ + kmsan_unpoison_memory(addr, len); + + /* + * The inline asm embeds a CALL instruction and usually that is a no-no + * due to the compiler not knowing that and thus being unable to track + * callee-clobbered registers. + * + * In this case that is fine because the registers clobbered by + * __clear_pages_unrolled() are part of the inline asm register + * specification. */ - kmsan_unpoison_memory(page, PAGE_SIZE); - alternative_call_2(clear_page_orig, - clear_page_rep, X86_FEATURE_REP_GOOD, - clear_page_erms, X86_FEATURE_ERMS, - "=D" (page), - "D" (page), - "cc", "memory", "rax", "rcx"); + asm volatile(ALTERNATIVE_2("call __clear_pages_unrolled", + "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD, + "rep stosb", X86_FEATURE_ERMS) + : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT + : "a" (0) + : "cc", "memory"); } void copy_page(void *to, void *from); diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index a508e4a8c66a..f7f356e7218b 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -6,30 +6,15 @@ #include /* - * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is - * recommended to use this when possible and we do use them by default. - * If enhanced REP MOVSB/STOSB is not available, try to use fast string. - * Otherwise, use original. + * Zero page aligned region. + * %rdi - dest + * %rcx - length */ - -/* - * Zero a page. - * %rdi - page - */ -SYM_TYPED_FUNC_START(clear_page_rep) - movl $4096/8,%ecx - xorl %eax,%eax - rep stosq - RET -SYM_FUNC_END(clear_page_rep) -EXPORT_SYMBOL_GPL(clear_page_rep) - -SYM_TYPED_FUNC_START(clear_page_orig) - xorl %eax,%eax - movl $4096/64,%ecx +SYM_TYPED_FUNC_START(__clear_pages_unrolled) + shrq $6, %rcx .p2align 4 .Lloop: - decl %ecx + decq %rcx #define PUT(x) movq %rax,x*8(%rdi) movq %rax,(%rdi) PUT(1) @@ -43,16 +28,8 @@ SYM_TYPED_FUNC_START(clear_page_orig) jnz .Lloop nop RET -SYM_FUNC_END(clear_page_orig) -EXPORT_SYMBOL_GPL(clear_page_orig) - -SYM_TYPED_FUNC_START(clear_page_erms) - movl $4096,%ecx - xorl %eax,%eax - rep stosb - RET -SYM_FUNC_END(clear_page_erms) -EXPORT_SYMBOL_GPL(clear_page_erms) +SYM_FUNC_END(__clear_pages_unrolled) +EXPORT_SYMBOL_GPL(__clear_pages_unrolled) /* * Default clear user-space. -- cgit v1.2.3 From cb431accb36e51b64ce34b5cc4d5ed292895fd84 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:06 -0800 Subject: x86/clear_page: introduce clear_pages() Performance when clearing with string instructions (x86-64-stosq and similar) can vary significantly based on the chunk-size used. $ perf bench mem memset -k 4KB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 13.748208 GB/sec $ perf bench mem memset -k 2MB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in # arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 15.067900 GB/sec $ perf bench mem memset -k 1GB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 38.104311 GB/sec (Both on AMD Milan.) With a change in chunk-size from 4KB to 1GB, we see the performance go from 13.7 GB/sec to 38.1 GB/sec. For the chunk-size of 2MB the change isn't quite as drastic but it is worth adding a clear_page() variant that can handle contiguous page-extents. 
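A minimal usage sketch (hypothetical caller, not taken from this series): clearing a PMD-sized extent in one call rather than 512 individual clear_page() calls, which is what lets the processor see the full extent being operated on:

    void *kaddr = page_address(page);

    /* 2MB extent == 512 base pages on x86-64 */
    clear_pages(kaddr, 1 << (PMD_SHIFT - PAGE_SHIFT));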
Link: https://lkml.kernel.org/r/20260107072009.1615991-6-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Tested-by: Raghavendra K T Reviewed-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/page_64.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index ec3307234a17..1895c207f629 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -52,8 +52,9 @@ void __clear_pages_unrolled(void *page); KCFI_REFERENCE(__clear_pages_unrolled); /** - * clear_page() - clear a page using a kernel virtual address. - * @addr: address of kernel page + * clear_pages() - clear a page range using a kernel virtual address. + * @addr: start address of kernel page range + * @npages: number of pages * * Switch between three implementations of page clearing based on CPU * capabilities: @@ -81,11 +82,11 @@ KCFI_REFERENCE(__clear_pages_unrolled); * * Does absolutely no exception handling. */ -static inline void clear_page(void *addr) +static inline void clear_pages(void *addr, unsigned int npages) { - u64 len = PAGE_SIZE; + u64 len = npages * PAGE_SIZE; /* - * Clean up KMSAN metadata for the page being cleared. The assembly call + * Clean up KMSAN metadata for the pages being cleared. The assembly call * below clobbers @addr, so perform unpoisoning before it. */ kmsan_unpoison_memory(addr, len); @@ -106,6 +107,12 @@ static inline void clear_page(void *addr) : "a" (0) : "cc", "memory"); } +#define clear_pages clear_pages + +static inline void clear_page(void *addr) +{ + clear_pages(addr, 1); +} void copy_page(void *to, void *from); KCFI_REFERENCE(copy_page); -- cgit v1.2.3 From 9890ecab6ad9c0d3d342469f3b619fd704b5c59a Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:07 -0800 Subject: mm: folio_zero_user: clear pages sequentially process_huge_pages(), used to clear hugepages, is optimized for cache locality. In particular it processes a hugepage in 4KB page units and in a difficult to predict order: clearing pages in the periphery in a backwards or forwards direction, then converging inwards to the faulting page (or page specified via base_addr.) This helps maximize temporal locality at time of access. However, while it keeps stores inside a 4KB page sequential, pages are ordered semi-randomly in a way that is not easy for the processor to predict. This limits the clearing bandwidth to what's available in a 4KB page. Consider the baseline bandwidth: $ perf bench mem mmap -p 2MB -f populate -s 64GB -l 3 # Running 'mem/mmap' benchmark: # function 'populate' (Eagerly populated mmap()) # Copying 64GB bytes ... 11.791097 GB/sec (Unless otherwise noted, all numbers are on AMD Genoa (EPYC 9J13); region-size=64GB, local node; 2.56 GHz, boost=0.) 11.79 GBps amounts to around 323ns/4KB. With memory access latency of ~100ns, that doesn't leave much time to help from, say, hardware prefetchers. (Note that since this is a purely write workload, it's reasonable to assume that the processor does not need to prefetch any cachelines. 
However, for a processor to skip the prefetch, it would need to look at the access pattern, and see that full cachelines were being written. This might be easily visible if clear_page() was using, say x86 string instructions; less so if it were using a store loop. In any case, the existence of these kind predictors or appropriately helpful threshold values is implementation specific. Additionally, even when the processor can skip the prefetch, coherence protocols will still need to establish exclusive ownership necessitating communication with remote caches.) With that, the change is quite straight-forward. Instead of clearing pages discontiguously, clear contiguously: switch to a loop around clear_user_highpage(). Performance == Testing a demand fault workload shows a decent improvement in bandwidth with pg-sz=2MB. Performance of pg-sz=1GB does not change because it has always used straight clearing. $ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5 discontiguous-pages contiguous-pages (baseline) (GBps +- %stdev) (GBps +- %stdev) pg-sz=2MB 11.76 +- 1.10% 23.58 +- 1.95% +100.51% pg-sz=1GB 24.85 +- 2.41% 25.40 +- 1.33% - Analysis (pg-sz=2MB) == At L1 data cache level, nothing changes. The processor continues to access the same number of cachelines, allocating and missing them as it writes to them. discontiguous-pages 7,394,341,051 L1-dcache-loads # 445.172 M/sec ( +- 0.04% ) (35.73%) 3,292,247,227 L1-dcache-load-misses # 44.52% of all L1-dcache accesses ( +- 0.01% ) (35.73%) contiguous-pages 7,205,105,282 L1-dcache-loads # 861.895 M/sec ( +- 0.02% ) (35.75%) 3,241,584,535 L1-dcache-load-misses # 44.99% of all L1-dcache accesses ( +- 0.00% ) (35.74%) The L2 prefetcher, however, is now able to prefetch ~22% more cachelines (L2 prefetch miss rate also goes up significantly showing that we are backend limited): discontiguous-pages 2,835,860,245 l2_pf_hit_l2.all # 170.242 M/sec ( +- 0.12% ) (15.65%) contiguous-pages 3,472,055,269 l2_pf_hit_l2.all # 411.319 M/sec ( +- 0.62% ) (15.67%) That sill leaves a large gap between the ~22% improvement in prefetch and the ~100% improvement in bandwidth but better prefetching seems to streamline the traffic well enough that most of the data starts comes from the L2 leading to substantially fewer cache-misses at the LLC: discontiguous-pages 8,493,499,137 cache-references # 511.416 M/sec ( +- 0.15% ) (50.01%) 930,501,344 cache-misses # 10.96% of all cache refs ( +- 0.52% ) (50.01%) contiguous-pages 9,421,926,416 cache-references # 1.120 G/sec ( +- 0.09% ) (50.02%) 68,787,247 cache-misses # 0.73% of all cache refs ( +- 0.15% ) (50.03%) In addition, there are a few minor frontend optimizations: clear_pages() on x86 is now fully inlined, so we don't have a CALL/RET pair (which isn't free when using RETHUNK speculative execution mitigation as we do on my test system.) The loop in clear_contig_highpages() is also easier to predict (especially when handling faults) as compared to that in process_huge_pages(). discontiguous-pages 980,014,411 branches # 59.005 M/sec (31.26%) discontiguous-pages 180,897,177 branch-misses # 18.46% of all branches (31.26%) contiguous-pages 515,630,550 branches # 62.654 M/sec (31.27%) contiguous-pages 78,039,496 branch-misses # 15.13% of all branches (31.28%) Note that although clearing contiguously is easier to optimize for the processor, it does not, sadly, mean that the processor will necessarily take advantage of it. 
For instance this change does not result in any improvement in my tests on Intel Icelakex (Oracle X9), or on ARM64 Neoverse-N1 (Ampere Altra). Link: https://lkml.kernel.org/r/20260107072009.1615991-7-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Reviewed-by: Raghavendra K T Tested-by: Raghavendra K T Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e0bce673f053..74d663943ecb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7240,40 +7240,30 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, - unsigned int nr_pages) +static void clear_contig_highpages(struct page *page, unsigned long addr, + unsigned int nr_pages) { - unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); - int i; + unsigned int i; might_sleep(); for (i = 0; i < nr_pages; i++) { cond_resched(); - clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); - } -} - -static int clear_subpage(unsigned long addr, int idx, void *arg) -{ - struct folio *folio = arg; - clear_user_highpage(folio_page(folio, idx), addr); - return 0; + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } } /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. - * @addr_hint: The address will be accessed or the base address if uncelar. + * @addr_hint: The address accessed by the user or the base address. */ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { - unsigned int nr_pages = folio_nr_pages(folio); + unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); - if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) - clear_gigantic_page(folio, addr_hint, nr_pages); - else - process_huge_page(addr_hint, nr_pages, clear_subpage, folio); + clear_contig_highpages(folio_page(folio, 0), + base_addr, folio_nr_pages(folio)); } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, -- cgit v1.2.3 From 94962b2628e6af2c48be6ebdf9f76add28d60ecc Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:08 -0800 Subject: mm: folio_zero_user: clear page ranges Use batch clearing in clear_contig_highpages() instead of clearing a single page at a time. Exposing larger ranges enables the processor to optimize based on extent. To do this we just switch to using clear_user_highpages() which would in turn use clear_user_pages() or clear_pages(). Batched clearing, when running under non-preemptible models, however, has latency considerations. In particular, we need periodic invocations of cond_resched() to keep to reasonable preemption latencies. This is a problem because the clearing primitives do not, or might not be able to, call cond_resched() to check if preemption is needed. So, limit the worst case preemption latency by doing the clearing in units of no more than PROCESS_PAGES_NON_PREEMPT_BATCH pages. (Preemptible models already define away most of cond_resched(), so the batch size is ignored when running under those.) 
PROCESS_PAGES_NON_PREEMPT_BATCH: for architectures with "fast" clear-pages (ones that define clear_pages()), we define it as 32MB worth of pages. This is meant to be large enough to allow the processor to optimize the operation and yet small enough that we see reasonable preemption latency for when this optimization is not possible (ex. slow microarchitectures, memory bandwidth saturation.) This specific value also allows for a cacheline allocation elision optimization (which might help unrelated applications by not evicting potentially useful cache lines) that kicks in recent generations of AMD Zen processors at around LLC-size (32MB is a typical size). At the same time 32MB is small enough that even with poor clearing bandwidth (say ~10GBps), time to clear 32MB should be well below the scheduler's default warning threshold (sysctl_resched_latency_warn_ms=100). "Slow" architectures (don't have clear_pages()) will continue to use the base value (single page). Performance == Testing a demand fault workload shows a decent improvement in bandwidth with pg-sz=1GB. Bandwidth with pg-sz=2MB stays flat. $ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5 contiguous-pages batched-pages (GBps +- %stdev) (GBps +- %stdev) pg-sz=2MB 23.58 +- 1.95% 25.34 +- 1.18% + 7.50% preempt=* pg-sz=1GB 25.09 +- 0.79% 39.22 +- 2.32% + 56.31% preempt=none|voluntary pg-sz=1GB 25.71 +- 0.03% 52.73 +- 0.20% [#] +110.16% preempt=full|lazy [#] We perform much better with preempt=full|lazy because, not needing explicit invocations of cond_resched() we can clear the full extent (pg-sz=1GB) as a single unit which the processor can optimize for. (Unless otherwise noted, all numbers are on AMD Genoa (EPYC 9J13); region-size=64GB, local node; 2.56 GHz, boost=0.) Analysis == pg-sz=1GB: the improvement we see falls in two buckets depending on the batch size in use. For batch-size=32MB the number of cachelines allocated (L1-dcache-loads) -- which stay relatively flat for smaller batches, start to drop off because cacheline allocation elision kicks in. And as can be seen below, at batch-size=1GB, we stop allocating cachelines almost entirely. (Not visible here but from testing with intermediate sizes, the allocation change kicks in only at batch-size=32MB and ramps up from there.) contigous-pages 6,949,417,798 L1-dcache-loads # 883.599 M/sec ( +- 0.01% ) (35.75%) 3,226,709,573 L1-dcache-load-misses # 46.43% of all L1-dcache accesses ( +- 0.05% ) (35.75%) batched,32MB 2,290,365,772 L1-dcache-loads # 471.171 M/sec ( +- 0.36% ) (35.72%) 1,144,426,272 L1-dcache-load-misses # 49.97% of all L1-dcache accesses ( +- 0.58% ) (35.70%) batched,1GB 63,914,157 L1-dcache-loads # 17.464 M/sec ( +- 8.08% ) (35.73%) 22,074,367 L1-dcache-load-misses # 34.54% of all L1-dcache accesses ( +- 16.70% ) (35.70%) The dropoff is also visible in L2 prefetch hits (miss numbers are on similar lines): contiguous-pages 3,464,861,312 l2_pf_hit_l2.all # 437.722 M/sec ( +- 0.74% ) (15.69%) batched,32MB 883,750,087 l2_pf_hit_l2.all # 181.223 M/sec ( +- 1.18% ) (15.71%) batched,1GB 8,967,943 l2_pf_hit_l2.all # 2.450 M/sec ( +- 17.92% ) (15.77%) This largely decouples the frontend from the backend since the clearing operation does not need to wait on loads from memory (we still need cacheline ownership but that's a shorter path). This is most visible if we rerun the test above with (boost=1, 3.66 GHz). 
$ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5 contiguous-pages batched-pages (GBps +- %stdev) (GBps +- %stdev) pg-sz=2MB 26.08 +- 1.72% 26.13 +- 0.92% - preempt=* pg-sz=1GB 26.99 +- 0.62% 48.85 +- 2.19% + 80.99% preempt=none|voluntary pg-sz=1GB 27.69 +- 0.18% 75.18 +- 0.25% +171.50% preempt=full|lazy Comparing the batched-pages numbers from the boost=0 ones and these: for a clock-speed gain of 42% we gain 24.5% for batch-size=32MB and 42.5% for batch-size=1GB. In comparison the baseline contiguous-pages case and both the pg-sz=2MB ones are largely backend bound so gain no more than ~10%. Other platforms tested, Intel Icelakex (Oracle X9) and ARM64 Neoverse-N1 (Ampere Altra) both show an improvement of ~35% for pg-sz=2MB|1GB. The first goes from around 8GBps to 11GBps and the second from 32GBps to 44 GBPs. [ankur.a.arora@oracle.com: move the unit computation and make it a const Link: https://lkml.kernel.org/r/20260108060406.1693853-1-ankur.a.arora@oracle.com Link: https://lkml.kernel.org/r/20260107072009.1615991-8-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 35 +++++++++++++++++++++++++++++++++++ mm/memory.c | 18 +++++++++++++++--- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d78e294698b0..ab2e7e30aef9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4208,6 +4208,15 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, * mapped to user space. * * Does absolutely no exception handling. + * + * Note that even though the clearing operation is preemptible, clear_pages() + * does not (and on architectures where it reduces to a few long-running + * instructions, might not be able to) call cond_resched() to check if + * rescheduling is required. + * + * When running under preemptible models this is not a problem. Under + * cooperatively scheduled models, however, the caller is expected to + * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH. */ static inline void clear_pages(void *addr, unsigned int npages) { @@ -4218,6 +4227,32 @@ static inline void clear_pages(void *addr, unsigned int npages) } #endif +#ifndef PROCESS_PAGES_NON_PREEMPT_BATCH +#ifdef clear_pages +/* + * The architecture defines clear_pages(), and we assume that it is + * generally "fast". So choose a batch size large enough to allow the processor + * headroom for optimizing the operation and yet small enough that we see + * reasonable preemption latency for when this optimization is not possible + * (ex. slow microarchitectures, memory bandwidth saturation.) + * + * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should + * result in worst case preemption latency of around 3ms when clearing pages. + * + * (See comment above clear_pages() for why preemption latency is a concern + * here.) + */ +#define PROCESS_PAGES_NON_PREEMPT_BATCH (SZ_32M >> PAGE_SHIFT) +#else /* !clear_pages */ +/* + * The architecture does not provide a clear_pages() implementation. 
Assume + * that clear_page() -- which clear_pages() will fallback to -- is relatively + * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH. + */ +#define PROCESS_PAGES_NON_PREEMPT_BATCH 1 +#endif +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); diff --git a/mm/memory.c b/mm/memory.c index 74d663943ecb..3f6ec897c9a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7243,13 +7243,25 @@ static inline int process_huge_page( static void clear_contig_highpages(struct page *page, unsigned long addr, unsigned int nr_pages) { - unsigned int i; + unsigned int i, count; + /* + * When clearing we want to operate on the largest extent possible to + * allow for architecture specific extent based optimizations. + * + * However, since clear_user_highpages() (and primitives clear_user_pages(), + * clear_pages()), do not call cond_resched(), limit the unit size when + * running under non-preemptible scheduling models. + */ + const unsigned int unit = preempt_model_preemptible() ? + nr_pages : PROCESS_PAGES_NON_PREEMPT_BATCH; might_sleep(); - for (i = 0; i < nr_pages; i++) { + + for (i = 0; i < nr_pages; i += count) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + count = min(unit, nr_pages - i); + clear_user_highpages(page + i, addr + i * PAGE_SIZE, count); } } -- cgit v1.2.3 From 93552c9a3350fff06543da18e4c80d3e804191ca Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:09 -0800 Subject: mm: folio_zero_user: cache neighbouring pages folio_zero_user() does straight zeroing without caring about temporal locality for caches. This replaced commit c6ddfb6c5890 ("mm, clear_huge_page: move order algorithm into a separate function") where we cleared a page at a time converging to the faulting page from the left and the right. To retain limited temporal locality, split the clearing in three parts: the faulting page and its immediate neighbourhood, and the regions on its left and right. We clear the local neighbourhood last to maximize chances of it sticking around in the cache. Performance === AMD Genoa (EPYC 9J14, cpus=2 sockets * 96 cores * 2 threads, memory=2.2 TB, L1d=16K/thread, L2=512K/thread, L3=2MB/thread) vm-scalability/anon-w-seq-hugetlb: this workload runs with 384 processes (one for each CPU) each zeroing anonymously mapped hugetlb memory which is then accessed sequentially. stime utime discontiguous-page 1739.93 ( +- 6.15% ) 1016.61 ( +- 4.75% ) contiguous-page 1853.70 ( +- 2.51% ) 1187.13 ( +- 3.50% ) batched-pages 1756.75 ( +- 2.98% ) 1133.32 ( +- 4.89% ) neighbourhood-last 1725.18 ( +- 4.59% ) 1123.78 ( +- 7.38% ) Both stime and utime largely respond somewhat expectedly. There is a fair amount of run to run variation but the general trend is that the stime drops and utime increases. There are a few oddities, like contiguous-page performing very differently from batched-pages. As such this is likely an uncommon pattern where we saturate the memory bandwidth (since all CPUs are running the test) and at the same time are cache constrained because we access the entire region. Kernel make (make -j 12 bzImage): stime utime discontiguous-page 199.29 ( +- 0.63% ) 1431.67 ( +- .04% ) contiguous-page 193.76 ( +- 0.58% ) 1433.60 ( +- .05% ) batched-pages 193.92 ( +- 0.76% ) 1431.04 ( +- .08% ) neighbourhood-last 194.46 ( +- 0.68% ) 1431.51 ( +- .06% ) For make the utime stays relatively flat with a fairly small (-2.4%) improvement in the stime. 
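As a worked example of the split (radius 2, per the patch below): for a 2MB folio of 512 pages with the faulting address in page index 200, the ranges come out as

    r[0] = [203, 511]    /* right of the fault, cleared first    */
    r[1] = [  0, 197]    /* left of the fault, cleared second    */
    r[2] = [198, 202]    /* faulting neighbourhood, cleared last */

so the five pages around the fault are written most recently and are therefore the most likely to still be cache-resident when the faulting task resumes.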
Link: https://lkml.kernel.org/r/20260107072009.1615991-9-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Reviewed-by: Raghavendra K T Tested-by: Raghavendra K T Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzessutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory.c | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3f6ec897c9a6..ce933ee4a3dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7265,6 +7265,15 @@ static void clear_contig_highpages(struct page *page, unsigned long addr, } } +/* + * When zeroing a folio, we want to differentiate between pages in the + * vicinity of the faulting address where we have spatial and temporal + * locality, and those far away where we don't. + * + * Use a radius of 2 for determining the local neighbourhood. + */ +#define FOLIO_ZERO_LOCALITY_RADIUS 2 + /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. @@ -7272,10 +7281,36 @@ static void clear_contig_highpages(struct page *page, unsigned long addr, */ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { - unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); + const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); + const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE; + const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1); + const int radius = FOLIO_ZERO_LOCALITY_RADIUS; + struct range r[3]; + int i; + + /* + * Faulting page and its immediate neighbourhood. Will be cleared at the + * end to keep its cachelines hot. + */ + r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - radius, pg.start, pg.end), + clamp_t(s64, fault_idx + radius, pg.start, pg.end)); + + /* Region to the left of the fault */ + r[1] = DEFINE_RANGE(pg.start, + clamp_t(s64, r[2].start - 1, pg.start - 1, r[2].start)); - clear_contig_highpages(folio_page(folio, 0), - base_addr, folio_nr_pages(folio)); + /* Region to the right of the fault: always valid for the common fault_idx=0 case. */ + r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end + 1, r[2].end, pg.end + 1), + pg.end); + + for (i = 0; i < ARRAY_SIZE(r); i++) { + const unsigned long addr = base_addr + r[i].start * PAGE_SIZE; + const unsigned int nr_pages = range_len(&r[i]); + struct page *page = folio_page(folio, r[i].start); + + if (nr_pages > 0) + clear_contig_highpages(page, addr, nr_pages); + } } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, -- cgit v1.2.3 From cc05d5d94bda595e66cf68a90b313baff5dc20ab Mon Sep 17 00:00:00 2001 From: Swaraj Gaikwad Date: Thu, 11 Dec 2025 03:27:22 +0000 Subject: mm/damon/sysfs-schemes: remove outdated TODO in target_nid_store() The TODO comment in target_nid_store() suggested adding range validation for target_nid. As discussed in [1], the current behavior of accepting any integer value is intentional. DAMON sysfs aims to remain flexible, including supporting users who prepare node IDs before future NUMA hotplug events. Because this behavior matches the broader design philosophy of the DAMON sysfs interface, the TODO comment is now misleading. 
This patch removes the comment without introducing any behavioral change. No functional changes. Link: https://lkml.kernel.org/r/20251211032722.4928-2-swarajgaikwad1925@gmail.com Link: https://lore.kernel.org/lkml/20251210150930.57679-1-sj@kernel.org/ [1] Signed-off-by: Swaraj Gaikwad Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3a699dcd5a7f..b52fc3b45b30 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2288,7 +2288,6 @@ static ssize_t target_nid_store(struct kobject *kobj, struct damon_sysfs_scheme, kobj); int err = 0; - /* TODO: error handling for target_nid range. */ err = kstrtoint(buf, 0, &scheme->target_nid); return err ? err : count; -- cgit v1.2.3 From 85aa39197420c2eb3cfad4cdfe499bb9b18fafbd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2025 21:56:45 -0500 Subject: mm: zswap: delete unused acomp->is_sleepable This hasn't been used since 7d4c9629b74f ("mm: zswap: use object read/write APIs instead of object mapping APIs"). Drop it. Link: https://lkml.kernel.org/r/20251211025645.820517-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Chengming Zhou Reviewed-by: Anshuman Khandual Acked-by: Yosry Ahmed Acked-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index ac9b7a60736b..6bf4f2441914 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -141,7 +141,6 @@ struct crypto_acomp_ctx { struct crypto_wait wait; u8 *buffer; struct mutex mutex; - bool is_sleepable; }; /* @@ -781,7 +780,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_ctx->buffer = buffer; acomp_ctx->acomp = acomp; - acomp_ctx->is_sleepable = acomp_is_async(acomp); acomp_ctx->req = req; mutex_unlock(&acomp_ctx->mutex); return 0; -- cgit v1.2.3 From 558605a530e079a59bafe4877b06100055f7d91d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 11 Dec 2025 01:30:18 +0000 Subject: memcg: move mem_cgroup_usage memcontrol-v1.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "memcg cleanups", v3. Two code moves/removals with no behavior change. This patch (of 2): Currently, mem_cgroup_usage is only used for v1, just move it to memcontrol-v1.c Link: https://lkml.kernel.org/r/20251211013019.2080004-1-chenridong@huaweicloud.com Link: https://lkml.kernel.org/r/20251211013019.2080004-2-chenridong@huaweicloud.com Signed-off-by: Chen Ridong Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Michal Koutný Cc: Axel Rasmussen Cc: Lorenzo Stoakes Cc: Lu Jialin Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/memcontrol-v1.c | 22 ++++++++++++++++++++++ mm/memcontrol-v1.h | 2 -- mm/memcontrol.c | 22 ---------------------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 6eed14bff742..0b50cb122ff3 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -427,6 +427,28 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, } #endif +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +{ + unsigned long val; + + if (mem_cgroup_is_root(memcg)) { + /* + * Approximate root's usage from global state. 
This isn't + * perfect, but the root usage was always an approximation. + */ + val = global_node_page_state(NR_FILE_PAGES) + + global_node_page_state(NR_ANON_MAPPED); + if (swap) + val += total_swap_pages - get_nr_swap_pages(); + } else { + if (!swap) + val = page_counter_read(&memcg->memory); + else + val = page_counter_read(&memcg->memsw); + } + return val; +} + static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) { struct mem_cgroup_threshold_ary *t; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 6358464bb416..e92b21af92b1 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -22,8 +22,6 @@ iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); - void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 86f43b7e5f71..7d4b93d30eb0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3272,28 +3272,6 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order, css_get_many(&__folio_memcg(folio)->css, new_refs); } -unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) -{ - unsigned long val; - - if (mem_cgroup_is_root(memcg)) { - /* - * Approximate root's usage from global state. This isn't - * perfect, but the root usage was always an approximation. - */ - val = global_node_page_state(NR_FILE_PAGES) + - global_node_page_state(NR_ANON_MAPPED); - if (swap) - val += total_swap_pages - get_nr_swap_pages(); - } else { - if (!swap) - val = page_counter_read(&memcg->memory); - else - val = page_counter_read(&memcg->memsw); - } - return val; -} - static int memcg_online_kmem(struct mem_cgroup *memcg) { struct obj_cgroup *objcg; -- cgit v1.2.3 From 055059ed720ec7546d2bf7122d858814a9f84741 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 11 Dec 2025 01:30:19 +0000 Subject: memcg: remove mem_cgroup_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mem_cgroup_size helper is used only in apply_proportional_protection to read the current memory usage. Its semantics are unclear and inconsistent with other sites, which directly call page_counter_read for the same purpose. Remove this helper and get its usage via mem_cgroup_protection for clarity. Additionally, rename the local variable 'cgroup_size' to 'usage' to better reflect its meaning. No functional changes intended. 
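For reference, a worked example of the proportional scaling that consumes this usage value (the formula is visible in the mm/vmscan.c hunk below); the numbers are purely illustrative:

    unsigned long usage = 1000;       /* pages charged to the cgroup    */
    unsigned long protection = 600;   /* effective low/min protection   */
    unsigned long scan = 100;         /* pages we would otherwise scan  */

    scan -= scan * protection / (usage + 1);
    /* scan = 100 - (100 * 600) / 1001 = 100 - 59 = 41 pages scanned */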
Link: https://lkml.kernel.org/r/20251211013019.2080004-3-chenridong@huaweicloud.com Signed-off-by: Chen Ridong Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Michal Koutný Cc: Axel Rasmussen Cc: Lorenzo Stoakes Cc: Lu Jialin Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 18 +++++++----------- mm/memcontrol.c | 5 ----- mm/vmscan.c | 9 ++++----- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0651865a4564..25908ba30700 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -557,13 +557,15 @@ static inline bool mem_cgroup_disabled(void) static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, - unsigned long *low) + unsigned long *low, + unsigned long *usage) { - *min = *low = 0; + *min = *low = *usage = 0; if (mem_cgroup_disabled()) return; + *usage = page_counter_read(&memcg->memory); /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because @@ -919,8 +921,6 @@ static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); -unsigned long mem_cgroup_size(struct mem_cgroup *memcg); - void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -1102,9 +1102,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, - unsigned long *low) + unsigned long *low, + unsigned long *usage) { - *min = *low = 0; + *min = *low = *usage = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, @@ -1328,11 +1329,6 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return 0; } -static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) -{ - return 0; -} - static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7d4b93d30eb0..15323d5dc69b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1621,11 +1621,6 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } -unsigned long mem_cgroup_size(struct mem_cgroup *memcg) -{ - return page_counter_read(&memcg->memory); -} - void __memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event, bool allow_spinning) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 67234613fbff..1c87945fa761 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2451,9 +2451,9 @@ static inline void calculate_pressure_balance(struct scan_control *sc, static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, struct scan_control *sc, unsigned long scan) { - unsigned long min, low; + unsigned long min, low, usage; - mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); + mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low, &usage); if (min || low) { /* @@ -2485,7 +2485,6 @@ static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, * again by how much of the total memory used is under * hard protection. 
*/ - unsigned long cgroup_size = mem_cgroup_size(memcg); unsigned long protection; /* memory.low scaling, make sure we retry before OOM */ @@ -2497,9 +2496,9 @@ static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, } /* Avoid TOCTOU with earlier protection check */ - cgroup_size = max(cgroup_size, protection); + usage = max(usage, protection); - scan -= scan * protection / (cgroup_size + 1); + scan -= scan * protection / (usage + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep -- cgit v1.2.3 From 16cc8b9396f6d63c1331059d67626cf907a7f23c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2025 10:43:01 -0500 Subject: mm: memcontrol: rename mem_cgroup_from_slab_obj() In addition to slab objects, this function is used for resolving non-slab kernel pointers. This has caused confusion in recent refactoring work. Rename it to mem_cgroup_from_virt(), sticking with terminology established by the virt_to_() converters. Link: https://lore.kernel.org/linux-mm/20251113161424.GB3465062@cmpxchg.org/ Link: https://lkml.kernel.org/r/20251210154301.720133-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Reviewed-by: Anshuman Khandual Acked-by: Vlastimil Babka Acked-by: Shakeel Butt Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/list_lru.c | 4 ++-- mm/memcontrol.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25908ba30700..fd400082313a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1723,7 +1723,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); +struct mem_cgroup *mem_cgroup_from_virt(void *p); static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, @@ -1795,7 +1795,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return -1; } -static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +static inline struct mem_cgroup *mem_cgroup_from_virt(void *p) { return NULL; } diff --git a/mm/list_lru.c b/mm/list_lru.c index ec48b5dadf51..37b642f6cbda 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -187,7 +187,7 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) if (list_lru_memcg_aware(lru)) { rcu_read_lock(); - ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); + ret = list_lru_add(lru, item, nid, mem_cgroup_from_virt(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); @@ -224,7 +224,7 @@ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) if (list_lru_memcg_aware(lru)) { rcu_read_lock(); - ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); + ret = list_lru_del(lru, item, nid, mem_cgroup_from_virt(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15323d5dc69b..a01d3e6c157d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -806,7 +806,7 @@ void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) struct lruvec *lruvec; rcu_read_lock(); - memcg = mem_cgroup_from_slab_obj(p); + memcg = mem_cgroup_from_virt(p); /* * Untracked pages have no memcg, no lruvec. 
Update only the @@ -2614,7 +2614,7 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. */ -struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +struct mem_cgroup *mem_cgroup_from_virt(void *p) { struct slab *slab; -- cgit v1.2.3 From 6e4930e33329eec80dd245f28b52202271f5fb28 Mon Sep 17 00:00:00 2001 From: Enze Li Date: Wed, 10 Dec 2025 13:25:08 +0800 Subject: mm/damon/core: fix wasteful CPU calls by skipping non-existent targets Currently, DAMON does not proactively clean up invalid monitoring targets during its runtime. When some monitored processes exit, DAMON continues to make the following unnecessary function calls, --damon_for_each_target-- --damon_for_each_region-- damon_do_apply_schemes damos_apply_scheme damon_va_apply_scheme damos_madvise damon_get_mm it is only in the damon_get_mm() function that it may finally discover the target no longer exists, which wastes CPU resources. A simple idea is to check for the existence of monitoring targets within the kdamond_need_stop() function and promptly clean up non-existent targets. However, SJ pointed out that this approach is problematic because the online commit logic incorrectly uses list indices to update the monitoring state. This can lead to data loss if the target list is changed concurrently. Meanwhile, SJ suggests checking for target existence at the damon_for_each_target level, and if a target does not exist, simply skip it and proceed to the next one. Link: https://lkml.kernel.org/r/20251210052508.264433-1-lienze@kylinos.cn Signed-off-by: Enze Li Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index c852cac4f82e..2379a07c2f87 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2299,6 +2299,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c) mutex_lock(&c->walk_control_lock); damon_for_each_target(t, c) { + if (c->ops.target_valid && c->ops.target_valid(t) == false) + continue; + damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); } -- cgit v1.2.3 From 9f5edd785da3cf373285259928d7f1f08c9ce758 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Tue, 9 Dec 2025 08:47:45 +0530 Subject: tools/mm/thp_swap_allocator_test: fix small folio alignment Use ALIGNMENT_SMALLFOLIO instead of ALIGNMENT_MTHP when allocating small folios to ensure correct memory alignment for the test case. Before: test allocates small folios with 64KB alignment (ALIGNMENT_MTHP) when only 4KB alignment (ALIGNMENT_SMALLFOLIO) is needed. This wastes address space and may cause allocation failures on systems with fragmented memory. Worst-case impact: this only affects thp_swap_allocator_test tool behavior. 
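For illustration, a minimal standalone sketch of the aligned allocation in question; the constant values and the posix_memalign()-based helper body are assumptions for this sketch, not the tool's exact definitions:

  #define _POSIX_C_SOURCE 200112L
  #include <stdio.h>
  #include <stdlib.h>

  /* Assumed values; the tool defines its own ALIGNMENT_* constants. */
  #define ALIGNMENT_SMALLFOLIO   (4UL * 1024)    /* one base page */
  #define ALIGNMENT_MTHP         (64UL * 1024)   /* one 64KB mTHP */

  static void *aligned_alloc_mem(size_t size, size_t alignment)
  {
          void *p;

          if (posix_memalign(&p, alignment, size))
                  return NULL;
          return p;
  }

  int main(void)
  {
          /* Small folios only need base-page alignment. */
          void *mem2 = aligned_alloc_mem(16 * 4096, ALIGNMENT_SMALLFOLIO);

          if (!mem2) {
                  fprintf(stderr, "Failed to allocate small folios memory\n");
                  return 1;
          }
          free(mem2);
          return 0;
  }
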
Link: https://lkml.kernel.org/r/20251209031745.2723120-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- tools/mm/thp_swap_allocator_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mm/thp_swap_allocator_test.c b/tools/mm/thp_swap_allocator_test.c index 83afc52275a5..d4434df3dcff 100644 --- a/tools/mm/thp_swap_allocator_test.c +++ b/tools/mm/thp_swap_allocator_test.c @@ -142,7 +142,7 @@ int main(int argc, char *argv[]) } if (use_small_folio) { - mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP); + mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_SMALLFOLIO); if (mem2 == NULL) { fprintf(stderr, "Failed to allocate small folios memory\n"); free(mem1); -- cgit v1.2.3 From 8b8017d7c411403731ee4d502cdbd76e9425f0e1 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Mon, 8 Dec 2025 16:22:40 +0530 Subject: tools/mm/slabinfo: fix --partial long option mapping The long option "--partial" was incorrectly mapped to lowercase 'p' in the opts[] array, but the getopt string and switch case handle uppercase 'P'. This mismatch caused --partial to be rejected. Fix the long_options mapping to use 'P' so --partial works correctly alongside the existing -P short option. Link: https://lkml.kernel.org/r/20251208105240.2719773-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Reviewed-by: SeongJae Park Tested-by: SeongJae Park Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/mm/slabinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c index 80cdbd3db82d..54c7265ab52d 100644 --- a/tools/mm/slabinfo.c +++ b/tools/mm/slabinfo.c @@ -1405,7 +1405,7 @@ struct option opts[] = { { "numa", no_argument, NULL, 'n' }, { "lines", required_argument, NULL, 'N'}, { "ops", no_argument, NULL, 'o' }, - { "partial", no_argument, NULL, 'p'}, + { "partial", no_argument, NULL, 'P'}, { "report", no_argument, NULL, 'r' }, { "shrink", no_argument, NULL, 's' }, { "Size", no_argument, NULL, 'S'}, -- cgit v1.2.3 From 4a6ceb7c9744c69546d4ca43b7bd308f4db0927b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:14 -0800 Subject: mm/damon/core: introduce nr_snapshots damos stat Patch series "mm/damon: introduce {,max_}nr_snapshots and tracepoint for damos stats". Introduce three changes for improving DAMOS stat's provided information, deterministic control, and reading usability. DAMOS provides stats that are important for understanding its behavior. It lacks information about how many DAMON-generated monitoring output snapshots it has worked on. Add a new stat, nr_snapshots, to show the information. Users can control DAMOS schemes in multiple ways. Using the online parameters commit feature, they can install and uninstall DAMOS schemes whenever they want while keeping DAMON runs. DAMOS quotas and watermarks can be used for manually or automatically turning on/off or adjusting the aggressiveness of the scheme. DAMOS filters can be used for applying the scheme to specific memory entities based on their types and locations. Some users want their DAMOS scheme to be applied to only specific number of DAMON snapshots, for more deterministic control. One example use case is tracepoint based snapshot reading. Add a new knob, max_nr_snapshots, to support this. If the nr_snapshots parameter becomes same to or greater than the value of this parameter, the scheme is deactivated. 
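A reduced userspace model of that deactivation rule follows (the real check is the damon_do_apply_schemes() hunk added by patch 7 of this series; only the two fields involved are modeled here):

  #include <stdbool.h>
  #include <stdio.h>

  struct damos_model {
          unsigned long nr_snapshots;      /* stat, grows over time */
          unsigned long max_nr_snapshots;  /* 0 means "no limit" */
  };

  /* Scheme is deactivated once the stat reaches the non-zero limit. */
  static bool damos_deactivated(const struct damos_model *s)
  {
          return s->max_nr_snapshots &&
                 s->max_nr_snapshots <= s->nr_snapshots;
  }

  int main(void)
  {
          struct damos_model s = { .nr_snapshots = 3, .max_nr_snapshots = 3 };

          printf("deactivated: %d\n", damos_deactivated(&s));
          return 0;
  }
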
Users can read DAMOS stats via DAMON's sysfs interface. For deep level investigations on environments having advanced tools like perf and bpftrace, exposing the stats via a tracepoint can be useful. Implement a new tracepoint, namely damon:damos_stat_after_apply_interval. First five patches (patches 1-5) of this series implement the new stat, nr_snapshots, on the core layer (patch 1), expose on DAMON sysfs user interface (patch 2), and update documents (patches 3-5). Following six patches (patches 6-11) are for the new stat based DAMOS deactivation (max_nr_snapshots). The first one (patch 6) of this group updates a kernel-doc comment before making further changes. Then an implementation of it on the core layer (patch 7), an introduction of a new DAMON sysfs interface file for users of the feature (patch 8), and three updates of the documents (patches 9-11) follow. The final one (patch 12) introduces the new tracepoint that exposes the DAMOS stat values for each scheme apply interval. This patch (of 12): DAMON generates monitoring results snapshots for every sampling interval. DAMOS applies given schemes on the regions of the snapshots, for every apply interval of the scheme. DAMOS stat informs a given scheme has tried to how many memory entities and applied, in the region and byte level. In some use cases including user-space oriented tuning and investigations, it is useful to know that in the DAMON-snapshot level. Introduce a new stat, namely nr_snapshots for DAMON core API callers. [sj@kernel.org: fix wrong list_is_last() call in damons_is_last_region()] Link: https://lkml.kernel.org/r/20260114152049.99727-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251216080128.42991-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251216080128.42991-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ mm/damon/core.c | 13 ++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3813373a9200..1d8a1515e75a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -330,6 +330,8 @@ struct damos_watermarks { * @sz_ops_filter_passed: * Total bytes that passed ops layer-handled DAMOS filters. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + * @nr_snapshots: + * Total number of DAMON snapshots that the scheme has tried. * * "Tried an action to a region" in this context means the DAMOS core logic * determined the region as eligible to apply the action. 
The access pattern @@ -355,6 +357,7 @@ struct damos_stat { unsigned long sz_applied; unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; + unsigned long nr_snapshots; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index 2379a07c2f87..9d5be7e9b8e0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -157,6 +157,12 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +static bool damon_is_last_region(struct damon_region *r, + struct damon_target *t) +{ + return list_is_last(&r->list, &t->regions_list); +} + /* * Check whether a region is intersecting an address range * @@ -1978,10 +1984,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; - if (!damos_valid_target(c, t, r, s)) - continue; + if (damos_valid_target(c, t, r, s)) + damos_apply_scheme(c, t, r, s); - damos_apply_scheme(c, t, r, s); + if (damon_is_last_region(r, t)) + s->stat.nr_snapshots++; } } -- cgit v1.2.3 From 83a741b9742505bda70e6cd82dc5cb32baf5e16c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:15 -0800 Subject: mm/damon/sysfs-schemes: introduce nr_snapshots damos stat file Introduce a new DAMON sysfs interface file for exposing the newly added DAMOS stat, nr_snapshots. The file has the name same to the stat name (nr_snapshots) and placed under the damos stat sysfs directory. Link: https://lkml.kernel.org/r/20251216080128.42991-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b52fc3b45b30..b7fcd7590ab3 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -204,6 +204,7 @@ struct damon_sysfs_stats { unsigned long sz_applied; unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; + unsigned long nr_snapshots; }; static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) @@ -265,6 +266,15 @@ static ssize_t qt_exceeds_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); } +static ssize_t nr_snapshots_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_snapshots); +} + static void damon_sysfs_stats_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); @@ -288,6 +298,9 @@ static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr = static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = __ATTR_RO_MODE(qt_exceeds, 0400); +static struct kobj_attribute damon_sysfs_stats_nr_snapshots_attr = + __ATTR_RO_MODE(nr_snapshots, 0400); + static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_nr_tried_attr.attr, &damon_sysfs_stats_sz_tried_attr.attr, @@ -295,6 +308,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_sz_applied_attr.attr, &damon_sysfs_stats_sz_ops_filter_passed_attr.attr, &damon_sysfs_stats_qt_exceeds_attr.attr, + &damon_sysfs_stats_nr_snapshots_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_stats); @@ -2762,6 +2776,7 @@ void damon_sysfs_schemes_update_stats( 
sysfs_stats->sz_ops_filter_passed = scheme->stat.sz_ops_filter_passed; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + sysfs_stats->nr_snapshots = scheme->stat.nr_snapshots; } } -- cgit v1.2.3 From ee7f5d193358a6e8624a17cef78c508635f9b9b6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:16 -0800 Subject: Docs/mm/damon/design: update for nr_snapshots damos stat Update DAMON design document for the newly added damos stat, nr_snapshots. Link: https://lkml.kernel.org/r/20251216080128.42991-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 2d8d8ca1e0a3..5cc7b7d662be 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -718,6 +718,8 @@ scheme's execution. - ``nr_applied``: Total number of regions that the scheme is applied. - ``sz_applied``: Total size of regions that the scheme is applied. - ``qt_exceeds``: Total number of times the quota of the scheme has exceeded. +- ``nr_snapshots``: Total number of DAMON snapshots that the scheme is tried to + be applied. "A scheme is tried to be applied to a region" means DAMOS core logic determined the region is eligible to apply the scheme's :ref:`action -- cgit v1.2.3 From 0b43f89e2d9a18d1a2373064f70bc730180b70f2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:17 -0800 Subject: Docs/admin-guide/mm/damon/usage: update for nr_snapshots damos stat Update DAMON usage document for the newly added damos stat, nr_snapshots. Link: https://lkml.kernel.org/r/20251216080128.42991-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 9991dad60fcf..d0944bd78964 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -87,7 +87,7 @@ comma (","). │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx,min,max │ │ │ │ │ │ │ :ref:`dests `/nr_dests │ │ │ │ │ │ │ │ 0/id,weight - │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds + │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds,nr_snapshots │ │ │ │ │ │ │ :ref:`tried_regions `/total_bytes │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed │ │ │ │ │ │ │ │ ... @@ -543,9 +543,9 @@ online analysis or tuning of the schemes. Refer to :ref:`design doc The statistics can be retrieved by reading the files under ``stats`` directory (``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, -``sz_ops_filter_passed``, and ``qt_exceeds``), respectively. 
The files are not -updated in real time, so you should ask DAMON sysfs interface to update the -content of the files for the stats by writing a special keyword, +``sz_ops_filter_passed``, ``qt_exceeds`` and ``nr_snapshots``), respectively. +The files are not updated in real time, so you should ask DAMON sysfs interface +to update the content of the files for the stats by writing a special keyword, ``update_schemes_stats`` to the relevant ``kdamonds//state`` file. .. _sysfs_schemes_tried_regions: -- cgit v1.2.3 From 55221e53f73e41b4ebac575ed1333f50488a7ba4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:18 -0800 Subject: Docs/ABI/damon: update for nr_snapshots damos stat Update DAMON ABI document for the newly added damos stat, nr_snapshots. Link: https://lkml.kernel.org/r/20251216080128.42991-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 4fb8b7a6d625..7571aa78b7bb 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -516,6 +516,12 @@ Contact: SeongJae Park Description: Reading this file returns the number of the exceed events of the scheme's quotas. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_snapshots +Date: Dec 2025 +Contact: SeongJae Park +Description: Reading this file returns the total number of DAMON snapshots + that the scheme has tried to be applied. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions/total_bytes Date: Jul 2023 Contact: SeongJae Park -- cgit v1.2.3 From ccaa2d062a35add92832e8f082b8e00eed3f6efd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:19 -0800 Subject: mm/damon: update damos kerneldoc for stat field Commit 0e92c2ee9f45 ("mm/damon/schemes: account scheme actions that successfully applied") has replaced ->stat_count and ->stat_sz of 'struct damos' with ->stat. The commit mistakenly did not update the related kernel doc comment, though. Update the comment. Link: https://lkml.kernel.org/r/20251216080128.42991-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1d8a1515e75a..43dfbfe2292f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -532,9 +532,7 @@ struct damos_migrate_dests { * unsets @last_applied when each regions walking for applying the scheme is * finished. * - * After applying the &action to each region, &stat_count and &stat_sz is - * updated to reflect the number of regions and total size of regions that the - * &action is applied. + * After applying the &action to each region, &stat is updated. 
*/ struct damos { struct damos_access_pattern pattern; -- cgit v1.2.3 From 84e425c68e6061751adecd2d328789e4f67eac1e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:20 -0800 Subject: mm/damon/core: implement max_nr_snapshots There are DAMOS use cases that require user-space centric control of its activation and deactivation. Having the control plane on the user-space, or using DAMOS as a way for monitoring results collection are such examples. DAMON parameters online commit, DAMOS quotas and watermarks can be useful for this purpose. However, those features work only at the sub-DAMON-snapshot level. In some use cases, the DAMON-snapshot level control is required. For example, in DAMOS-based monitoring results collection use case, the user online-installs a DAMOS scheme with DAMOS_STAT action, wait it be applied to whole regions of a single DAMON-snapshot, retrieves the stats and tried regions information, and online-uninstall the scheme. It is efficient to ensure the lifetime of the scheme as no more no less one snapshot consumption. To support such use cases, introduce a new DAMOS core API per-scheme parameter, namely max_nr_snapshots. As the name implies, it is the upper limit of nr_snapshots, which is a DAMOS stat that represents the number of DAMON-snapshots that the scheme has fully applied. If the limit is set with a non-zero value and nr_snapshots reaches or exceeds the limit, the scheme is deactivated. Link: https://lkml.kernel.org/r/20251216080128.42991-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 43dfbfe2292f..a67292a2f09d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -499,6 +499,7 @@ struct damos_migrate_dests { * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. + * @max_nr_snapshots: Upper limit of nr_snapshots stat. * @list: List head for siblings. * * For each @apply_interval_us, DAMON finds regions which fit in the @@ -533,6 +534,9 @@ struct damos_migrate_dests { * finished. * * After applying the &action to each region, &stat is updated. + * + * If &max_nr_snapshots is set as non-zero and &stat.nr_snapshots be same to or + * greater than it, the scheme is deactivated. 
*/ struct damos { struct damos_access_pattern pattern; @@ -567,6 +571,7 @@ struct damos { struct list_head ops_filters; void *last_applied; struct damos_stat stat; + unsigned long max_nr_snapshots; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 9d5be7e9b8e0..344773f53f64 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -401,6 +401,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, INIT_LIST_HEAD(&scheme->core_filters); INIT_LIST_HEAD(&scheme->ops_filters); scheme->stat = (struct damos_stat){}; + scheme->max_nr_snapshots = 0; INIT_LIST_HEAD(&scheme->list); scheme->quota = *(damos_quota_init(quota)); @@ -1078,7 +1079,11 @@ static int damos_commit(struct damos *dst, struct damos *src) return err; err = damos_commit_filters(dst, src); - return err; + if (err) + return err; + + dst->max_nr_snapshots = src->max_nr_snapshots; + return 0; } static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src) @@ -1984,6 +1989,10 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; + if (s->max_nr_snapshots && + s->max_nr_snapshots <= s->stat.nr_snapshots) + continue; + if (damos_valid_target(c, t, r, s)) damos_apply_scheme(c, t, r, s); -- cgit v1.2.3 From 204ab9ab9310cd79c710037992e6ad681c8fa6b9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:21 -0800 Subject: mm/damon/sysfs-schemes: implement max_nr_snapshots file Add a new DAMON sysfs file for setting and getting the newly introduced per-DAMON-snapshot level DAMOS deactivation control parameter, max_nr_snapshots. The file has a name same to the parameter and placed under the damos stat directory. Link: https://lkml.kernel.org/r/20251216080128.42991-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b7fcd7590ab3..19bc2288cd68 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -205,6 +205,7 @@ struct damon_sysfs_stats { unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; unsigned long nr_snapshots; + unsigned long max_nr_snapshots; }; static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) @@ -275,6 +276,28 @@ static ssize_t nr_snapshots_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", stats->nr_snapshots); } +static ssize_t max_nr_snapshots_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->max_nr_snapshots); +} + +static ssize_t max_nr_snapshots_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + unsigned long max_nr_snapshots, err = kstrtoul(buf, 0, &max_nr_snapshots); + + if (err) + return err; + stats->max_nr_snapshots = max_nr_snapshots; + return count; +} + static void damon_sysfs_stats_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); @@ -301,6 +324,9 @@ static struct kobj_attribute 
damon_sysfs_stats_qt_exceeds_attr = static struct kobj_attribute damon_sysfs_stats_nr_snapshots_attr = __ATTR_RO_MODE(nr_snapshots, 0400); +static struct kobj_attribute damon_sysfs_stats_max_nr_snapshots_attr = + __ATTR_RW_MODE(max_nr_snapshots, 0600); + static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_nr_tried_attr.attr, &damon_sysfs_stats_sz_tried_attr.attr, @@ -309,6 +335,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_sz_ops_filter_passed_attr.attr, &damon_sysfs_stats_qt_exceeds_attr.attr, &damon_sysfs_stats_nr_snapshots_attr.attr, + &damon_sysfs_stats_max_nr_snapshots_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_stats); @@ -2732,6 +2759,7 @@ static struct damos *damon_sysfs_mk_scheme( damon_destroy_scheme(scheme); return NULL; } + scheme->max_nr_snapshots = sysfs_scheme->stats->max_nr_snapshots; return scheme; } -- cgit v1.2.3 From 64aa87f03da9165c45534695da42d9e87ada7544 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:22 -0800 Subject: Docs/mm/damon/design: update for max_nr_snapshots Update DAMON design document for the newly added snapshot level DAMOS deactivation feature, max_nr_snapshots. Link: https://lkml.kernel.org/r/20251216080128.42991-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 5cc7b7d662be..7fd819b8bbf7 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -720,6 +720,7 @@ scheme's execution. - ``qt_exceeds``: Total number of times the quota of the scheme has exceeded. - ``nr_snapshots``: Total number of DAMON snapshots that the scheme is tried to be applied. +- ``max_nr_snapshots``: Upper limit of ``nr_snapshots``. "A scheme is tried to be applied to a region" means DAMOS core logic determined the region is eligible to apply the scheme's :ref:`action @@ -741,6 +742,10 @@ to exclude anonymous pages and the region has only anonymous pages, or if the action is ``pageout`` while all pages of the region are unreclaimable, applying the action to the region will fail. +Unlike normal stats, ``max_nr_snapshots`` is set by users. If it is set as +non-zero and ``nr_snapshots`` be same to or greater than ``nr_snapshots``, the +scheme is deactivated. + To know how user-space can read the stats via :ref:`DAMON sysfs interface `, refer to :ref:s`stats ` part of the documentation. -- cgit v1.2.3 From 2584dd7496c53135287d3a4b2e0699fe386df015 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:23 -0800 Subject: Docs/admin-guide/mm/damon/usage: update for max_nr_snapshots Update DAMON usage document for the newly added DAMON sysfs interface file, max_nr_snapshots. 
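As a usage illustration, a small userspace sketch that refreshes and then reads the new stat through sysfs; the kdamond/context/scheme indices (all 0 here) are assumptions for the sketch, and the paths follow the ABI entries added by this series:

  #include <stdio.h>

  #define STATE_FILE "/sys/kernel/mm/damon/admin/kdamonds/0/state"
  #define STAT_FILE  "/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/stats/nr_snapshots"

  int main(void)
  {
          char buf[64];
          FILE *f;

          /* Stats are not updated in real time; ask for a refresh first. */
          f = fopen(STATE_FILE, "w");
          if (!f)
                  return 1;
          fputs("update_schemes_stats\n", f);
          fclose(f);

          f = fopen(STAT_FILE, "r");
          if (!f)
                  return 1;
          if (fgets(buf, sizeof(buf), f))
                  printf("nr_snapshots: %s", buf);
          fclose(f);
          return 0;
  }
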
Link: https://lkml.kernel.org/r/20251216080128.42991-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index d0944bd78964..7da4c002cb39 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -87,7 +87,7 @@ comma (","). │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx,min,max │ │ │ │ │ │ │ :ref:`dests `/nr_dests │ │ │ │ │ │ │ │ 0/id,weight - │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds,nr_snapshots + │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds,nr_snapshots,max_nr_snapshots │ │ │ │ │ │ │ :ref:`tried_regions `/total_bytes │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed │ │ │ │ │ │ │ │ ... @@ -543,10 +543,11 @@ online analysis or tuning of the schemes. Refer to :ref:`design doc The statistics can be retrieved by reading the files under ``stats`` directory (``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, -``sz_ops_filter_passed``, ``qt_exceeds`` and ``nr_snapshots``), respectively. -The files are not updated in real time, so you should ask DAMON sysfs interface -to update the content of the files for the stats by writing a special keyword, -``update_schemes_stats`` to the relevant ``kdamonds//state`` file. +``sz_ops_filter_passed``, ``qt_exceeds``, ``nr_snapshots`` and +``max_nr_snapshots``), respectively. The files are not updated in real time, +so you should ask DAMON sysfs interface to update the content of the files for +the stats by writing a special keyword, ``update_schemes_stats`` to the +relevant ``kdamonds//state`` file. .. _sysfs_schemes_tried_regions: -- cgit v1.2.3 From dcecf9e58b976dd848a06c667d92d2566f9384aa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:24 -0800 Subject: Docs/ABI/damon: update for max_nr_snapshots Update DAMON ABI document for the newly added DAMON sysfs interface file, max_nr_snapshots. Link: https://lkml.kernel.org/r/20251216080128.42991-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 7571aa78b7bb..f2af2ddedd32 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -522,6 +522,13 @@ Contact: SeongJae Park Description: Reading this file returns the total number of DAMON snapshots that the scheme has tried to be applied. 
+What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/max_nr_snapshots +Date: Dec 2025 +Contact: SeongJae Park +Description: Writing a number to this file sets the upper limit of + nr_snapshots that deactivates the scheme when the limit is + reached or exceeded. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions/total_bytes Date: Jul 2023 Contact: SeongJae Park -- cgit v1.2.3 From 804c26b961da295bd70c86a3c9dc4bea0b09de88 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Dec 2025 00:01:25 -0800 Subject: mm/damon/core: add trace point for damos stat per apply interval DAMON users can read DAMOS stats via DAMON sysfs interface. It enables efficient, simple and flexible usages of the stats. Especially for systems not having advanced tools like perf or bpftrace, that can be useful. But if the advanced tools are available, exposing the stats via tracepoint can reduce unnecessary reimplementation of the wheels. Add a new tracepoint for DAMOS stats, namely damos_stat_after_apply_interval. The tracepoint is triggered for each scheme's apply interval and exposes the whole stat values. If the user needs sub-apply interval information for any chance, damos_before_apply tracepoint could be used. Link: https://lkml.kernel.org/r/20251216080128.42991-13-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Steven Rostedt (Google) Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 41 +++++++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 17 +++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 852d725afea2..24fc402ab3c8 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -9,6 +9,47 @@ #include #include +TRACE_EVENT(damos_stat_after_apply_interval, + + TP_PROTO(unsigned int context_idx, unsigned int scheme_idx, + struct damos_stat *stat), + + TP_ARGS(context_idx, scheme_idx, stat), + + TP_STRUCT__entry( + __field(unsigned int, context_idx) + __field(unsigned int, scheme_idx) + __field(unsigned long, nr_tried) + __field(unsigned long, sz_tried) + __field(unsigned long, nr_applied) + __field(unsigned long, sz_applied) + __field(unsigned long, sz_ops_filter_passed) + __field(unsigned long, qt_exceeds) + __field(unsigned long, nr_snapshots) + ), + + TP_fast_assign( + __entry->context_idx = context_idx; + __entry->scheme_idx = scheme_idx; + __entry->nr_tried = stat->nr_tried; + __entry->sz_tried = stat->sz_tried; + __entry->nr_applied = stat->nr_applied; + __entry->sz_applied = stat->sz_applied; + __entry->sz_ops_filter_passed = stat->sz_ops_filter_passed; + __entry->qt_exceeds = stat->qt_exceeds; + __entry->nr_snapshots = stat->nr_snapshots; + ), + + TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu " + "nr_applied=%lu sz_tried=%lu sz_ops_filter_passed=%lu " + "qt_exceeds=%lu nr_snapshots=%lu", + __entry->context_idx, __entry->scheme_idx, + __entry->nr_tried, __entry->sz_tried, + __entry->nr_applied, __entry->sz_applied, + __entry->sz_ops_filter_passed, __entry->qt_exceeds, + __entry->nr_snapshots) +); + TRACE_EVENT(damos_esz, TP_PROTO(unsigned int context_idx, unsigned int scheme_idx, diff --git a/mm/damon/core.c b/mm/damon/core.c index 344773f53f64..f4d83e12ba0e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2289,6 +2289,22 @@ 
static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) quota->min_score = score; } +static void damos_trace_stat(struct damon_ctx *c, struct damos *s) +{ + unsigned int cidx = 0, sidx = 0; + struct damos *siter; + + if (!trace_damos_stat_after_apply_interval_enabled()) + return; + + damon_for_each_scheme(siter, c) { + if (siter == s) + break; + sidx++; + } + trace_damos_stat_after_apply_interval(cidx, sidx, &s->stat); +} + static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; @@ -2330,6 +2346,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; s->last_applied = NULL; + damos_trace_stat(c, s); } mutex_unlock(&c->walk_control_lock); } -- cgit v1.2.3 From 9082f24bd3b700bfc98a24baf794cc7af8f6bcd0 Mon Sep 17 00:00:00 2001 From: JaeJoon Jung Date: Mon, 15 Dec 2025 23:34:38 -0800 Subject: mm/damon/stat: deduplicate intervals_goal setup in damon_stat_build_ctx() The damon_stat_build_ctx() function sets the values of intervals_goal structure members. These values are applied to damon_ctx in damon_set_attrs(). However, It is resetting the values that were already applied previously to the same values. I suggest removing this code as it constitutes duplicate execution. Link: https://patch.msgid.link/20251206011716.7185-1-rgbi3307@gmail.com Link: https://lkml.kernel.org/r/20251216073440.40891-1-sj@kernel.org Signed-off-by: JaeJoon Jung Reviewed-by: Enze Li Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/stat.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/damon/stat.c b/mm/damon/stat.c index ed8e3629d31a..ef0a1195a584 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -173,14 +173,6 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (damon_set_attrs(ctx, &attrs)) goto free_out; - /* - * auto-tune sampling and aggregation interval aiming 4% DAMON-observed - * accesses ratio, keeping sampling interval in [5ms, 10s] range. - */ - ctx->attrs.intervals_goal = (struct damon_intervals_goal) { - .access_bp = 400, .aggrs = 3, - .min_sample_us = 5000, .max_sample_us = 10000000, - }; if (damon_select_ops(ctx, DAMON_OPS_PADDR)) goto free_out; -- cgit v1.2.3 From 657a81fe3b41bd58c63e15ae282f992dda5c8eee Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 16 Dec 2025 16:13:42 +0900 Subject: zram: drop pp_in_progress pp_in_progress makes sure that only one post-processing (writeback or recomrpession) is active at any given time. Functionality wise it, basically, shadows zram init_lock, when init_lock is acquired in writer mode. Switch recompress_store() and writeback_store() to take zram init_lock in writer mode, like all store() sysfs handlers should do, so that we can drop pp_in_progress. Recompression and writeback can be somewhat slow, so holding init_lock in writer mode can block zram attrs reads, but in reality the only zram attrs reads that take place are mm_stat reads, and usually it's the same process that reads mm_stat and does recompression or writeback. 
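As a rough userspace analogy of the locking change (a pthread rwlock standing in for zram's init_lock; this is not the driver code), taking the lock in writer mode already provides the mutual exclusion that pp_in_progress used to enforce:

  #include <pthread.h>
  #include <stdio.h>

  static pthread_rwlock_t init_lock = PTHREAD_RWLOCK_INITIALIZER;

  static int post_process(const char *what)
  {
          /* Analogous to guard(rwsem_write)(&zram->init_lock). */
          pthread_rwlock_wrlock(&init_lock);
          printf("%s runs exclusively\n", what);
          /* ... writeback or recompression work ... */
          pthread_rwlock_unlock(&init_lock);
          return 0;
  }

  int main(void)
  {
          post_process("writeback");
          post_process("recompression");
          return 0;
  }
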
Link: https://lkml.kernel.org/r/20251216071342.687993-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Greg Kroah-Hartman Cc: Brian Geffon Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 32 ++++++++------------------------ drivers/block/zram/zram_drv.h | 1 - 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7dcfc71d2cac..ed717b65f0a9 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -902,7 +902,7 @@ release_wb_ctl: static void zram_account_writeback_rollback(struct zram *zram) { - lockdep_assert_held_read(&zram->init_lock); + lockdep_assert_held_write(&zram->init_lock); if (zram->wb_limit_enable) zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12); @@ -910,7 +910,7 @@ static void zram_account_writeback_rollback(struct zram *zram) static void zram_account_writeback_submit(struct zram *zram) { - lockdep_assert_held_read(&zram->init_lock); + lockdep_assert_held_write(&zram->init_lock); if (zram->wb_limit_enable && zram->bd_wb_limit > 0) zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); @@ -1264,24 +1264,16 @@ static ssize_t writeback_store(struct device *dev, ssize_t ret = len; int err, mode = 0; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); if (!init_done(zram)) return -EINVAL; - /* Do not permit concurrent post-processing actions. */ - if (atomic_xchg(&zram->pp_in_progress, 1)) - return -EAGAIN; - - if (!zram->backing_dev) { - ret = -ENODEV; - goto out; - } + if (!zram->backing_dev) + return -ENODEV; pp_ctl = init_pp_ctl(); - if (!pp_ctl) { - ret = -ENOMEM; - goto out; - } + if (!pp_ctl) + return -ENOMEM; wb_ctl = init_wb_ctl(zram); if (!wb_ctl) { @@ -1358,7 +1350,6 @@ static ssize_t writeback_store(struct device *dev, out: release_pp_ctl(zram, pp_ctl); release_wb_ctl(wb_ctl); - atomic_set(&zram->pp_in_progress, 0); return ret; } @@ -2619,14 +2610,10 @@ static ssize_t recompress_store(struct device *dev, if (threshold >= huge_class_size) return -EINVAL; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_write)(&zram->init_lock); if (!init_done(zram)) return -EINVAL; - /* Do not permit concurrent post-processing actions. 
*/ - if (atomic_xchg(&zram->pp_in_progress, 1)) - return -EAGAIN; - if (algo) { bool found = false; @@ -2697,7 +2684,6 @@ out: if (page) __free_page(page); release_pp_ctl(zram, ctl); - atomic_set(&zram->pp_in_progress, 0); return ret; } #endif @@ -2888,7 +2874,6 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; zram_destroy_comps(zram); memset(&zram->stats, 0, sizeof(zram->stats)); - atomic_set(&zram->pp_in_progress, 0); reset_bdev(zram); comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); @@ -3124,7 +3109,6 @@ static int zram_add(void) zram->disk->fops = &zram_devops; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); - atomic_set(&zram->pp_in_progress, 0); zram_comp_params_reset(zram); comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 48d6861c6647..469a3dab44ad 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -143,6 +143,5 @@ struct zram { #ifdef CONFIG_ZRAM_MEMORY_TRACKING struct dentry *debugfs_dir; #endif - atomic_t pp_in_progress; }; #endif -- cgit v1.2.3 From 64dd89ae01f2708a508e028c28b7906e4702a9a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 15 Dec 2025 12:57:53 -0500 Subject: mm/block/fs: remove laptop_mode Laptop mode was introduced to save battery, by delaying and consolidating writes and thereby maximize the time rotating hard drives wouldn't have to spin. Luckily, rotating hard drives, with their high spin-up times and power draw, are a thing of the past for battery-powered devices. Reclaim has also since changed to not write single filesystem pages anymore, and regular filesystem writeback is lumpy by design. The juice doesn't appear worth the squeeze anymore. The footprint of the feature is small, but nevertheless it's a complicating factor in mm, block, filesystems. Developers don't think about it, and it likely hasn't been tested with new reclaim and writeback changes in years. Let's sunset it. Keep the sysctl with a deprecation warning around for a few more cycles, but remove all functionality behind it. 
[akpm@linux-foundation.org: fix Documentation/admin-guide/laptops/index.rst] Link: https://lkml.kernel.org/r/20251216185201.GH905277@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Deepanshu Kartikey Signed-off-by: Andrew Morton --- Documentation/admin-guide/laptops/index.rst | 1 - Documentation/admin-guide/laptops/laptop-mode.rst | 770 ---------------------- Documentation/admin-guide/sysctl/vm.rst | 8 - block/blk-mq.c | 3 - fs/ext4/inode.c | 3 +- fs/sync.c | 2 - fs/xfs/xfs_super.c | 9 - include/linux/backing-dev-defs.h | 3 - include/linux/writeback.h | 4 - include/trace/events/writeback.h | 1 - include/uapi/linux/sysctl.h | 2 +- mm/backing-dev.c | 3 - mm/page-writeback.c | 74 +-- mm/vmscan.c | 30 +- 14 files changed, 25 insertions(+), 888 deletions(-) delete mode 100644 Documentation/admin-guide/laptops/laptop-mode.rst diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst index 6432c251dc95..c0b911d05c59 100644 --- a/Documentation/admin-guide/laptops/index.rst +++ b/Documentation/admin-guide/laptops/index.rst @@ -10,7 +10,6 @@ Laptop Drivers alienware-wmi asus-laptop disk-shock-protection - laptop-mode lg-laptop samsung-galaxybook sony-laptop diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst deleted file mode 100644 index 66eb9cd918b5..000000000000 --- a/Documentation/admin-guide/laptops/laptop-mode.rst +++ /dev/null @@ -1,770 +0,0 @@ -=============================================== -How to conserve battery power using laptop-mode -=============================================== - -Document Author: Bart Samwel (bart@samwel.tk) - -Date created: January 2, 2004 - -Last modified: December 06, 2004 - -Introduction ------------- - -Laptop mode is used to minimize the time that the hard disk needs to be spun up, -to conserve battery power on laptops. It has been reported to cause significant -power savings. - -.. Contents - - * Introduction - * Installation - * Caveats - * The Details - * Tips & Tricks - * Control script - * ACPI integration - * Monitoring tool - - -Installation ------------- - -To use laptop mode, you don't need to set any kernel configuration options -or anything. Simply install all the files included in this document, and -laptop mode will automatically be started when you're on battery. For -your convenience, a tarball containing an installer can be downloaded at: - - http://www.samwel.tk/laptop_mode/laptop_mode/ - -To configure laptop mode, you need to edit the configuration file, which is -located in /etc/default/laptop-mode on Debian-based systems, or in -/etc/sysconfig/laptop-mode on other systems. - -Unfortunately, automatic enabling of laptop mode does not work for -laptops that don't have ACPI. On those laptops, you need to start laptop -mode manually. To start laptop mode, run "laptop_mode start", and to -stop it, run "laptop_mode stop". (Note: The laptop mode tools package now -has experimental support for APM, you might want to try that first.) - - -Caveats -------- - -* The downside of laptop mode is that you have a chance of losing up to 10 - minutes of work. If you cannot afford this, don't use it! The supplied ACPI - scripts automatically turn off laptop mode when the battery almost runs out, - so that you won't lose any data at the end of your battery life. 
- -* Most desktop hard drives have a very limited lifetime measured in spindown - cycles, typically about 50.000 times (it's usually listed on the spec sheet). - Check your drive's rating, and don't wear down your drive's lifetime if you - don't need to. - -* If you mount some of your ext3 filesystems with the -n option, then - the control script will not be able to remount them correctly. You must set - DO_REMOUNTS=0 in the control script, otherwise it will remount them with the - wrong options -- or it will fail because it cannot write to /etc/mtab. - -* If you have your filesystems listed as type "auto" in fstab, like I did, then - the control script will not recognize them as filesystems that need remounting. - You must list the filesystems with their true type instead. - -* It has been reported that some versions of the mutt mail client use file access - times to determine whether a folder contains new mail. If you use mutt and - experience this, you must disable the noatime remounting by setting the option - DO_REMOUNT_NOATIME to 0 in the configuration file. - - -The Details ------------ - -Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is -present for all kernels that have the laptop mode patch, regardless of any -configuration options. When the knob is set, any physical disk I/O (that might -have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The -result of this is that after a disk has spun down, it will not be spun up -anymore to write dirty blocks, because those blocks had already been written -immediately after the most recent read operation. The value of the laptop_mode -knob determines the time between the occurrence of disk I/O and when the flush -is triggered. A sensible value for the knob is 5 seconds. Setting the knob to -0 disables laptop mode. - -To increase the effectiveness of the laptop_mode strategy, the laptop_mode -control script increases dirty_expire_centisecs and dirty_writeback_centisecs in -/proc/sys/vm to about 10 minutes (by default), which means that pages that are -dirtied are not forced to be written to disk as often. The control script also -changes the dirty background ratio, so that background writeback of dirty pages -is not done anymore. Combined with a higher commit value (also 10 minutes) for -ext3 filesystem (also done automatically by the control script), -this results in concentration of disk activity in a small time interval which -occurs only once every 10 minutes, or whenever the disk is forced to spin up by -a cache miss. The disk can then be spun down in the periods of inactivity. - - -Configuration -------------- - -The laptop mode configuration file is located in /etc/default/laptop-mode on -Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It -contains the following options: - -MAX_AGE: - -Maximum time, in seconds, of hard drive spindown time that you are -comfortable with. Worst case, it's possible that you could lose this -amount of work if your battery fails while you're in laptop mode. - -MINIMUM_BATTERY_MINUTES: - -Automatically disable laptop mode if the remaining number of minutes of -battery power is less than this value. Default is 10 minutes. - -AC_HD/BATT_HD: - -The idle timeout that should be set on your hard drive when laptop mode -is active (BATT_HD) and when it is not active (AC_HD). The defaults are -20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. 
The -possible values are those listed in the manual page for "hdparm" for the -"-S" option. - -HD: - -The devices for which the spindown timeout should be adjusted by laptop mode. -Default is /dev/hda. If you specify multiple devices, separate them by a space. - -READAHEAD: - -Disk readahead, in 512-byte sectors, while laptop mode is active. A large -readahead can prevent disk accesses for things like executable pages (which are -loaded on demand while the application executes) and sequentially accessed data -(MP3s). - -DO_REMOUNTS: - -The control script automatically remounts any mounted journaled filesystems -with appropriate commit interval options. When this option is set to 0, this -feature is disabled. - -DO_REMOUNT_NOATIME: - -When remounting, should the filesystems be remounted with the noatime option? -Normally, this is set to "1" (enabled), but there may be programs that require -access time recording. - -DIRTY_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -before a writeback is forced, while laptop mode is active. Corresponds to -the /proc/sys/vm/dirty_ratio sysctl. - -DIRTY_BACKGROUND_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set -this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio -sysctl. - -Note that the behaviour of dirty_background_ratio is quite different -when laptop mode is active and when it isn't. When laptop mode is inactive, -dirty_background_ratio is the threshold percentage at which background writeouts -start taking place. When laptop mode is active, however, background writeouts -are disabled, and the dirty_background_ratio only determines how much writeback -is done when dirty_ratio is reached. - -DO_CPU: - -Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup. -See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.) - -CPU_MAXFREQ: - -When on battery, what is the maximum CPU speed that the system should use? Legal -values are "slowest" for the slowest speed that your CPU is able to operate at, -or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies. - - -Tips & Tricks -------------- - -* Bartek Kania reports getting up to 50 minutes of extra battery life (on top - of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1). - -* You can spin down the disk while playing MP3, by setting disk readahead - to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at - once, and will then spin down while the MP3 is playing. (Thanks to Bartek - Kania.) - -* Drew Scott Daniels observed: "I don't know why, but when I decrease the number - of colours that my display uses it consumes less battery power. I've seen - this on powerbooks too. I hope that this is a piece of information that - might be useful to the Laptop Mode patch or its users." - -* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the - file after every logging. When you're using laptop-mode and your disk doesn't - spin down, this is a likely culprit. - -* Richard Atterer observed that laptop mode does not work well with noflushd - (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode - from doing its thing. - -* If you're worried about your data, you might want to consider using a USB - memory stick or something like that as a "working area". 
(Be aware though - that flash memory can only handle a limited number of writes, and overuse - may wear out your memory stick pretty quickly. Do _not_ use journalling - filesystems on flash memory sticks.) - - -Configuration file for control and ACPI battery scripts -------------------------------------------------------- - -This allows the tunables to be changed for the scripts via an external -configuration file - -It should be installed as /etc/default/laptop-mode on Debian, and as -/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes. - -Config file:: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - #MAX_AGE=600 - - # Automatically disable laptop mode when the number of minutes of battery - # that you have left goes below this threshold. - MINIMUM_BATTERY_MINUTES=10 - - # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG - # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk - # will read a complete MP3 at once, and will then spin down while the MP3/OGG is - # playing. - #READAHEAD=4096 - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - #DO_REMOUNTS=1 - - # And shall we add the "noatime" option to that as well? (1=yes) - #DO_REMOUNT_NOATIME=1 - - # Dirty synchronous ratio. At this percentage of dirty pages the process - # which - # calls write() does its own writeback - #DIRTY_RATIO=40 - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - #DIRTY_BACKGROUND_RATIO=5 - - # kernel default dirty buffer age - #DEF_AGE=30 - #DEF_UPDATE=5 - #DEF_DIRTY_BACKGROUND_RATIO=10 - #DEF_DIRTY_RATIO=40 - #DEF_XFS_AGE_BUFFER=15 - #DEF_XFS_SYNC_INTERVAL=30 - #DEF_XFS_BUFD_INTERVAL=1 - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still - # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for - # external interfaces, and that is currently always set to 100. So you don't - # need to change this on 2.6. - #XFS_HZ=100 - - # Should the maximum CPU frequency be adjusted down while on battery? - # Requires CPUFreq to be setup. - # See Documentation/admin-guide/pm/cpufreq.rst for more info - #DO_CPU=0 - - # When on battery what is the maximum CPU speed that the system should - # use? Legal values are "slowest" for the slowest speed that your - # CPU is able to operate at, or a value listed in: - # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies - # Only applicable if DO_CPU=1. - #CPU_MAXFREQ=slowest - - # Idle timeout for your hard drive (man hdparm for valid values, -S option) - # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4). - #AC_HD=244 - #BATT_HD=4 - - # The drives for which to adjust the idle timeout. Separate them by a space, - # e.g. HD="/dev/hda /dev/hdb". - #HD="/dev/hda" - - # Set the spindown timeout on a hard drive? 
- #DO_HD=1 - - -Control script --------------- - -Please note that this control script works for the Linux 2.4 and 2.6 series (thanks -to Kiko Piris). - -Control script:: - - #!/bin/bash - - # start or stop laptop_mode, best run by a power management daemon when - # ac gets connected/disconnected from a laptop - # - # install as /sbin/laptop_mode - # - # Contributors to this script: Kiko Piris - # Bart Samwel - # Micha Feigin - # Andrew Morton - # Herve Eychenne - # Dax Kelson - # - # Original Linux 2.4 version by: Jens Axboe - - ############################################################################# - - # Source config - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - - # Don't raise an error if the config file is incomplete - # set defaults instead: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - MAX_AGE=${MAX_AGE:-'600'} - - # Read-ahead, in kilobytes - READAHEAD=${READAHEAD:-'4096'} - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - DO_REMOUNTS=${DO_REMOUNTS:-'1'} - - # And shall we add the "noatime" option to that as well? (1=yes) - DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'} - - # Shall we adjust the idle timeout on a hard drive? - DO_HD=${DO_HD:-'1'} - - # Adjust idle timeout on which hard drive? - HD="${HD:-'/dev/hda'}" - - # spindown time for HD (hdparm -S values) - AC_HD=${AC_HD:-'244'} - BATT_HD=${BATT_HD:-'4'} - - # Dirty synchronous ratio. At this percentage of dirty pages the process which - # calls write() does its own writeback - DIRTY_RATIO=${DIRTY_RATIO:-'40'} - - # cpu frequency scaling - # See Documentation/admin-guide/pm/cpufreq.rst for more info - DO_CPU=${CPU_MANAGE:-'0'} - CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'} - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'} - - # kernel default dirty buffer age - DEF_AGE=${DEF_AGE:-'30'} - DEF_UPDATE=${DEF_UPDATE:-'5'} - DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'} - DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'} - DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'} - DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'} - DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'} - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still needs - # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external - # interfaces, and that is currently always set to 100. So you don't need to - # change this on 2.6. - XFS_HZ=${XFS_HZ:-'100'} - - ############################################################################# - - KLEVEL="$(uname -r | - { - IFS='.' read a b c - echo $a.$b - } - )" - case "$KLEVEL" in - "2.4"|"2.6") - ;; - *) - echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2 - exit 1 - ;; - esac - - if [ ! 
-e /proc/sys/vm/laptop_mode ] ; then - echo "Kernel is not patched with laptop_mode patch." >&2 - exit 1 - fi - - if [ ! -w /proc/sys/vm/laptop_mode ] ; then - echo "You do not have enough privileges to enable laptop_mode." >&2 - exit 1 - fi - - # Remove an option (the first parameter) of the form option= from - # a mount options string (the rest of the parameters). - parse_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"'=[0-9]*,/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Remove an option (the first parameter) without any arguments from - # a mount option string (the rest of the parameters). - parse_nonumber_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"',/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Find out the state of a yes/no option (e.g. "atime"/"noatime") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, the option name the second, and the default - # value the third. The remainder is the mount options string. - # - # Example: - # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime - # - # If fstab contains, say, "rw" for this filesystem, then the result - # will be "defaults,atime". - parse_yesno_opts_wfstab () { - L_DEV="$1" - OPT="$2" - DEF_OPT="$3" - shift 3 - L_OPTS="$*" - PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)" - PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)" - # Watch for a default atime in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then - # option specified in fstab: extract the value and use it - if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then - echo "$PARSEDOPTS1,no$OPT" - else - # no$OPT not found -- so we must have $OPT. - echo "$PARSEDOPTS1,$OPT" - fi - else - # option not specified in fstab -- choose the default. - echo "$PARSEDOPTS1,$DEF_OPT" - fi - } - - # Find out the state of a numbered option (e.g. "commit=NNN") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, and the option name the second. The - # remainder is the mount options string in which the replacement - # must be done. - # - # Example: - # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7 - # - # If fstab contains, say, "commit=3,rw" for this filesystem, then the - # result will be "rw,commit=3". - parse_mount_opts_wfstab () { - L_DEV="$1" - OPT="$2" - shift 2 - L_OPTS="$*" - PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)" - # Watch for a default commit in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then - # option specified in fstab: extract the value, and use it - echo -n "$PARSEDOPTS1,$OPT=" - echo ",$FSTAB_OPTS," | sed \ - -e 's/.*,'"$OPT"'=//' \ - -e 's/,.*//' - else - # option not specified in fstab: set it to 0 - echo "$PARSEDOPTS1,$OPT=0" - fi - } - - deduce_fstype () { - MP="$1" - # My root filesystem unfortunately has - # type "unknown" in /etc/mtab. If we encounter - # "unknown", we try to get the type from fstab. 
- cat /etc/fstab | - grep -v '^#' | - while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do - if [ "$FSTAB_MP" = "$MP" ]; then - echo $FSTAB_FST - exit 0 - fi - done - } - - if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then - NOATIME_OPT=",noatime" - fi - - case "$1" in - start) - AGE=$((100*$MAX_AGE)) - XFS_AGE=$(($XFS_HZ*$MAX_AGE)) - echo -n "Starting laptop_mode" - - if [ -d /proc/sys/vm/pagebuf ] ; then - # (For 2.4 and early 2.6.) - # This only needs to be set, not reset -- it is only used when - # laptop mode is enabled. - echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # (A couple of early 2.6 laptop mode patches had these.) - # The same goes for these. - echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then - # (2.6.6) - # But not for these -- they are also used in normal - # operation. - echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # (2.6.7 upwards) - # And not for these either. These are in centisecs, - # not USER_HZ, so we have to use $AGE, not $XFS_AGE. - echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs - echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs - echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - - case "$KLEVEL" in - "2.4") - echo 1 > /proc/sys/vm/laptop_mode - echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo 5 > /proc/sys/vm/laptop_mode - echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ]; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - PARSEDOPTS="$(parse_mount_opts "$OPTS")" - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3") - PARSEDOPTS="$(parse_mount_opts commit "$OPTS")" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT - ;; - "xfs") - mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra $(($READAHEAD * 2)) $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - if [ $CPU_MAXFREQ = 'slowest' ]; then - CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq` - fi - echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - stop) - U_AGE=$((100*$DEF_UPDATE)) - B_AGE=$((100*$DEF_AGE)) - echo -n "Stopping laptop_mode" - echo 0 > /proc/sys/vm/laptop_mode - if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # These need to be restored, if there are no lm_*. - echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer - echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # These need to be restored as well. 
- echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs - echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs - echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - case "$KLEVEL" in - "2.4") - echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ] ; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - # Reset commit and atime options to defaults. - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3") - PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)" - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - "xfs") - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra 256 $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - *) - echo "Usage: $0 {start|stop}" 2>&1 - exit 1 - ;; - - esac - - exit 0 - - -ACPI integration ----------------- - -Dax Kelson submitted this so that the ACPI acpid daemon will -kick off the laptop_mode script and run hdparm. The part that -automatically disables laptop mode when the battery is low was -written by Jan Topinski. - -/etc/acpi/events/ac_adapter:: - - event=ac_adapter - action=/etc/acpi/actions/ac.sh %e - -/etc/acpi/events/battery:: - - event=battery.* - action=/etc/acpi/actions/battery.sh %e - -/etc/acpi/actions/ac.sh:: - - #!/bin/bash - - # ac on/offline event handler - - status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state` - - case $status in - "on-line") - /sbin/laptop_mode stop - exit 0 - ;; - "off-line") - /sbin/laptop_mode start - exit 0 - ;; - esac - - -/etc/acpi/actions/battery.sh:: - - #! /bin/bash - - # Automatically disable laptop mode when the battery almost runs out. - - BATT_INFO=/proc/acpi/battery/$2/state - - if [[ -f /proc/sys/vm/laptop_mode ]] - then - LM=`cat /proc/sys/vm/laptop_mode` - if [[ $LM -gt 0 ]] - then - if [[ -f $BATT_INFO ]] - then - # Source the config file only now that we know we need - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'} - - ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`" - if [[ ACTION -eq "discharging" ]] - then - PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - fi - if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES)) - then - /sbin/laptop_mode stop - fi - else - logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. 
This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path." - fi - fi - fi - - -Monitoring tool ---------------- - -Bartek Kania submitted this, it can be used to measure how much time your disk -spends spun up/down. See tools/laptop/dslm/dslm.c diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 245bf6394935..ca6ebeb5171c 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -41,7 +41,6 @@ Currently, these files are in /proc/sys/vm: - extfrag_threshold - highmem_is_dirtyable - hugetlb_shm_group -- laptop_mode - legacy_va_layout - lowmem_reserve_ratio - max_map_count @@ -363,13 +362,6 @@ hugetlb_shm_group contains group id that is allowed to create SysV shared memory segment using hugetlb page. -laptop_mode -=========== - -laptop_mode is a knob that controls "laptop mode". All the things that are -controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst. - - legacy_va_layout ================ diff --git a/block/blk-mq.c b/block/blk-mq.c index a29d8ac9d3e3..4bae7c4c664e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -811,9 +811,6 @@ void blk_mq_free_request(struct request *rq) blk_mq_finish_request(rq); - if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->disk->bdi); - rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c466ccbed69..15eb463d5a9b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3305,8 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode) /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is - * not strictly speaking necessary (and for users of - * laptop_mode, not even desirable). However, to do otherwise + * not strictly speaking necessary. However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> diff --git a/fs/sync.c b/fs/sync.c index 431fc5f5be06..6330150792f6 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -104,8 +104,6 @@ void ksys_sync(void) iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); sync_bdevs(true); - if (unlikely(laptop_mode)) - laptop_sync_completion(); } SYSCALL_DEFINE0(sync) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bc71aa9dcee8..a2014fb1bc66 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -845,15 +845,6 @@ xfs_fs_sync_fs( if (error) return error; - if (laptop_mode) { - /* - * The disk must be active because we're syncing. - * We schedule log work now (now that the disk is - * active) instead of later (when it might not be). - */ - flush_delayed_work(&mp->m_log->l_work); - } - /* * If we are called with page faults frozen out, it means we are about * to freeze the transaction subsystem. 
Take the opportunity to shut diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 0217c1073735..c88fd4d37d1f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -46,7 +46,6 @@ enum wb_reason { WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, WB_REASON_FS_FREE_SPACE, /* * There is no bdi forker thread any more and works are done @@ -204,8 +203,6 @@ struct backing_dev_info { char dev_name[64]; struct device *owner; - struct timer_list laptop_mode_wb_timer; - #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f48e8ccffe81..e530112c4b3a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -328,9 +328,6 @@ struct dirty_throttle_control { bool dirty_exceeded; }; -void laptop_io_completion(struct backing_dev_info *info); -void laptop_sync_completion(void); -void laptop_mode_timer_fn(struct timer_list *t); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK @@ -342,7 +339,6 @@ extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; -extern int laptop_mode; void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 311a341e6fe4..b6f94e97788a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -42,7 +42,6 @@ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ - EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 63d1464cb71c..6ea9ea8413fa 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -183,7 +183,7 @@ enum VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ - VM_LAPTOP_MODE=23, /* vm laptop mode */ + VM_BLOCK_DUMP=24, /* block dump mode */ VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c5740c6d37a2..a0e26d1b717f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1034,7 +1034,6 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; - timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -1156,8 +1155,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - timer_delete_sync(&bdi->laptop_mode_wb_timer); - /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ccdeb0e84d39..601a5e048d12 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -109,14 +109,6 @@ 
EXPORT_SYMBOL_GPL(dirty_writeback_interval); */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ -/* - * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: - * a full sync is triggered after this time elapses without any disk activity. - */ -int laptop_mode; - -EXPORT_SYMBOL(laptop_mode); - /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; @@ -1843,17 +1835,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, balance_domain_limits(mdtc, strictlimit); } - /* - * In laptop mode, we wait until hitting the higher threshold - * before starting background writeout, and then write out all - * the way down to the lower threshold. So slow writers cause - * minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. - */ - if (!laptop_mode && nr_dirty > gdtc->bg_thresh && - !writeback_in_progress(wb)) + if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* @@ -1876,10 +1858,6 @@ free_running: break; } - /* Start writeback even when in laptop mode */ - if (unlikely(!writeback_in_progress(wb))) - wb_start_background_writeback(wb); - mem_cgroup_flush_foreign(wb); /* @@ -2198,41 +2176,6 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int } #endif -void laptop_mode_timer_fn(struct timer_list *t) -{ - struct backing_dev_info *backing_dev_info = - timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); - - wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); -} - -/* - * We've spun up the disk and we're in laptop mode: schedule writeback - * of all dirty data a few seconds from now. If the flush is already scheduled - * then push it back - the user is still using the disk. - */ -void laptop_io_completion(struct backing_dev_info *info) -{ - mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); -} - -/* - * We're in laptop mode and we've just synced. The sync's writes will have - * caused another writeback to be scheduled by laptop_io_completion. - * Nothing needs to be written back anymore, so we unschedule the writeback. - */ -void laptop_sync_completion(void) -{ - struct backing_dev_info *bdi; - - rcu_read_lock(); - - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - timer_delete(&bdi->laptop_mode_wb_timer); - - rcu_read_unlock(); -} - /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. @@ -2263,6 +2206,19 @@ static int page_writeback_cpu_online(unsigned int cpu) #ifdef CONFIG_SYSCTL +static int laptop_mode; +static int laptop_mode_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos); + + if (!ret && write) + pr_warn("%s: vm.laptop_mode is deprecated. 
Ignoring setting.\n", + current->comm); + + return ret; +} + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -2332,7 +2288,7 @@ static const struct ctl_table vm_page_writeback_sysctls[] = { .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = laptop_mode_handler, }, }; #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c87945fa761..fc5691afb998 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -104,13 +104,13 @@ struct scan_control { unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; - /* Writepage batching in laptop mode; RECLAIM_WRITE */ + /* zone_reclaim_mode, boost reclaim */ unsigned int may_writepage:1; - /* Can mapped folios be reclaimed? */ + /* zone_reclaim_mode */ unsigned int may_unmap:1; - /* Can folios be swapped as part of reclaim? */ + /* zome_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ @@ -6365,13 +6365,6 @@ retry: if (sc->compaction_ready) break; - - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc->priority < DEF_PRIORITY - 2) - sc->may_writepage = 1; } while (--sc->priority >= 0); last_pgdat = NULL; @@ -6580,7 +6573,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = 1, }; @@ -6624,7 +6617,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, @@ -6670,7 +6663,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), @@ -7051,7 +7044,7 @@ restart: * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ - sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_writepage = !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* @@ -7061,13 +7054,6 @@ restart: */ kswapd_age_node(pgdat, &sc); - /* - * If we're getting trouble reclaiming, start doing writepage - * even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; @@ -7799,7 +7785,7 @@ int user_proactive_reclaim(char *buf, .reclaim_idx = gfp_zone(gfp_mask), .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), .may_unmap = 1, .may_swap = 1, -- cgit v1.2.3 From bd4526e64bcff4cbeaefbbd91c40d3e38b9920a9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 3 Dec 2025 22:45:11 +0000 Subject: maple_tree: remove struct maple_alloc struct maple_alloc is deprecated after the maple tree conversion to sheaves, remove the references from the header file. 
Link: https://lkml.kernel.org/r/20251203224511.469978-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Jinjie Ruan Reviewed-by: Liam R. Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 66f98a3da8d8..1323c28a7470 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -129,13 +129,6 @@ struct maple_arange_64 { struct maple_metadata meta; }; -struct maple_alloc { - unsigned long total; - unsigned char node_count; - unsigned int request_count; - struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; -}; - struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ @@ -306,7 +299,6 @@ struct maple_node { }; struct maple_range_64 mr64; struct maple_arange_64 ma64; - struct maple_alloc alloc; }; }; -- cgit v1.2.3 From a98ec863fdedf4940447f32ceda7d937bebd06a2 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Mon, 1 Dec 2025 13:18:48 -0500 Subject: lib/test_vmalloc.c: minor fixes to test_vmalloc.c If PAGE_SIZE is larger than 4k and if you have a system with a large number of CPUs, this test can require a very large amount of memory leading to oom-killer firing. Given the type of allocation, the kernel won't have anything to kill, causing the system to stall. Add a parameter to the test_vmalloc driver to represent the number of times a percpu object will be allocated. Calculate this in test_vmalloc.sh to be 90% of available memory or the current default of 35000, whichever is smaller. Link: https://lkml.kernel.org/r/20251201181848.1216197-1-audra@redhat.com Signed-off-by: Audra Mitchell Reviewed-by: Andrew Morton Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Rafael Aquini Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 11 +++++++---- tools/testing/selftests/mm/test_vmalloc.sh | 31 +++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 6521c05c7816..270b6f7ca807 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -58,6 +58,9 @@ __param(int, run_test_mask, 7, /* Add a new test case description here. */ ); +__param(int, nr_pcpu_objects, 35000, + "Number of pcpu objects to allocate for pcpu_alloc_test"); + /* * This is for synchronization of setup phase. 
*/ @@ -317,24 +320,24 @@ pcpu_alloc_test(void) size_t size, align; int i; - pcpu = vmalloc(sizeof(void __percpu *) * 35000); + pcpu = vmalloc(sizeof(void __percpu *) * nr_pcpu_objects); if (!pcpu) return -1; - for (i = 0; i < 35000; i++) { + for (i = 0; i < nr_pcpu_objects; i++) { size = get_random_u32_inclusive(1, PAGE_SIZE / 4); /* * Maximum PAGE_SIZE */ - align = 1 << get_random_u32_inclusive(1, 11); + align = 1 << get_random_u32_inclusive(1, PAGE_SHIFT - 1); pcpu[i] = __alloc_percpu(size, align); if (!pcpu[i]) rv = -1; } - for (i = 0; i < 35000; i++) + for (i = 0; i < nr_pcpu_objects; i++) free_percpu(pcpu[i]); vfree(pcpu); diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh index d39096723fca..b23d705bf570 100755 --- a/tools/testing/selftests/mm/test_vmalloc.sh +++ b/tools/testing/selftests/mm/test_vmalloc.sh @@ -13,6 +13,9 @@ TEST_NAME="vmalloc" DRIVER="test_${TEST_NAME}" NUM_CPUS=`grep -c ^processor /proc/cpuinfo` +# Default number of times we allocate percpu objects: +NR_PCPU_OBJECTS=35000 + # 1 if fails exitcode=1 @@ -27,6 +30,8 @@ PERF_PARAM="sequential_test_order=1 test_repeat_count=3" SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" +PCPU_OBJ_PARAM="nr_pcpu_objects=$NR_PCPU_OBJECTS" + check_test_requirements() { uid=$(id -u) @@ -47,12 +52,30 @@ check_test_requirements() fi } +check_memory_requirement() +{ + # The pcpu_alloc_test allocates nr_pcpu_objects per cpu. If the + # PAGE_SIZE is on the larger side it is easier to set a value + # that can cause oom events during testing. Since we are + # testing the functionality of vmalloc and not the oom-killer, + # calculate what is 90% of available memory and divide it by + # the number of online CPUs. + pages=$(($(getconf _AVPHYS_PAGES) * 90 / 100 / $NUM_CPUS)) + + if (($pages < $NR_PCPU_OBJECTS)); then + echo "Updated nr_pcpu_objects to 90% of available memory." + echo "nr_pcpu_objects is now set to: $pages." + PCPU_OBJ_PARAM="nr_pcpu_objects=$pages" + fi +} + run_performance_check() { echo "Run performance tests to evaluate how fast vmalloc allocation is." echo "It runs all test cases on one single CPU with sequential order." - modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $PERF_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel message buffer to see the summary." } @@ -63,7 +86,8 @@ run_stability_check() echo "available test cases are run by NUM_CPUS workers simultaneously." echo "It will take time, so be patient." - modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $STRESS_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel ring buffer to see the summary." } @@ -74,7 +98,8 @@ run_smoke_check() echo "Please check $0 output how it can be used" echo "for deep performance analysis as well as stress testing." - modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $SMOKE_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel ring buffer to see the summary." } -- cgit v1.2.3 From 84355caa271a0eab2d1b55ff73aa8aa3e4627661 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 17 Dec 2025 12:02:13 +0100 Subject: mm/mm_init: replace simple_strtoul with kstrtobool in set_hashdist Use bool for 'hashdist' and replace simple_strtoul() with kstrtobool() for parsing the 'hashdist=' boot parameter. 
Unlike simple_strtoul(), which returns an unsigned long, kstrtobool() converts the string directly to bool and avoids implicit casting. Check the return value of kstrtobool() and reject invalid values. This adds error handling while preserving behavior for existing values, and removes use of the deprecated simple_strtoul() helper. The current code silently sets 'hashdist = 0' if parsing fails, instead of leaving the default value (HASHDIST_DEFAULT) unchanged. Additionally, kstrtobool() accepts common boolean strings such as "on" and "off". Link: https://lkml.kernel.org/r/20251217110214.50807-1-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- include/linux/memblock.h | 4 ++-- mm/mm_init.c | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 221118b5a16e..6ec5e9ac0699 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -598,9 +598,9 @@ extern void *alloc_large_system_hash(const char *tablename, */ #ifdef CONFIG_NUMA #define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) -extern int hashdist; /* Distribute hashes across NUMA nodes? */ +extern bool hashdist; /* Distribute hashes across NUMA nodes? */ #else -#define hashdist (0) +#define hashdist (false) #endif #ifdef CONFIG_MEMTEST diff --git a/mm/mm_init.c b/mm/mm_init.c index fc2a6f1e518f..d86248566a56 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -646,21 +646,18 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -int hashdist = HASHDIST_DEFAULT; +bool hashdist = HASHDIST_DEFAULT; static int __init set_hashdist(char *str) { - if (!str) - return 0; - hashdist = simple_strtoul(str, &str, 0); - return 1; + return kstrtobool(str, &hashdist) == 0; } __setup("hashdist=", set_hashdist); static inline void fixup_hashdist(void) { if (num_node_state(N_MEMORY) == 1) - hashdist = 0; + hashdist = false; } #else static inline void fixup_hashdist(void) {} -- cgit v1.2.3 From a9853ac1c3bcf79cef46046529a3f7912ff5ecee Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 26 Nov 2025 15:36:02 +0100 Subject: zram: remove KMSG_COMPONENT macro The KMSG_COMPONENT macro is a leftover of the s390 specific "kernel message catalog" from 2008 [1] which never made it upstream. The macro was added to s390 code to allow for an out-of-tree patch which used this to generate unique message ids. Also this out-of-tree doesn't exist anymore. The pattern of how the KMSG_COMPONENT is used was partially also used for non s390 specific code, for whatever reasons. Remove the macro in order to get rid of a pointless indirection. 
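
For illustration only (this snippet is not part of the commit): the removed indirection and its replacement expand to the same message prefix, which is why dropping KMSG_COMPONENT changes nothing at runtime. A minimal sketch in kernel C, reusing the "zram" prefix from the diff below:

	/* Old two-step form: KMSG_COMPONENT exists only to feed pr_fmt(). */
	#if 0
	#define KMSG_COMPONENT "zram"
	#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
	#endif

	/* New direct form: identical preprocessor expansion, one define fewer.
	 * As in the patch, it must be defined before the printk headers are
	 * pulled in so every pr_*() call in the file picks it up. */
	#define pr_fmt(fmt) "zram: " fmt

	#include <linux/printk.h>

	/* pr_info("init done\n") now prints "zram: init done". */
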
Link: https://lkml.kernel.org/r/20251126143602.2207435-1-hca@linux.ibm.com Link: https://lwn.net/Articles/292650/ [1] Signed-off-by: Heiko Carstens Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ed717b65f0a9..1d6760b3b557 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -12,8 +12,7 @@ * */ -#define KMSG_COMPONENT "zram" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "zram: " fmt #include #include -- cgit v1.2.3 From 5ec9bb6de4933b8a9bca09ce56039277d63dd5a8 Mon Sep 17 00:00:00 2001 From: Kevin Lourenco Date: Wed, 17 Dec 2025 19:12:16 +0100 Subject: mm/damon: fix typos in comments Correct minor spelling mistakes in several files under mm/damon. No functional changes. Link: https://lkml.kernel.org/r/20251217181216.47576-1-klourencodev@gmail.com Signed-off-by: Kevin Lourenco Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 +++--- mm/damon/lru_sort.c | 2 +- mm/damon/reclaim.c | 2 +- mm/damon/stat.c | 2 +- mm/damon/tests/core-kunit.h | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index f4d83e12ba0e..7f0028e23f92 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -284,7 +284,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, } /** - * damos_filter_for_ops() - Return if the filter is ops-hndled one. + * damos_filter_for_ops() - Return if the filter is ops-handled one. * @type: type of the filter. * * Return: true if the filter of @type needs to be handled by ops layer, false @@ -1615,7 +1615,7 @@ static unsigned long damon_get_intervals_adaptation_bp(struct damon_ctx *c) adaptation_bp = damon_feed_loop_next_input(100000000, score_bp) / 10000; /* - * adaptaion_bp ranges from 1 to 20,000. Avoid too rapid reduction of + * adaptation_bp ranges from 1 to 20,000. Avoid too rapid reduction of * the intervals by rescaling [1,10,000] to [5000, 10,000]. */ if (adaptation_bp <= 10000) @@ -2789,7 +2789,7 @@ static int kdamond_fn(void *data) * * Reset ->next_aggregation_sis to avoid that. * It will anyway correctly updated after this - * if caluse. + * if clause. */ ctx->next_aggregation_sis = next_aggregation_sis; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 49b4bc294f4e..9388b091deb7 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -34,7 +34,7 @@ static bool enabled __read_mostly; * * Input parameters that updated while DAMON_LRU_SORT is running are not * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT - * reads values of parametrs except ``enabled`` again. Once the re-reading is + * reads values of parameters except ``enabled`` again. Once the re-reading is * done, this parameter is set as ``N``. If invalid parameters are found while * the re-reading, DAMON_LRU_SORT will be disabled. */ diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 36a582e09eae..8463a5a5032f 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -34,7 +34,7 @@ static bool enabled __read_mostly; * * Input parameters that updated while DAMON_RECLAIM is running are not applied * by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values - * of parametrs except ``enabled`` again. Once the re-reading is done, this + * of parameters except ``enabled`` again. 
Once the re-reading is done, this * parameter is set as ``N``. If invalid parameters are found while the * re-reading, DAMON_RECLAIM will be disabled. */ diff --git a/mm/damon/stat.c b/mm/damon/stat.c index ef0a1195a584..5e18b164f6d8 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Shows data access monitoring resutls in simple metrics. + * Shows data access monitoring results in simple metrics. */ #define pr_fmt(fmt) "damon-stat: " fmt diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 8cb369b63e08..f59ae7ee19a0 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -1159,7 +1159,7 @@ static void damon_test_set_filters_default_reject(struct kunit *test) damos_set_filters_default_reject(&scheme); /* * A core-handled allow-filter is installed. - * Rejct by default on core layer filtering stage due to the last + * Reject by default on core layer filtering stage due to the last * core-layer-filter's behavior. * Allow by default on ops layer filtering stage due to the absence of * ops layer filters. -- cgit v1.2.3 From 62451ae347b0015bf3d644c97cbc14e75a8287e6 Mon Sep 17 00:00:00 2001 From: Kevin Lourenco Date: Thu, 18 Dec 2025 16:09:06 +0100 Subject: mm: fix minor spelling mistakes in comments Correct several typos in comments across files in mm/ [akpm@linux-foundation.org: also fix comment grammar, per SeongJae] Link: https://lkml.kernel.org/r/20251218150906.25042-1-klourencodev@gmail.com Signed-off-by: Kevin Lourenco Reviewed-by: SeongJae Park Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- mm/madvise.c | 2 +- mm/memblock.c | 4 ++-- mm/memcontrol.c | 2 +- mm/memory-failure.c | 2 +- mm/memory-tiers.c | 2 +- mm/memory.c | 4 ++-- mm/memory_hotplug.c | 4 ++-- mm/migrate_device.c | 4 ++-- mm/mm_init.c | 6 +++--- mm/mremap.c | 6 +++--- mm/mseal.c | 4 ++-- mm/numa_memblks.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_io.c | 4 ++-- mm/page_isolation.c | 2 +- mm/page_reporting.c | 2 +- mm/swap.c | 2 +- mm/swap.h | 2 +- mm/swap_state.c | 2 +- mm/swapfile.c | 2 +- mm/userfaultfd.c | 4 ++-- mm/vma.c | 8 ++++---- mm/vma.h | 8 ++++---- mm/vmscan.c | 2 +- mm/vmstat.c | 2 +- mm/zsmalloc.c | 2 +- 27 files changed, 45 insertions(+), 45 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f35dbcf99a86..9ee336aa0365 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -171,7 +171,7 @@ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) /* * OK, we tried to call the file hook for mmap(), but an error - * arose. The mapping is in an inconsistent state and we most not invoke + * arose. The mapping is in an inconsistent state and we must not invoke * any further hooks on it. */ vma->vm_ops = &vma_dummy_vm_ops; diff --git a/mm/madvise.c b/mm/madvise.c index 6bf7009fa5ce..863d55b8a658 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1867,7 +1867,7 @@ static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) * madvise_should_skip() - Return if the request is invalid or nothing. * @start: Start address of madvise-requested address range. * @len_in: Length of madvise-requested address range. - * @behavior: Requested madvise behavor. + * @behavior: Requested madvise behavior. * @err: Pointer to store an error code from the check. 
* * If the specified behaviour is invalid or nothing would occur, we skip the diff --git a/mm/memblock.c b/mm/memblock.c index 905d06b16348..e76255e4ff36 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -773,7 +773,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt unsigned long start_pfn, end_pfn, mem_size_mb; int nid, i; - /* calculate lose page */ + /* calculate lost page */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { if (!numa_valid_node(nid)) nr_pages += end_pfn - start_pfn; @@ -2414,7 +2414,7 @@ EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); /** * reserve_mem_release_by_name - Release reserved memory region with a given name - * @name: The name that is attatched to a reserved memory region + * @name: The name that is attached to a reserved memory region * * Forcibly release the pages in the reserved memory region so that those memory * can be used as free memory. After released the reserved region size becomes 0. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a01d3e6c157d..75fc22a33b28 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4976,7 +4976,7 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) memcg = folio_memcg(old); /* * Note that it is normal to see !memcg for a hugetlb folio. - * For e.g, itt could have been allocated when memory_hugetlb_accounting + * For e.g, it could have been allocated when memory_hugetlb_accounting * was not selected. */ VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c80c2907da33..05a553b069ff 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -864,7 +864,7 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, * * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoisoned'ed. * The page has been completely isolated, that is, unmapped, taken out of - * the buddy system, or hole-punnched out of the file mapping. + * the buddy system, or hole-punched out of the file mapping. */ static const char *action_name[] = { [MF_IGNORED] = "Ignored", diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 864811fff409..20aab9c19c5e 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -648,7 +648,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) if (node_memory_types[node].memtype == memtype || !memtype) node_memory_types[node].map_count--; /* - * If we umapped all the attached devices to this node, + * If we unmapped all the attached devices to this node, * clear the node memory type. */ if (!node_memory_types[node].map_count) { diff --git a/mm/memory.c b/mm/memory.c index ce933ee4a3dd..87cf4e1a6f86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5935,7 +5935,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, else *last_cpupid = folio_last_cpupid(folio); - /* Record the current PID acceesing VMA */ + /* Record the current PID accessing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); @@ -6254,7 +6254,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) * Use the maywrite version to indicate that vmf->pte may be * modified, but since we will use pte_same() to detect the * change of the !pte_none() entry, there is no need to recheck - * the pmdval. Here we chooes to pass a dummy variable instead + * the pmdval. Here we choose to pass a dummy variable instead * of NULL, which helps new user think about why this place is * special. 
*/ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a63ec679d861..389989a28abe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -926,7 +926,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * * MOVABLE : KERNEL_EARLY * - * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze + * Whereby KERNEL_EARLY is memory in one of the kernel zones, available since * boot. We base our calculation on KERNEL_EARLY internally, because: * * a) Hotplugged memory in one of the kernel zones can sometimes still get @@ -1258,7 +1258,7 @@ static pg_data_t *hotadd_init_pgdat(int nid) * NODE_DATA is preallocated (free_area_init) but its internal * state is not allocated completely. Add missing pieces. * Completely offline nodes stay around and they just need - * reintialization. + * reinitialization. */ pgdat = NODE_DATA(nid); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 0346c2d7819f..0a8b31939640 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -1419,10 +1419,10 @@ EXPORT_SYMBOL(migrate_device_range); /** * migrate_device_pfns() - migrate device private pfns to normal memory. - * @src_pfns: pre-popluated array of source device private pfns to migrate. + * @src_pfns: pre-populated array of source device private pfns to migrate. * @npages: number of pages to migrate. * - * Similar to migrate_device_range() but supports non-contiguous pre-popluated + * Similar to migrate_device_range() but supports non-contiguous pre-populated * array of device pages to migrate. */ int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) diff --git a/mm/mm_init.c b/mm/mm_init.c index d86248566a56..0927bedb1254 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -187,7 +187,7 @@ void mm_compute_batch(int overcommit_policy) /* * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of * (total memory/#cpus), and lift it to 25% for other policies - * to easy the possible lock contention for percpu_counter + * to ease the possible lock contention for percpu_counter * vm_committed_as, while the max limit is INT_MAX */ if (overcommit_policy == OVERCOMMIT_NEVER) @@ -1745,7 +1745,7 @@ static void __init free_area_init_node(int nid) lru_gen_init_pgdat(pgdat); } -/* Any regular or high memory on that node ? */ +/* Any regular or high memory on that node? */ static void __init check_for_memory(pg_data_t *pgdat) { enum zone_type zone_type; @@ -2045,7 +2045,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, * Initialize and free pages. * * At this point reserved pages and struct pages that correspond to holes in - * memblock.memory are already intialized so every free range has a valid + * memblock.memory are already initialized so every free range has a valid * memory map around it. * This ensures that access of pages that are ahead of the range being * initialized (computing buddy page in __free_one_page()) always reads a valid diff --git a/mm/mremap.c b/mm/mremap.c index 8275b9772ec1..8391ae17de64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -678,7 +678,7 @@ static bool can_realign_addr(struct pagetable_move_control *pmc, /* * We don't want to have to go hunting for VMAs from the end of the old * VMA to the next page table boundary, also we want to make sure the - * operation is wortwhile. + * operation is worthwhile. * * So ensure that we only perform this realignment if the end of the * range being copied reaches or crosses the page table boundary. 
@@ -926,7 +926,7 @@ static bool vrm_overlaps(struct vma_remap_struct *vrm) /* * Will a new address definitely be assigned? This either if the user specifies * it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used, indicating we will - * always detemrine a target address. + * always determine a target address. */ static bool vrm_implies_new_addr(struct vma_remap_struct *vrm) { @@ -1806,7 +1806,7 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) /* * move_vma() need us to stay 4 maps below the threshold, otherwise * it will bail out at the very beginning. - * That is a problem if we have already unmaped the regions here + * That is a problem if we have already unmapped the regions here * (new_addr, and old_addr), because userspace will not know the * state of the vma's after it gets -ENOMEM. * So, to avoid such scenario we can pre-compute if the whole diff --git a/mm/mseal.c b/mm/mseal.c index ae442683c5c0..316b5e1dec78 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -21,7 +21,7 @@ * It disallows unmapped regions from start to end whether they exist at the * start, in the middle, or at the end of the range, or any combination thereof. * - * This is because after sealng a range, there's nothing to stop memory mapping + * This is because after sealing a range, there's nothing to stop memory mapping * of ranges in the remaining gaps later, meaning that the user might then * wrongly consider the entirety of the mseal()'d range to be sealed when it * in fact isn't. @@ -124,7 +124,7 @@ static int mseal_apply(struct mm_struct *mm, * -EINVAL: * invalid input flags. * start address is not page aligned. - * Address arange (start + len) overflow. + * Address range (start + len) overflow. * -ENOMEM: * addr is not a valid address (not allocated). * end (start + len) is not a valid address. diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index 8f5735fda0a2..391f53e63ea3 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -467,7 +467,7 @@ int __init numa_memblks_init(int (*init_func)(void), * We reset memblock back to the top-down direction * here because if we configured ACPI_NUMA, we have * parsed SRAT in init_func(). It is ok to have the - * reset here even if we did't configure ACPI_NUMA + * reset here even if we didn't configure ACPI_NUMA * or acpi numa init fails and fallbacks to dummy * numa init. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbf758e27aa2..e1cc0c9ed947 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1853,7 +1853,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* * As memory initialization might be integrated into KASAN, - * KASAN unpoisoning and memory initializion code must be + * KASAN unpoisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. */ @@ -7653,7 +7653,7 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned * unsafe in NMI. If spin_trylock() is called from hard IRQ the current * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will * mark the task as the owner of another rt_spin_lock which will - * confuse PI logic, so return immediately if called form hard IRQ or + * confuse PI logic, so return immediately if called from hard IRQ or * NMI. * * Note, irqs_disabled() case is ok. 
This function can be called diff --git a/mm/page_io.c b/mm/page_io.c index 3c342db77ce3..a2c034660c80 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -450,14 +450,14 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* - * ->flags can be updated non-atomicially (scan_swap_map_slots), + * ->flags can be updated non-atomically (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(folio, swap_plug); /* - * ->flags can be updated non-atomicially (scan_swap_map_slots), + * ->flags can be updated non-atomically (scan_swap_map_slots), * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race * is safe. */ diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f72b6cd38b95..b5924eff4f8b 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -301,7 +301,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * pageblock. When not all pageblocks within a page are isolated at the same * time, free page accounting can go wrong. For example, in the case of * MAX_PAGE_ORDER = pageblock_order + 1, a MAX_PAGE_ORDER page has two - * pagelbocks. + * pageblocks. * [ MAX_PAGE_ORDER ] * [ pageblock0 | pageblock1 ] * When either pageblock is isolated, if it is a free page, the page is not diff --git a/mm/page_reporting.c b/mm/page_reporting.c index e4c428e61d8c..8a03effda749 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -123,7 +123,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev, continue; /* - * If page was not comingled with another page we can + * If page was not commingled with another page we can * consider the result to be "reported" since the page * hasn't been modified, otherwise we will need to * report on the new larger page when we make our way diff --git a/mm/swap.c b/mm/swap.c index 2260dcd2775e..bb19ccbece46 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -513,7 +513,7 @@ void folio_add_lru(struct folio *folio) EXPORT_SYMBOL(folio_add_lru); /** - * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. + * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * diff --git a/mm/swap.h b/mm/swap.h index d034c13d8dd2..3dcf198b05e3 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -236,7 +236,7 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, /* * All swap cache helpers below require the caller to ensure the swap entries - * used are valid and stablize the device by any of the following ways: + * used are valid and stabilize the device by any of the following ways: * - Hold a reference by get_swap_device(): this ensures a single entry is * valid and increases the swap device's refcount. * - Locking a folio in the swap cache: this ensures the folio's swap entries diff --git a/mm/swap_state.c b/mm/swap_state.c index 5f97c6ae70a2..c6f661436c9a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -82,7 +82,7 @@ void show_swap_cache_info(void) * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. * Return: Returns the found folio on success, NULL otherwise. The caller - * must lock nd check if the folio still matches the swap entry before + * must lock and check if the folio still matches the swap entry before * use (e.g., folio_matches_swap_entry). 
*/ struct folio *swap_cache_get_folio(swp_entry_t entry) diff --git a/mm/swapfile.c b/mm/swapfile.c index 46d2008e4b99..76273ad26739 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2018,7 +2018,7 @@ swp_entry_t get_swap_page_of_type(int type) if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { /* - * Grab the local lock to be complaint + * Grab the local lock to be compliant * with swap table allocation. */ local_lock(&percpu_swap_cluster.lock); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index b11f81095fa5..d270d5377630 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1274,7 +1274,7 @@ retry: * Use the maywrite version to indicate that dst_pte will be modified, * since dst_pte needs to be none, the subsequent pte_same() check * cannot prevent the dst_pte page from being freed concurrently, so we - * also need to abtain dst_pmdval and recheck pmd_same() later. + * also need to obtain dst_pmdval and recheck pmd_same() later. */ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, &dst_ptl); @@ -1330,7 +1330,7 @@ retry: goto out; } - /* If PTE changed after we locked the folio them start over */ + /* If PTE changed after we locked the folio then start over */ if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { ret = -EAGAIN; goto out; diff --git a/mm/vma.c b/mm/vma.c index 7a908a964d18..f81a5cfcd7cc 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2951,10 +2951,10 @@ retry: return -ENOMEM; /* - * Adjust for the gap first so it doesn't interfere with the - * later alignment. The first step is the minimum needed to - * fulill the start gap, the next steps is the minimum to align - * that. It is the minimum needed to fulill both. + * Adjust for the gap first so it doesn't interfere with the later + * alignment. The first step is the minimum needed to fulfill the start + * gap, the next step is the minimum to align that. It is the minimum + * needed to fulfill both. */ gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; diff --git a/mm/vma.h b/mm/vma.h index 9d5ee6ac913a..8526f22c9f5a 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -267,7 +267,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /** - * vma_modify_flags() - Peform any necessary split/merge in preparation for + * vma_modify_flags() - Perform any necessary split/merge in preparation for * setting VMA flags to *@vm_flags in the range @start to @end contained within * @vma. * @vmi: Valid VMA iterator positioned at @vma. @@ -295,7 +295,7 @@ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, vm_flags_t *vm_flags_ptr); /** - * vma_modify_name() - Peform any necessary split/merge in preparation for + * vma_modify_name() - Perform any necessary split/merge in preparation for * setting anonymous VMA name to @new_name in the range @start to @end contained * within @vma. * @vmi: Valid VMA iterator positioned at @vma. @@ -319,7 +319,7 @@ __must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, struct anon_vma_name *new_name); /** - * vma_modify_policy() - Peform any necessary split/merge in preparation for + * vma_modify_policy() - Perform any necessary split/merge in preparation for * setting NUMA policy to @new_pol in the range @start to @end contained * within @vma. * @vmi: Valid VMA iterator positioned at @vma. 
@@ -343,7 +343,7 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct mempolicy *new_pol); /** - * vma_modify_flags_uffd() - Peform any necessary split/merge in preparation for + * vma_modify_flags_uffd() - Perform any necessary split/merge in preparation for * setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range * @start to @end contained within @vma. * @vmi: Valid VMA iterator positioned at @vma. diff --git a/mm/vmscan.c b/mm/vmscan.c index fc5691afb998..619691aa4393 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1063,7 +1063,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) /* * We can "enter_fs" for swap-cache with only __GFP_IO * providing this isn't SWP_FS_OPS. - * ->flags can be updated non-atomicially (scan_swap_map_slots), + * ->flags can be updated non-atomically (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 65de88cdf40e..bd2af431ff86 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1626,7 +1626,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, } } -/* Print out the free pages at each order for each migatetype */ +/* Print out the free pages at each order for each migratetype */ static void pagetypeinfo_showfree(struct seq_file *m, void *arg) { int order; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5bf832f9c05c..84da164dcbc5 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -105,7 +105,7 @@ /* * On systems with 4K page size, this gives 255 size classes! There is a - * trader-off here: + * trade-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes * - Small number of size classes causes large internal fragmentation -- cgit v1.2.3 From ed60c8e280248c02cd87ce7982f8fcb402cce001 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 22 Dec 2025 07:23:59 +0000 Subject: mm/hugetlb_cgroup: fix -Wformat-truncation warning A false-positive compile warnings with -Wformat-trucation was introduced by commit 47179fe03588 ("mm/hugetlb_cgroup: prepare cftypes based on template") on arch s390. Suppress it by replacing snprintf() with scnprintf(). 
mm/hugetlb_cgroup.c: In function 'hugetlb_cgroup_file_init': mm/hugetlb_cgroup.c:829:44: warning: '%s' directive output may be truncated writing up to 1623 bytes into a region of size between 32 and 63 [-Wformat-truncation=] 829 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); | ^~ Link: https://lkml.kernel.org/r/20251222072359.3626182-1-xiujianfeng@huaweicloud.com Fixes: 47179fe03588 ("mm/hugetlb_cgroup: prepare cftypes based on template") Signed-off-by: Xiu Jianfeng Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512212332.9lFRbgdS-lkp@intel.com/ Reviewed-by: Andrew Morton Cc: "David Hildenbrand (Red Hat)" Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 58e895f3899a..7144d7d555eb 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -822,7 +822,7 @@ hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { *cft = *tmpl; /* rebuild the name */ - snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); + scnprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); /* rebuild the private */ cft->private = MEMFILE_PRIVATE(idx, tmpl->private); /* rebuild the file_offset */ -- cgit v1.2.3 From 9c9828d3ead69416d731b1238802af31760c823e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 19 Dec 2025 17:31:57 +0100 Subject: mm, page_alloc, thp: prevent reclaim for __GFP_THISNODE THP allocations Since commit cc638f329ef6 ("mm, thp: tweak reclaim/compaction effort of local-only and all-node allocations"), THP page fault allocations have settled on the following scheme (from the commit log): 1. local node only THP allocation with no reclaim, just compaction. 2. for madvised VMA's or when synchronous compaction is enabled always - THP allocation from any node with effort determined by global defrag setting and VMA madvise 3. fallback to base pages on any node Recent customer reports however revealed we have a gap in step 1 above. What we have seen is excessive reclaim due to THP page faults on a NUMA node that's close to its high watermark, while other nodes have plenty of free memory. The problem with step 1 is that it promises no reclaim after the compaction attempt, however reclaim is only avoided for certain compaction outcomes (deferred, or skipped due to insufficient free base pages), and not e.g. when compaction is actually performed but fails (we did see compact_fail vmstat counter increasing). THP page faults can therefore exhibit a zone_reclaim_mode-like behavior, which is not the intention. Thus add a check for __GFP_THISNODE that corresponds to this exact situation and prevents continuing with reclaim/compaction once the initial compaction attempt isn't successful in allocating the page. Note that commit cc638f329ef6 has not introduced this over-reclaim possibility; it appears to exist in some form since commit 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations"). Followup commits b39d0ee2632d ("mm, page_alloc: avoid expensive reclaim when compaction may not succeed") and cc638f329ef6 have moved in the right direction, but left the abovementioned gap. 
Link: https://lkml.kernel.org/r/20251219-costly-noretry-thisnode-fix-v1-1-e1085a4a0c34@suse.cz Fixes: 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations") Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Pedro Falcato Acked-by: Zi Yan Cc: Brendan Jackman Cc: "David Hildenbrand (Red Hat)" Cc: David Rientjes Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e1cc0c9ed947..3333524e879c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4818,6 +4818,20 @@ restart: compact_result == COMPACT_DEFERRED) goto nopage; + /* + * THP page faults may attempt local node only first, + * but are then allowed to only compact, not reclaim, + * see alloc_pages_mpol(). + * + * Compaction can fail for other reasons than those + * checked above and we don't want such THP allocations + * to put reclaim pressure on a single node in a + * situation where other nodes might have plenty of + * available memory. + */ + if (gfp_mask & __GFP_THISNODE) + goto nopage; + /* * Looks like reclaim/compaction is worth trying, but * sync compaction could be very expensive, so keep -- cgit v1.2.3 From 7969f3059493eeb1aa93151d3ad5aded6af4e836 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 19 Dec 2025 19:46:00 +0800 Subject: mm/gup: remove no longer used gup_fast_undo_dev_pagemap This helper is no longer used after commit fd2825b0760a ("mm/gup: remove pXX_devmap usage from get_user_pages()"). Link: https://lkml.kernel.org/r/20251219-gup-cleanup-v1-1-348a70d9eecb@tencent.com Signed-off-by: Kairui Song Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Alistair Popple Cc: Jason Gunthorpe Cc: John Hubbard Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/gup.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 95d948c8e86c..8e7dc2c6ee73 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2806,17 +2806,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) return !reject_file_backed || shmem_mapping(mapping); } -static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start, - unsigned int flags, struct page **pages) -{ - while ((*nr) - nr_start) { - struct folio *folio = page_folio(pages[--(*nr)]); - - folio_clear_referenced(folio); - gup_put_folio(folio, 1, flags); - } -} - #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * GUP-fast relies on pte change detection to avoid concurrent pgtable -- cgit v1.2.3 From 241b3a09639c317bdcaeea6721b7d1aabef341f9 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 19 Dec 2025 11:32:18 +0000 Subject: mm: clarify GFP_ATOMIC/GFP_NOWAIT doc-comment The current description of contexts where it's invalid to make GFP_ATOMIC and GFP_NOWAIT calls is rather vague. Replace this with a direct description of the actual contexts of concern and refer to the RT docs where this is explained more discursively. While rejigging this prose, also move the documentation of GFP_NOWAIT to the GFP_NOWAIT section. 
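As a rough illustration of the rule being documented (not part of the patch; the lock and function names below are made up), a non-raw spinlock section is still a valid context for these flags, while raw_spin_lock(), plain preempt_disable() under PREEMPT_RT, and NMI are not:

  #include <linux/slab.h>
  #include <linux/spinlock.h>

  static DEFINE_SPINLOCK(demo_lock);

  /* Hypothetical helper: allocate while holding a non-raw spinlock. */
  static void *demo_alloc_in_atomic(size_t len)
  {
          void *buf;

          spin_lock(&demo_lock);
          /*
           * Code under a spinlock must not assume it can sleep, so
           * GFP_KERNEL is out. GFP_ATOMIC may dip into the atomic
           * reserves; GFP_NOWAIT would also be legal here but fails
           * much more easily. Neither flag is usable under
           * raw_spin_lock()/preempt_disable() on PREEMPT_RT, nor
           * from NMI context.
           */
          buf = kmalloc(len, GFP_ATOMIC);
          spin_unlock(&demo_lock);
          return buf;
  }
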
Link: https://lore.kernel.org/all/d912480a-5229-4efe-9336-b31acded30f5@suse.cz/ Link: https://lkml.kernel.org/r/20251219-b4-gfp_atomic-comment-v2-1-4c4ce274c2b6@google.com Signed-off-by: Brendan Jackman Acked-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 3de43b12209e..814bb2892f99 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -309,8 +309,10 @@ enum { * * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower * watermark is applied to allow access to "atomic reserves". - * The current implementation doesn't support NMI and few other strict - * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT. + * The current implementation doesn't support NMI, nor contexts that disable + * preemption under PREEMPT_RT. This includes raw_spin_lock() and plain + * preempt_disable() - see "Memory allocation" in + * Documentation/core-api/real-time/differences.rst for more info. * * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. @@ -321,6 +323,7 @@ enum { * %GFP_NOWAIT is for kernel allocations that should not stall for direct * reclaim, start physical IO or use any filesystem callback. It is very * likely to fail to allocate memory, even for very small allocations. + * The same restrictions on calling contexts apply as for %GFP_ATOMIC. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. -- cgit v1.2.3 From 7db0787000d44d52710e5cdd67113458fa28f3cd Mon Sep 17 00:00:00 2001 From: Wentao Guan Date: Thu, 6 Nov 2025 19:09:29 +0800 Subject: mm: cleanup vma_iter_bulk_alloc commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()"), removed the only user and mas_expected_entries has been removed, since commit e3852a1213ffc ("maple_tree: Drop bulk insert support"). Also cleanup the mas_expected_entries in maple_tree.h. No functional change. Link: https://lkml.kernel.org/r/20251106110929.3522073-1-guanwentao@uniontech.com Signed-off-by: Wentao Guan Reviewed-by: Liam R. 
Howlett Cc: Anshuman Khandual Cc: Cheng Nie Cc: Guan Wentao Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Jann Horn Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 - mm/vma.h | 6 ------ 2 files changed, 7 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 1323c28a7470..7b8aad47121e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -528,7 +528,6 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); -int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); diff --git a/mm/vma.h b/mm/vma.h index 8526f22c9f5a..d51efd9da113 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -561,12 +561,6 @@ static inline unsigned long vma_iter_end(struct vma_iterator *vmi) return vmi->mas.last + 1; } -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, - unsigned long count) -{ - return mas_expected_entries(&vmi->mas, count); -} - static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { -- cgit v1.2.3 From 9e80e66ddaf736e5ca80cba8adf8d497bd53092f Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Sun, 21 Dec 2025 07:56:03 -0500 Subject: mm, hugetlb: implement movable_gigantic_pages sysctl This reintroduces a concept removed by: commit d6cb41cc44c6 ("mm, hugetlb: remove hugepages_treat_as_movable sysctl") This sysctl provides flexibility between ZONE_MOVABLE use cases: 1) onlining memory in ZONE_MOVABLE to maintain hotplug compatibility 2) onlining memory in ZONE_MOVABLE to make hugepage allocate reliable When ZONE_MOVABLE is used to make huge page allocation more reliable, disallowing gigantic pages memory in this region is pointless. If hotplug is not a requirement, we can loosen the restrictions to allow 1GB gigantic pages in ZONE_MOVABLE. Since 1GB can be difficult to migrate / has impacts on compaction / defragmentation, we don't enable this by default. Notably, 1GB pages can only be migrated if another 1GB page is available - so hot-unplug will fail if such a page cannot be found. However, since there are scenarios where gigantic pages are migratable, we should allow use of these on movable regions. When not valid 1GB is available for migration, hot-unplug will retry indefinitely (or until interrupted). For example: echo 0 > node0/hugepages/..-1GB/nr_hugepages # clear node0 1GB pages echo 1 > node1/hugepages/..-1GB/nr_hugepages # reserve node1 1GB page ./alloc_huge_node1 & # Allocate a 1GB page on node1 ./node1_offline & # attempt to offline all node1 memory echo 1 > node0/hugepages/..-1GB/nr_hugepages # reserve node0 1GB page In this example, node1_offline will block indefinitely until the final step, when a node0 1GB page is made available. Note: Boot-time CMA is not possible for driver-managed hotplug memory, as CMA requires the memory to be registered as SystemRAM at boot time. Additionally, 1GB huge pages are not supported by THP. 
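For reference, the new knob lives under /proc/sys/vm (see the documentation change below), so it can be flipped with a plain write. The tiny userspace helper here is only an illustration, equivalent to `echo 1 > /proc/sys/vm/movable_gigantic_pages`, and is not part of the patch:

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          /*
           * The file exists only on kernels with this patch and
           * CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION enabled.
           */
          int fd = open("/proc/sys/vm/movable_gigantic_pages", O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          if (write(fd, "1\n", 2) != 2)
                  perror("write");
          close(fd);
          return 0;
  }
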
Link: https://lkml.kernel.org/r/20251221125603.2364174-1-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: David Rientjes Link: https://lore.kernel.org/all/20180201193132.Hk7vI_xaU%25akpm@linux-foundation.org/ Acked-by: David Hildenbrand (Red Hat) Acked-by: David Rientjes Cc: Mel Gorman Cc: Michal Hocko Cc: "David Hildenbrand (Red Hat)" Cc: Gregory Price Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/memory-hotplug.rst | 14 +++++++++++-- Documentation/admin-guide/sysctl/vm.rst | 28 +++++++++++++++++++++++++ include/linux/hugetlb.h | 3 ++- mm/hugetlb_sysctl.c | 11 ++++++++++ 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 33c886f3d198..6581558fd0d7 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -612,8 +612,9 @@ ZONE_MOVABLE, especially when fine-tuning zone ratios: allocations and silently create a zone imbalance, usually triggered by inflation requests from the hypervisor. -- Gigantic pages are unmovable, resulting in user space consuming a - lot of unmovable memory. +- Gigantic pages are unmovable when an architecture does not support + huge page migration and/or the ``movable_gigantic_pages`` sysctl is false. + See Documentation/admin-guide/sysctl/vm.rst for more info on this sysctl. - Huge pages are unmovable when an architectures does not support huge page migration, resulting in a similar issue as with gigantic pages. @@ -672,6 +673,15 @@ block might fail: - Concurrent activity that operates on the same physical memory area, such as allocating gigantic pages, can result in temporary offlining failures. +- When an admin sets the ``movable_gigantic_pages`` sysctl to true, gigantic + pages are allowed in ZONE_MOVABLE. This only allows migratable gigantic + pages to be allocated; however, if there are no eligible destination gigantic + pages at offline, the offlining operation will fail. + + Users leveraging ``movable_gigantic_pages`` should weigh the value of + ZONE_MOVABLE for increasing the reliability of gigantic page allocation + against the potential loss of hot-unplug reliability. + - Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap Optimization (HVO) is enabled. diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index ca6ebeb5171c..b98ccb5cb210 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -53,6 +53,7 @@ Currently, these files are in /proc/sys/vm: - mmap_min_addr - mmap_rnd_bits - mmap_rnd_compat_bits +- movable_gigantic_pages - nr_hugepages - nr_hugepages_mempolicy - nr_overcommit_hugepages @@ -620,6 +621,33 @@ This value can be changed after boot using the /proc/sys/vm/mmap_rnd_compat_bits tunable +movable_gigantic_pages +====================== + +This parameter controls whether gigantic pages may be allocated from +ZONE_MOVABLE. If set to non-zero, gigantic pages can be allocated +from ZONE_MOVABLE. ZONE_MOVABLE memory may be created via the kernel +boot parameter `kernelcore` or via memory hotplug as discussed in +Documentation/admin-guide/mm/memory-hotplug.rst. + +Support may depend on specific architecture. 
+ +Note that using ZONE_MOVABLE gigantic pages make memory hotremove unreliable. + +Memory hot-remove operations will block indefinitely until the admin reserves +sufficient gigantic pages to service migration requests associated with the +memory offlining process. As HugeTLB gigantic page reservation is a manual +process (via `nodeN/hugepages/.../nr_hugepages` interfaces) this may not be +obvious when just attempting to offline a block of memory. + +Additionally, as multiple gigantic pages may be reserved on a single block, +it may appear that gigantic pages are available for migration when in reality +they are in the process of being removed. For example if `memoryN` contains +two gigantic pages, one reserved and one allocated, and an admin attempts to +offline that block, this operations may hang indefinitely unless another +reserved gigantic page is available on another block `memoryM`. + + nr_hugepages ============ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e51b8ef0cebd..694f6e83c637 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,6 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); +extern int movable_gigantic_pages __read_mostly; extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; @@ -929,7 +930,7 @@ static inline bool hugepage_movable_supported(struct hstate *h) if (!hugepage_migration_supported(h)) return false; - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic(h) && !movable_gigantic_pages) return false; return true; } diff --git a/mm/hugetlb_sysctl.c b/mm/hugetlb_sysctl.c index bd3077150542..e74cf18ad431 100644 --- a/mm/hugetlb_sysctl.c +++ b/mm/hugetlb_sysctl.c @@ -8,6 +8,8 @@ #include "hugetlb_internal.h" +int movable_gigantic_pages; + #ifdef CONFIG_SYSCTL static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *length, @@ -125,6 +127,15 @@ static const struct ctl_table hugetlb_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION + { + .procname = "movable_gigantic_pages", + .data = &movable_gigantic_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif }; void __init hugetlb_sysctl_init(void) -- cgit v1.2.3 From 3bb64898f00368cd110d4b31334f93251d56b404 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Sun, 21 Dec 2025 07:46:56 -0500 Subject: page_alloc: allow migration of smaller hugepages during contig_alloc We presently skip regions with hugepages entirely when trying to do contiguous page allocation. This will cause otherwise-movable 2MB HugeTLB pages to be considered unmovable, and makes 1GB gigantic page allocation less reliable on systems utilizing both. Commit 4d73ba5fa710 ("mm: page_alloc: skip regions with hugetlbfs pages when allocating 1G pages") skipped all HugePage containing regions because it can cause significant delays in 1G allocation (as HugeTLB migrations may fail for a number of reasons). Instead, if hugepage migration is enabled, consider regions with hugepages smaller than the target contiguous allocation request as valid targets for allocation. We optimize for the existing behavior by searching for non-hugetlb regions in a first pass, then retrying the search to include hugetlb only on failure. This allows the existing fast-path to remain the default case with a slow-path fallback to increase reliability. 
We only fallback to the slow path if a hugetlb region was detected, and we do a full re-scan because the zones/blocks may have changed during the first pass (and it's not worth further complexity). isolate_migrate_pages_block() has similar hugetlb filter logic, and the hugetlb code does a migratable check in folio_isolate_hugetlb() during isolation. The code servicing the allocation and migration already supports this exact use case. To test, allocate a bunch of 2MB HugeTLB pages (in this case 48GB) and then attempt to allocate some 1G HugeTLB pages (in this case 4GB) (Scale to your machine's memory capacity). echo 24576 > .../hugepages-2048kB/nr_hugepages echo 4 > .../hugepages-1048576kB/nr_hugepages Prior to this patch, the 1GB page reservation can fail if no contiguous 1GB pages remain. After this patch, the kernel will try to move 2MB pages and successfully allocate the 1GB pages (assuming overall sufficient memory is available). Also tested this while a program had the 2MB reservations mapped, and the 1GB reservation still succeeds. folio_alloc_gigantic() is the primary user of alloc_contig_pages(), other users are debug or init-time allocations and largely unaffected. - ppc/memtrace is a debugfs interface - x86/tdx memory allocation occurs once on module-init - kfence/core happens once on module (late) init - THP uses it in debug_vm_pgtable_alloc_huge_page at __init time Link: https://lkml.kernel.org/r/20251221124656.2362540-1-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: David Hildenbrand Link: https://lore.kernel.org/linux-mm/6fe3562d-49b2-4975-aa86-e139c535ad00@redhat.com/ Reviewed-by: Zi Yan Reviewed-by: Wei Yang Acked-by: Michal Hocko Acked-by: David Hildenbrand (Red Hat) Cc: Brendan Jackman Cc: Johannes Weiner Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3333524e879c..bc3ee3102b19 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7136,7 +7136,8 @@ static int __alloc_contig_pages(unsigned long start_pfn, } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, bool skip_hugetlb, + bool *skipped_hugetlb) { unsigned long i, end_pfn = start_pfn + nr_pages; struct page *page; @@ -7152,8 +7153,42 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; - if (PageHuge(page)) - return false; + /* + * Only consider ranges containing hugepages if those pages are + * smaller than the requested contiguous region. e.g.: + * Move 2MB pages to free up a 1GB range. + * Don't move 1GB pages to free up a 2MB range. + * + * This makes contiguous allocation more reliable if multiple + * hugepage sizes are used without causing needless movement. + */ + if (PageHuge(page)) { + unsigned int order; + + if (!IS_ENABLED(CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION)) + return false; + + if (skip_hugetlb) { + *skipped_hugetlb = true; + return false; + } + + page = compound_head(page); + order = compound_order(page); + if ((order >= MAX_FOLIO_ORDER) || + (nr_pages <= (1 << order))) + return false; + + /* + * Reaching this point means we've encounted a huge page + * smaller than nr_pages, skip all pfn's for that page. 
+ * + * We can't get here from a tail-PageHuge, as it implies + * we started a scan in the middle of a hugepage larger + * than nr_pages - which the prior check filters for. + */ + i += (1 << order) - 1; + } } return true; } @@ -7196,7 +7231,10 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + bool skip_hugetlb = true; + bool skipped_hugetlb = false; +retry: zonelist = node_zonelist(nid, gfp_mask); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { @@ -7204,7 +7242,9 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, pfn = ALIGN(zone->zone_start_pfn, nr_pages); while (zone_spans_last_pfn(zone, pfn, nr_pages)) { - if (pfn_range_valid_contig(zone, pfn, nr_pages)) { + if (pfn_range_valid_contig(zone, pfn, nr_pages, + skip_hugetlb, + &skipped_hugetlb)) { /* * We release the zone lock here because * alloc_contig_range() will also lock the zone @@ -7223,6 +7263,17 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, } spin_unlock_irqrestore(&zone->lock, flags); } + /* + * If we failed, retry the search, but treat regions with HugeTLB pages + * as valid targets. This retains fast-allocations on first pass + * without trying to migrate HugeTLB pages (which may fail). On the + * second pass, we will try moving HugeTLB pages when those pages are + * smaller than the requested contiguous region size. + */ + if (skip_hugetlb && skipped_hugetlb) { + skip_hugetlb = false; + goto retry; + } return NULL; } #endif /* CONFIG_CONTIG_ALLOC */ -- cgit v1.2.3 From 8e46adb62fae98a866baa7c23f6ed3bfe02e6f88 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:37 +0800 Subject: selftests/mm/write_to_hugetlbfs: parse -s as size_t Patch series "selftests/mm: hugetlb cgroup charging: robustness fixes", v3. This series fixes a few issues in the hugetlb cgroup charging selftests (write_to_hugetlbfs.c + charge_reserved_hugetlb.sh) that show up on systems with large hugepages (e.g. 512MB) and when failures cause the test to wait indefinitely. On an aarch64 64k page kernel with 512MB hugepages, the test consistently fails in write_to_hugetlbfs with ENOMEM and then hangs waiting for the expected usage values. The root cause is that charge_reserved_hugetlb.sh mounts hugetlbfs with a fixed size=256M, which is smaller than a single hugepage, resulting in a mount with size=0 capacity. In addition, write_to_hugetlbfs previously parsed -s via atoi() into an int, which can overflow and print negative sizes. Reproducer / environment: - Kernel: 6.12.0-xxx.el10.aarch64+64k - Hugepagesize: 524288 kB (512MB) - ./charge_reserved_hugetlb.sh -cgroup-v2 - Observed mount: pagesize=512M,size=0 before this series After applying the series, the test completes successfully on the above setup. This patch (of 3): write_to_hugetlbfs currently parses the -s size argument with atoi() into an int. This silently accepts malformed input, cannot report overflow, and can truncate large sizes. === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k # ls /sys/kernel/mm/hugepages/hugepages-* hugepages-16777216kB/ hugepages-2048kB/ hugepages-524288kB/ #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... 
# Writing to this path: /mnt/huge/test # Writing this size: -1610612736 <-------- Switch the size variable to size_t and parse -s with sscanf("%zu", ...). Also print the size using %zu. This avoids incorrect behavior with large -s values and makes the utility more robust. Link: https://lkml.kernel.org/r/20251221122639.3168038-1-liwang@redhat.com Link: https://lkml.kernel.org/r/20251221122639.3168038-2-liwang@redhat.com Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Acked-by: Waiman Long Cc: David Hildenbrand Cc: Mark Brown Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/write_to_hugetlbfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 34c91f7e6128..ecb5f7619960 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -68,7 +68,7 @@ int main(int argc, char **argv) int key = 0; int *ptr = NULL; int c = 0; - int size = 0; + size_t size = 0; char path[256] = ""; enum method method = MAX_METHOD; int want_sleep = 0, private = 0; @@ -86,7 +86,10 @@ int main(int argc, char **argv) while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { switch (c) { case 's': - size = atoi(optarg); + if (sscanf(optarg, "%zu", &size) != 1) { + perror("Invalid -s."); + exit_usage(); + } break; case 'p': strncpy(path, optarg, sizeof(path) - 1); @@ -131,7 +134,7 @@ int main(int argc, char **argv) } if (size != 0) { - printf("Writing this size: %d\n", size); + printf("Writing this size: %zu\n", size); } else { errno = EINVAL; perror("size not found"); -- cgit v1.2.3 From 1aa1dd9cc595917882fb6db67725442956f79607 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:38 +0800 Subject: selftests/mm/charge_reserved_hugetlb: drop mount size for hugetlbfs charge_reserved_hugetlb.sh mounts a hugetlbfs instance at /mnt/huge with a fixed size of 256M. On systems with large base hugepages (e.g. 512MB), this is smaller than a single hugepage, so the hugetlbfs mount ends up with zero capacity (often visible as size=0 in mount output). As a result, write_to_hugetlbfs fails with ENOMEM and the test can hang waiting for progress. === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... # write_to_hugetlbfs: Error mapping the file: Cannot allocate memory # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 ... # mount |grep /mnt/huge none on /mnt/huge type hugetlbfs (rw,relatime,seclabel,pagesize=512M,size=0) # grep -i huge /proc/meminfo ... HugePages_Total: 10 HugePages_Free: 10 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 524288 kB Hugetlb: 5242880 kB Drop the mount args with 'size=256M', so the filesystem capacity is sufficient regardless of HugeTLB page size. 
Link: https://lkml.kernel.org/r/20251221122639.3168038-3-liwang@redhat.com Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests") Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Acked-by: Waiman Long Cc: Mark Brown Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index e1fe16bcbbe8..fa6713892d82 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -290,7 +290,7 @@ function run_test() { setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ @@ -344,7 +344,7 @@ function run_multiple_cgroup_test() { setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ -- cgit v1.2.3 From b618876f2e7055160cc5b98b4ff5cd8917e7b49e Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:39 +0800 Subject: selftests/mm/charge_reserved_hugetlb.sh: add waits with timeout helper The hugetlb cgroup usage wait loops in charge_reserved_hugetlb.sh were unbounded and could hang forever if the expected cgroup file value never appears (e.g. due to write_to_hugetlbfs in Error mapping). === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k # ls /sys/kernel/mm/hugepages/hugepages-* hugepages-16777216kB/ hugepages-2048kB/ hugepages-524288kB/ #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... # write_to_hugetlbfs: Error mapping the file: Cannot allocate memory # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 ... Introduce a small helper, wait_for_file_value(), and use it for: - waiting for reservation usage to drop to 0, - waiting for reservation usage to reach a given size, - waiting for fault usage to reach a given size. This makes the waits consistent and adds a hard timeout (60 tries with 1s sleep) so the test fails instead of stalling indefinitely. 
Link: https://lkml.kernel.org/r/20251221122639.3168038-4-liwang@redhat.com Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Cc: Mark Brown Cc: Shuah Khan Cc: Waiman Long Signed-off-by: Andrew Morton --- .../selftests/mm/charge_reserved_hugetlb.sh | 51 +++++++++++++--------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index fa6713892d82..447769657634 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -100,7 +100,7 @@ function setup_cgroup() { echo writing cgroup limit: "$cgroup_limit" echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file - echo writing reseravation limit: "$reservation_limit" + echo writing reservation limit: "$reservation_limit" echo "$reservation_limit" > \ $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file @@ -112,41 +112,50 @@ function setup_cgroup() { fi } +function wait_for_file_value() { + local path="$1" + local expect="$2" + local max_tries=60 + + if [[ ! -r "$path" ]]; then + echo "ERROR: cannot read '$path', missing or permission denied" + return 1 + fi + + for ((i=1; i<=max_tries; i++)); do + local cur="$(cat "$path")" + if [[ "$cur" == "$expect" ]]; then + return 0 + fi + echo "Waiting for $path to become '$expect' (current: '$cur') (try $i/$max_tries)" + sleep 1 + done + + echo "ERROR: timeout waiting for $path to become '$expect'" + return 1 +} + function wait_for_hugetlb_memory_to_get_depleted() { local cgroup="$1" local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get depleted. - while [ $(cat $path) != 0 ]; do - echo Waiting for hugetlb memory to get depleted. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "0" } function wait_for_hugetlb_memory_to_get_reserved() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory reservation to reach size $size. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "$size" } function wait_for_hugetlb_memory_to_get_written() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory to reach size $size. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "$size" } function write_hugetlbfs_and_get_usage() { -- cgit v1.2.3 From b47beff129c6193df3dd406f2db2628fcc09d1eb Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:21 +0800 Subject: selftests/mm: fix va_high_addr_switch.sh return value Patch series "Fix va_high_addr_switch.sh test failure - again", v2. The series address several issues exist for the va_high_addr_switch test: 1) the test return value is ignored in va_high_addr_switch.sh. 2) the va_high_addr_switch test requires 6 hugepages not 5. 3) the reurn value of the first test in va_high_addr_switch.c can be overridden by the second test. 4) the nr_hugepages setup in run_vmtests.sh for arm64 can be done in va_high_addr_switch.sh too. 5) update a comment for check_test_requirements. This patch: (of 5) The return value should be return value of va_high_addr_switch, otherwise a test failure would be silently ignored. 
Link: https://lkml.kernel.org/r/20251221040025.3159990-1-chuhu@redhat.com Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index a7d4b02b21dd..f89fe078a8e6 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -114,4 +114,6 @@ save_nr_hugepages # 4 keep_mapped pages, and one for tmp usage setup_nr_hugepages 5 ./va_high_addr_switch --run-hugetlb +retcode=$? restore_nr_hugepages +exit $retcode -- cgit v1.2.3 From b1f031e33cb5ae4be039a17613ad8da84c777e70 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:22 +0800 Subject: selftests/mm: allocate 6 hugepages in va_high_addr_switch.sh The va_high_addr_switch test requires 6 hugepages, not 5. If running the test directly by: ./va_high_addr_switch.sh, the test will hit a mmap 'FAIL' caused by not enough hugepages: mmap(addr_switch_hint - hugepagesize, 2*hugepagesize, MAP_HUGETLB): 0x7f330f800000 - OK mmap(addr_switch_hint , 2*hugepagesize, MAP_FIXED | MAP_HUGETLB): 0xffffffffffffffff - FAILED The failure can't be hit if run the tests by running 'run_vmtests.sh -t hugevm' because the nr_hugepages is set to 128 at the beginning of run_vmtests.sh and va_high_addr_switch.sh skip the setup of nr_hugepages because already enough. Link: https://lkml.kernel.org/r/20251221040025.3159990-2-chuhu@redhat.com Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index f89fe078a8e6..a0c93d348b11 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -111,8 +111,8 @@ setup_nr_hugepages() check_test_requirements save_nr_hugepages -# 4 keep_mapped pages, and one for tmp usage -setup_nr_hugepages 5 +# The HugeTLB tests require 6 pages +setup_nr_hugepages 6 ./va_high_addr_switch --run-hugetlb retcode=$? restore_nr_hugepages -- cgit v1.2.3 From 7544d7969d84c1c4a078d1c5a7d4117fbf6f385c Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:23 +0800 Subject: selftests/mm: remove arm64 nr_hugepages setup for va_high_addr_switch test arm64 and x86_64 has the same nr_hugepages requriement for running the va_high_addr_switch test. Since commit d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test"), the setup can be done in va_high_addr_switch.sh. So remove the duplicated setup. 
Link: https://lkml.kernel.org/r/20251221040025.3159990-3-chuhu@redhat.com Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index d9173f2312b7..2dadbfc6e535 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -412,15 +412,7 @@ if [ $VADDR64 -ne 0 ]; then fi # va high address boundary switch test - ARCH_ARM64="arm64" - prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages) - if [ "$ARCH" == "$ARCH_ARM64" ]; then - echo 6 > /proc/sys/vm/nr_hugepages - fi CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh - if [ "$ARCH" == "$ARCH_ARM64" ]; then - echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages - fi fi # VADDR64 # vmalloc stability smoke test -- cgit v1.2.3 From dd0202a0bd81c33096f3d473c296cad997baba5b Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:24 +0800 Subject: selftests/mm: va_high_addr_switch return fail when either test failed When the first test failed, and the hugetlb test passed, the result would be pass, but we expect a fail. Fix this issue by returning fail if either is not KSFT_PASS. Link: https://lkml.kernel.org/r/20251221040025.3159990-4-chuhu@redhat.com Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 02f290a69132..51401e081b20 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -322,7 +322,7 @@ static int supported_arch(void) int main(int argc, char **argv) { - int ret; + int ret, hugetlb_ret = KSFT_PASS; if (!supported_arch()) return KSFT_SKIP; @@ -331,6 +331,10 @@ int main(int argc, char **argv) ret = run_test(testcases, sz_testcases); if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); - return ret; + hugetlb_ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); + + if (ret == KSFT_PASS && hugetlb_ret == KSFT_PASS) + return KSFT_PASS; + else + return KSFT_FAIL; } -- cgit v1.2.3 From 6319c4f44234c3849fdb2c3f72c45353aa428d3f Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:25 +0800 Subject: selftests/mm: fix comment for check_test_requirements The test supports arm64 as well so the comment is incorrect. And there's a check for arm64 in va_high_addr_switch.c. 
Link: https://lkml.kernel.org/r/20251221040025.3159990-5-chuhu@redhat.com Fixes: 983e760bcdb6 ("selftest/mm: va_high_addr_switch: add ppc64 support check") Fixes: f556acc2facd ("selftests/mm: skip test for non-LPA2 and non-LVA systems") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index a0c93d348b11..9492c2d72634 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -61,9 +61,9 @@ check_supported_ppc64() check_test_requirements() { - # The test supports x86_64 and powerpc64. We currently have no useful - # eligibility check for powerpc64, and the test itself will reject other - # architectures. + # The test supports x86_64, powerpc64 and arm64. There's check for arm64 + # in va_high_addr_switch.c. The test itself will reject other architectures. + case `uname -m` in "x86_64") check_supported_x86_64 -- cgit v1.2.3 From a8d933dc3354bfb9db1fc0e09c289ec1778ee271 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 25 Dec 2025 21:02:13 +0000 Subject: mm/vmstat: remove unused node and zone state helpers Several helper functions for managing node and zone states have become obsolete and no longer have any callers within the kernel. inc_node_state() inc_zone_state() dec_zone_state() This commit removes the dead code. Link: https://lkml.kernel.org/r/20251225210213.2553-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Joshua Hahn Acked-by: David Hildenbrand (Red Hat) Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 6 ------ mm/vmstat.c | 15 --------------- 2 files changed, 21 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3398a345bda8..cf559e2ce1d4 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -286,10 +286,8 @@ void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); -extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); -extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); @@ -394,10 +392,6 @@ static inline void __dec_node_page_state(struct page *page, #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state -#define inc_zone_state __inc_zone_state -#define inc_node_state __inc_node_state -#define dec_zone_state __dec_zone_state - #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } diff --git a/mm/vmstat.c b/mm/vmstat.c index bd2af431ff86..6ae8891c9693 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -672,11 +672,6 @@ void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } EXPORT_SYMBOL(mod_node_page_state); -void inc_node_state(struct 
pglist_data *pgdat, enum node_stat_item item) -{ - mod_node_state(pgdat, item, 1, 1); -} - void inc_node_page_state(struct page *page, enum node_stat_item item) { mod_node_state(page_pgdat(page), item, 1, 1); @@ -725,16 +720,6 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(dec_zone_page_state); -void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -{ - unsigned long flags; - - local_irq_save(flags); - __inc_node_state(pgdat, item); - local_irq_restore(flags); -} -EXPORT_SYMBOL(inc_node_state); - void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long delta) { -- cgit v1.2.3 From 6c59085fc09428b2c168bb9fa1cab760e0831914 Mon Sep 17 00:00:00 2001 From: Shu Anzai Date: Wed, 24 Dec 2025 04:21:56 +0000 Subject: mm/damon/tests/core-kunit: verify the 'age' field in damon_test_split_at() Patch series "mm/damon/tests/core-kunit: extend existing test scenarios", v2. Improve the KUnit test coverage for DAMON. The five patches in this series respectively extend damon_test_split_at(), damon_test_merge_two(), damon_test_merge_regions_of(), damon_test_split_regions_of(), and damos_test_commit_quota_goal(). This patch (of 5): Extend damon_test_split_at() to verify the 'age' field. Link: https://lkml.kernel.org/r/20251224042200.2061847-1-shu17az@gmail.com Link: https://lkml.kernel.org/r/20251224042200.2061847-2-shu17az@gmail.com Signed-off-by: Shu Anzai Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index f59ae7ee19a0..88ec046f4942 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -158,6 +158,7 @@ static void damon_test_split_at(struct kunit *test) r->nr_accesses_bp = 420000; r->nr_accesses = 42; r->last_nr_accesses = 15; + r->age = 10; damon_add_region(r, t); damon_split_region_at(t, r, 25); KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); @@ -170,6 +171,7 @@ static void damon_test_split_at(struct kunit *test) KUNIT_EXPECT_EQ(test, r->nr_accesses_bp, r_new->nr_accesses_bp); KUNIT_EXPECT_EQ(test, r->nr_accesses, r_new->nr_accesses); KUNIT_EXPECT_EQ(test, r->last_nr_accesses, r_new->last_nr_accesses); + KUNIT_EXPECT_EQ(test, r->age, r_new->age); damon_free_target(t); } -- cgit v1.2.3 From 738dae96b2fb69c15c99f95ed0044bc12830dcba Mon Sep 17 00:00:00 2001 From: Shu Anzai Date: Wed, 24 Dec 2025 04:21:57 +0000 Subject: mm/damon/tests/core-kunit: verify the 'age' and 'nr_accesses_bp' fields in damon_test_merge_two() Extend damon_test_merge_two() to verify the 'age' and 'nr_accesses_bp' fields. 
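For context on the expected values in the diff below, assuming DAMON's usual size-weighted merge of two adjacent regions: the regions span 100 and 200 bytes, so the merged nr_accesses is (10*100 + 20*200)/300 = 16 and the merged age is (9*100 + 21*200)/300 = 17, while nr_accesses_bp is rescaled from the merged nr_accesses as 16 * 10000 = 160000.
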
Link: https://lkml.kernel.org/r/20251224042200.2061847-3-shu17az@gmail.com Signed-off-by: Shu Anzai Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 88ec046f4942..6e301113e103 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -192,6 +192,7 @@ static void damon_test_merge_two(struct kunit *test) } r->nr_accesses = 10; r->nr_accesses_bp = 100000; + r->age = 9; damon_add_region(r, t); r2 = damon_new_region(100, 300); if (!r2) { @@ -200,12 +201,15 @@ static void damon_test_merge_two(struct kunit *test) } r2->nr_accesses = 20; r2->nr_accesses_bp = 200000; + r2->age = 21; damon_add_region(r2, t); damon_merge_two_regions(t, r, r2); KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); KUNIT_EXPECT_EQ(test, r->ar.end, 300ul); KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u); + KUNIT_EXPECT_EQ(test, r->nr_accesses_bp, 160000u); + KUNIT_EXPECT_EQ(test, r->age, 17u); i = 0; damon_for_each_region(r3, t) { -- cgit v1.2.3 From 65a17a3e609f63c7ea2887096dc232a6c05d02a2 Mon Sep 17 00:00:00 2001 From: Shu Anzai Date: Wed, 24 Dec 2025 04:21:58 +0000 Subject: mm/damon/tests/core-kunit: add a test case for region merge size limit in damon_test_merge_regions_of() Add a test case in damon_test_merge_regions_of() to verify that two adjacent regions are not merged if the resulting region would exceed the specified size limit. Link: https://lkml.kernel.org/r/20251224042200.2061847-4-shu17az@gmail.com Signed-off-by: Shu Anzai Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 6e301113e103..2eb6f41635a8 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -238,12 +238,12 @@ static void damon_test_merge_regions_of(struct kunit *test) { struct damon_target *t; struct damon_region *r; - unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184}; - unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230}; - unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2}; + unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184, 230}; + unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230, 10170}; + unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2, 5}; - unsigned long saddrs[] = {0, 114, 130, 156, 170}; - unsigned long eaddrs[] = {112, 130, 156, 170, 230}; + unsigned long saddrs[] = {0, 114, 130, 156, 170, 230}; + unsigned long eaddrs[] = {112, 130, 156, 170, 230, 10170}; int i; t = damon_new_target(); @@ -261,9 +261,9 @@ static void damon_test_merge_regions_of(struct kunit *test) } damon_merge_regions_of(t, 9, 9999); - /* 0-112, 114-130, 130-156, 156-170 */ - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); - for (i = 0; i < 5; i++) { + /* 0-112, 114-130, 130-156, 156-170, 170-230, 230-10170 */ + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 6u); + for (i = 0; i < 6; i++) { r = __nth_region_of(t, i); KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]); KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]); -- cgit v1.2.3 From 2caf45764a4fdb5d35524e364c963bb9e2d07fce Mon Sep 17 00:00:00 2001 From: Shu Anzai Date: Wed, 24 Dec 2025 04:21:59 +0000 Subject: mm/damon/tests/core-kunit: add test cases for multiple regions in damon_test_split_regions_of() Extend damon_test_split_regions_of() to verify that it correctly handles multiple regions with various 'min_sz_region'. 
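For reference, the 12-region upper bound in the new case below comes from the three initial regions each being split into at most four sub-regions (3 * 4 = 12), with min_sz_region = 5 bounding how small each resulting piece may become.
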
[sj@kernel.org: remove braces in damon_test_split_regions_of()] Link: https://lkml.kernel.org/r/20251224153125.69194-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251224042200.2061847-5-shu17az@gmail.com Signed-off-by: Shu Anzai Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 2eb6f41635a8..252ce3e001c8 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -275,6 +275,9 @@ static void damon_test_split_regions_of(struct kunit *test) { struct damon_target *t; struct damon_region *r; + unsigned long sa[] = {0, 300, 500}; + unsigned long ea[] = {220, 400, 700}; + int i; t = damon_new_target(); if (!t) @@ -301,6 +304,23 @@ static void damon_test_split_regions_of(struct kunit *test) damon_split_regions_of(t, 4, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); + + t = damon_new_target(); + if (!t) + kunit_skip(test, "third target alloc fail"); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } + damon_add_region(r, t); + } + damon_split_regions_of(t, 4, 5); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 12u); + damon_for_each_region(r, t) + KUNIT_EXPECT_GE(test, damon_sz_region(r) % 5ul, 0ul); + damon_free_target(t); } static void damon_test_ops_registration(struct kunit *test) -- cgit v1.2.3 From 860996495f989fd86b7f1525d7500a6e15986a24 Mon Sep 17 00:00:00 2001 From: Shu Anzai Date: Wed, 24 Dec 2025 04:22:00 +0000 Subject: mm/damon/tests/core-kunit: remove a redundant test case and add a new test case in damos_test_commit_quota_goal() Remove a redundant test case from damos_test_commit_quota_goal() as it is already covered. Instead, add a new test for DAMOS_QUOTA_SOME_MEM_PSI_US, which was previously not tested. Link: https://lkml.kernel.org/r/20251224042200.2061847-6-shu17az@gmail.com Signed-off-by: Shu Anzai Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 252ce3e001c8..92ea25e2dc9e 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -600,9 +600,10 @@ static void damos_test_commit_quota_goal(struct kunit *test) }); damos_test_commit_quota_goal_for(test, &dst, &(struct damos_quota_goal) { - .metric = DAMOS_QUOTA_USER_INPUT, - .target_value = 789, - .current_value = 12, + .metric = DAMOS_QUOTA_SOME_MEM_PSI_US, + .target_value = 234, + .current_value = 345, + .last_psi_total = 567, }); } -- cgit v1.2.3 From 29ec27805f55122252c3973e4edae82676cc737d Mon Sep 17 00:00:00 2001 From: Dipendra Khadka Date: Sun, 28 Dec 2025 15:44:55 +0000 Subject: mm/oom_kill: remove unnecessary integer promotion in format string The 'h' length modifier in '%hd' is unnecessary as short integers are promoted to int in variadic functions. Use '%d' instead. Checkpatch flags the 'h' modifier as unnecessary for this reason, and many other subsystems have moved to using %d for promoted types. Hence, I think this patch aligns with kernel coding practices. 
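The change relies on C's default argument promotions: a short passed through the variadic part of a printf()-style function is promoted to int before the callee sees it, so '%hd' and '%d' consume the same argument and print the same value. A minimal userspace demonstration (oom_score_adj is a short in struct signal_struct):

#include <stdio.h>

int main(void)
{
	short oom_score_adj = -1000;

	/* the 'h' modifier only converts the promoted int back to short */
	printf("%hd\n", oom_score_adj);
	/* identical output; this is what the patch below switches to */
	printf("%d\n", oom_score_adj);
	return 0;
}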
Link: https://lkml.kernel.org/r/20251228154456.2386-1-kdipendra88@gmail.com Signed-off-by: Dipendra Khadka Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5eb11fbba704..94066316e3ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -458,7 +458,7 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) static void dump_header(struct oom_control *oc) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%d\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) @@ -958,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%d\n", message, task_pid_nr(victim), victim->comm, K(mm->total_vm), K(get_mm_counter(mm, MM_ANONPAGES)), K(get_mm_counter(mm, MM_FILEPAGES)), -- cgit v1.2.3 From f9b74c13b773b7c7e4920d7bc214ea3d5f37b422 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 31 Dec 2025 03:00:26 +0000 Subject: mm/mmu_gather: remove @delay_remap of __tlb_remove_page_size() __tlb_remove_page_size() is only used in tlb_remove_page_size() with @delay_remap set to false and it is passed directly to __tlb_remove_folio_pages_size(). Remove @delay_remap of __tlb_remove_page_size() and call __tlb_remove_folio_pages_size() with false @delay_remap. Link: https://lkml.kernel.org/r/20251231030026.15938-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: SeongJae Park Acked-by: David Hildenbrand (Red Hat) Acked-by: Will Deacon Acked-by: Heiko Carstens # s390 Cc: Alexander Gordeev Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Christian Borntraeger Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 6 ++---- include/asm-generic/tlb.h | 5 ++--- mm/mmu_gather.c | 5 ++--- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 1e50f6f1ad9d..0b7b4df94b24 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -24,7 +24,7 @@ static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, bool delay_rmap, int page_size); + struct page *page, int page_size); static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); @@ -46,10 +46,8 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, * s390 doesn't delay rmap removal. 
*/ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, bool delay_rmap, int page_size) + struct page *page, int page_size) { - VM_WARN_ON_ONCE(delay_rmap); - free_folio_and_swap_cache(page_folio(page)); return false; } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 4d679d2a206b..3975f7d11553 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -287,8 +287,7 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size); +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); @@ -510,7 +509,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, false, page_size)) + if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); } diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 7468ec388455..2faa23d7f8d4 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -210,10 +210,9 @@ bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, PAGE_SIZE); } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); + return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size); } #endif /* MMU_GATHER_NO_GATHER */ -- cgit v1.2.3 From 5173ae0a068d64643ccf4915b7cbedf82810a592 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 18 Jan 2026 19:09:40 +0000 Subject: mm/khugepaged: map dirty/writeback pages failures to EAGAIN Patch series "mm/khugepaged: fix dirty page handling for MADV_COLLAPSE", v5. MADV_COLLAPSE on file-backed mappings fails with -EINVAL when TEXT pages are dirty. This affects scenarios like package/container updates or executing binaries immediately after writing them, etc. The issue is that collapse_file() triggers async writeback and returns SCAN_FAIL (maps to -EINVAL), expecting khugepaged to revisit later. But MADV_COLLAPSE is synchronous and userspace expects immediate success or a clear retry signal. Reproduction: - Compile or copy 2MB-aligned executable to XFS/ext4 FS - Call MADV_COLLAPSE on .text section - First call fails with -EINVAL (text pages dirty from copy) - Second call succeeds (async writeback completed) Issue Report: https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com This patch (of 2): When collapse_file encounters dirty or writeback pages in file-backed mappings, it currently returns SCAN_FAIL which maps to -EINVAL. This is misleading as EINVAL suggests invalid arguments, whereas dirty/writeback pages represent transient conditions that may resolve on retry. Introduce SCAN_PAGE_DIRTY_OR_WRITEBACK to cover both dirty and writeback states, mapping it to -EAGAIN. For MADV_COLLAPSE, this provides userspace with a clear signal that retry may succeed after writeback completes. For khugepaged, this is harmless as it will naturally revisit the range during periodic scans after async writeback completes. 
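From userspace, the practical effect of this change is that a failed MADV_COLLAPSE can now be classified: -EAGAIN means the pages were dirty or under writeback and a later retry may succeed, while -EINVAL remains a hard failure. Below is a hedged sketch of how a caller might use that distinction; MADV_COLLAPSE needs suitably recent kernel headers, and the fallback define is only for illustration.

#include <errno.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* fallback for older headers, illustration only */
#endif

/* returns 0 on success, 1 if retrying later makes sense, -1 on hard failure */
int try_collapse(void *addr, size_t len)
{
	if (!madvise(addr, len, MADV_COLLAPSE))
		return 0;
	if (errno == EAGAIN)	/* dirty/writeback pages: transient condition */
		return 1;
	return -1;		/* EINVAL etc.: retrying will not help */
}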
Link: https://lkml.kernel.org/r/20260118190939.8986-2-shivankg@amd.com Link: https://lkml.kernel.org/r/20260118190939.8986-4-shivankg@amd.com Fixes: 34488399fa08 ("mm/madvise: add file and shmem support to MADV_COLLAPSE") Signed-off-by: Shivank Garg Reported-by: Branden Moore Closes: https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com Reviewed-by: Dev Jain Reviewed-by: Lance Yang Reviewed-by: Baolin Wang Reviewed-by: wang lian Acked-by: David Hildenbrand (Red Hat) Cc: Barry Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Zach O'Keefe Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 3 ++- mm/khugepaged.c | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4cde53b45a85..4e41bff31888 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -37,7 +37,8 @@ EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ EM( SCAN_STORE_FAILED, "store_failed") \ EM( SCAN_COPY_MC, "copy_poisoned_page") \ - EMe(SCAN_PAGE_FILLED, "page_filled") + EM( SCAN_PAGE_FILLED, "page_filled") \ + EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback") #undef EM #undef EMe diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 97d1b2824386..219dfa2e523c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -58,6 +58,7 @@ enum scan_result { SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, + SCAN_PAGE_DIRTY_OR_WRITEBACK, }; #define CREATE_TRACE_POINTS @@ -1967,11 +1968,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, */ xas_unlock_irq(&xas); filemap_flush(mapping); - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_trylock(folio)) { folio_get(folio); @@ -2018,7 +2019,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, * folio is dirty because it hasn't been flushed * since first write. */ - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto out_unlock; } @@ -2747,6 +2748,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: + case SCAN_PAGE_DIRTY_OR_WRITEBACK: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to -- cgit v1.2.3 From 398556570e32af82aa7654e730bcd655712ecf08 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 18 Jan 2026 19:09:43 +0000 Subject: mm/khugepaged: retry with sync writeback for MADV_COLLAPSE When MADV_COLLAPSE is called on file-backed mappings (e.g., executable text sections), the pages may still be dirty from recent writes. collapse_file() will trigger async writeback and fail with SCAN_PAGE_DIRTY_OR_WRITEBACK (-EAGAIN). MADV_COLLAPSE is a synchronous operation where userspace expects immediate results. If the collapse fails due to dirty pages, perform synchronous writeback on the specific range and retry once. This avoids spurious failures for freshly written executables while avoiding unnecessary synchronous I/O for mappings that are already clean. 
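With the hunk below in place, the reproduction scenario described earlier in this series should succeed on the first call: the kernel flushes the affected range itself and retries once before giving up. The following is an illustrative userspace usage sketch, not part of the patch; error handling is elided and the function name is hypothetical.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* fallback for older headers, illustration only */
#endif

/* path points at a freshly copied, 2MB-aligned executable */
int collapse_text(const char *path, off_t text_off, size_t text_len)
{
	int fd = open(path, O_RDONLY);
	void *text = mmap(NULL, text_len, PROT_READ | PROT_EXEC,
			  MAP_PRIVATE, fd, text_off);
	/* no manual fsync()/retry dance needed anymore for freshly written files */
	int ret = madvise(text, text_len, MADV_COLLAPSE);

	munmap(text, text_len);
	close(fd);
	return ret;
}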
Link: https://lkml.kernel.org/r/20260118190939.8986-7-shivankg@amd.com Signed-off-by: Shivank Garg Reported-by: Branden Moore Closes: https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com Fixes: 34488399fa08 ("mm/madvise: add file and shmem support to MADV_COLLAPSE") Suggested-by: David Hildenbrand Tested-by: Lance Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Barry Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: wang lian Cc: Zach O'Keefe Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 219dfa2e523c..16582bdcb6ff 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "internal.h" @@ -2788,7 +2789,9 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { int result = SCAN_FAIL; + bool triggered_wb = false; +retry: if (!mmap_locked) { cond_resched(); mmap_read_lock(mm); @@ -2809,8 +2812,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, mmap_read_unlock(mm); mmap_locked = false; + *lock_dropped = true; result = hpage_collapse_scan_file(mm, addr, file, pgoff, cc); + + if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && + mapping_can_writeback(file->f_mapping)) { + loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; + loff_t lend = lstart + HPAGE_PMD_SIZE - 1; + + filemap_write_and_wait_range(file->f_mapping, lstart, lend); + triggered_wb = true; + fput(file); + goto retry; + } fput(file); } else { result = hpage_collapse_scan_pmd(mm, vma, addr, -- cgit v1.2.3 From ba1c86874e25e95de9b253570bb50cc3b5df542e Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:35 +0200 Subject: alpha: introduce arch_zone_limits_init() Patch series "arch, mm: consolidate hugetlb early reservation", v3. The order in which early memory reservation for hugetlb happens depends on the architecture, on configuration options and on command line parameters. Some architectures rely on the core MM to call hugetlb_bootmem_alloc() while others call it very early to allow pre-allocation of HVO-style vmemmap. When hugetlb_cma is supported by an architecture it is initialized during setup_arch(), and then later the hugetlb_init code needs to know whether that happened or not. To make everything consistent and unified, both the reservation of hugetlb memory from bootmem and the creation of CMA areas for hugetlb must be called from core MM initialization, and by itself that would have been a simple change. However, HVO-style pre-initialization ordering requirements slightly complicate things: for HVO pre-init to work, sparse and the memory map should be initialized after the hugetlb reservations. This required pulling the call to free_area_init() out of the setup_arch() path and moving it to MM initialization, and this is what the first 23 patches do. These changes are deliberately split into per-arch patches that change how the zone limits are calculated for each architecture, and patches 22 and 23 just remove the calls to free_area_init() and sparse_init() from arch/*. Patch 24 is a simple cleanup for MIPS. Patches 25 and 26 actually consolidate the hugetlb reservations, and patches 27 and 28 perform some aftermath cleanups. This patch (of 29): Move calculations of zone limits to a dedicated arch_zone_limits_init() function.
Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20260111082105.290734-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Magnus Lindholm Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 15 ++++++++++----- include/linux/mm.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 4c5ab9cd8a0a..cd0cb1abde5f 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -208,12 +208,8 @@ callback_init(void * kernel_end) return kernel_end; } -/* - * paging_init() sets up the memory map. - */ -void __init paging_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfn) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; unsigned long dma_pfn; dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; @@ -221,8 +217,17 @@ void __init paging_init(void) max_zone_pfn[ZONE_DMA] = dma_pfn; max_zone_pfn[ZONE_NORMAL] = max_pfn; +} + +/* + * paging_init() sets up the memory map. + */ +void __init paging_init(void) +{ + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; /* Initialize mem_map[]. */ + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); /* Initialize the kernel's ZERO_PGE. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index ab2e7e30aef9..477339b7a032 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3556,6 +3556,7 @@ static inline unsigned long get_num_physpages(void) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); +void arch_zone_limits_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); -- cgit v1.2.3 From 7988e85189048033a2784e8cf81c5d62dcd2af82 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:36 +0200 Subject: arc: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Vineet Gupta Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arc/mm/init.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index a73cc94f806e..ff7974d38011 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -75,6 +75,25 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) base, TO_MB(size), !in_use ? "Not used":""); } +void __init arch_zone_limits_init(unsigned long *max_zone_pfn) +{ + /*----------------- node/zones setup --------------------------*/ + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; + +#ifdef CONFIG_HIGHMEM + /* + * max_high_pfn should be ok here for both HIGHMEM and HIGHMEM+PAE. + * For HIGHMEM without PAE max_high_pfn should be less than + * min_low_pfn to guarantee that these two regions don't overlap. + * For PAE case highmem is greater than lowmem, so it is natural + * to use max_high_pfn. + * + * In both cases, holes should be handled by pfn_valid(). + */ + max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn; +#endif +} + /* * First memory setup routine called from setup_arch() * 1. setup swapper's mm @init_mm @@ -122,9 +141,6 @@ void __init setup_arch_memory(void) memblock_dump_all(); - /*----------------- node/zones setup --------------------------*/ - max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - #ifdef CONFIG_HIGHMEM /* * On ARC (w/o PAE) HIGHMEM addresses are actually smaller (0 based) @@ -139,21 +155,11 @@ void __init setup_arch_memory(void) min_high_pfn = PFN_DOWN(high_mem_start); max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz); - /* - * max_high_pfn should be ok here for both HIGHMEM and HIGHMEM+PAE. - * For HIGHMEM without PAE max_high_pfn should be less than - * min_low_pfn to guarantee that these two regions don't overlap. - * For PAE case highmem is greater than lowmem, so it is natural - * to use max_high_pfn. - * - * In both cases, holes should be handled by pfn_valid(). - */ - max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn; - arch_pfn_offset = min(min_low_pfn, min_high_pfn); kmap_init(); #endif /* CONFIG_HIGHMEM */ + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } -- cgit v1.2.3 From 30a66f8a8cd3ed3df69a672cf8f2a43eddd8a212 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:37 +0200 Subject: arm: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/mm/init.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 54bdca025c9f..bdcc3639681f 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -107,18 +107,23 @@ void __init setup_dma_zone(const struct machine_desc *mdesc) #endif } -static void __init zone_sizes_init(unsigned long min, unsigned long max_low, - unsigned long max_high) +void __init arch_zone_limits_init(unsigned long *max_zone_pfn) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - #ifdef CONFIG_ZONE_DMA - max_zone_pfn[ZONE_DMA] = min(arm_dma_pfn_limit, max_low); + max_zone_pfn[ZONE_DMA] = min(arm_dma_pfn_limit, max_low_pfn); #endif - max_zone_pfn[ZONE_NORMAL] = max_low; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - max_zone_pfn[ZONE_HIGHMEM] = max_high; + max_zone_pfn[ZONE_HIGHMEM] = max_pfn; #endif +} + +static void __init zone_sizes_init(unsigned long min, unsigned long max_low, + unsigned long max_high) +{ + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; + + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } -- cgit v1.2.3 From 18b7cc70dea8b4071ebf736724e99103115c5f95 Mon Sep 17 00:00:00 2001 From: Klara Modin Date: Sun, 11 Jan 2026 10:20:38 +0200 Subject: arm: make initialization of zero page independent of the memory map Unlike most architectures, arm keeps a struct page pointer to the empty_zero_page, and initializing it requires converting a virtual address to a page, which makes it necessary to have the memory map initialized before creating the empty_zero_page. Make empty_zero_page a static array in BSS to decouple its initialization from the initialization of the memory map. This also aligns arm with the vast majority of architectures. Link: https://lkml.kernel.org/r/20260111082105.290734-5-rppt@kernel.org Signed-off-by: Klara Modin Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S.
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/include/asm/pgtable.h | 4 ++-- arch/arm/mm/mmu.c | 10 +--------- arch/arm/mm/nommu.c | 10 +--------- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 86378eec7757..6fa9acd6a7f5 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -15,8 +15,8 @@ * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern struct page *empty_zero_page; -#define ZERO_PAGE(vaddr) (empty_zero_page) +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) #endif #include diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 8bac96e205ac..518def8314e7 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -45,7 +45,7 @@ extern unsigned long __atags_pointer; * empty_zero_page is a special page that is used for * zero-initialized data and COW. */ -struct page *empty_zero_page; +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); /* @@ -1754,8 +1754,6 @@ static void __init early_fixmap_shutdown(void) */ void __init paging_init(const struct machine_desc *mdesc) { - void *zero_page; - #ifdef CONFIG_XIP_KERNEL /* Store the kernel RW RAM region start/end in these variables */ kernel_sec_start = CONFIG_PHYS_OFFSET & SECTION_MASK; @@ -1781,13 +1779,7 @@ void __init paging_init(const struct machine_desc *mdesc) top_pmd = pmd_off_k(0xffff0000); - /* allocate the zero page. */ - zero_page = early_alloc(PAGE_SIZE); - bootmem_init(); - - empty_zero_page = virt_to_page(zero_page); - __flush_dcache_folio(NULL, page_folio(empty_zero_page)); } void __init early_mm_init(const struct machine_desc *mdesc) diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index d638cc87807e..7e42d8accec6 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -31,7 +31,7 @@ unsigned long vectors_base; * empty_zero_page is a special page that is used for * zero-initialized data and COW. */ -struct page *empty_zero_page; +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); #ifdef CONFIG_ARM_MPU @@ -156,18 +156,10 @@ void __init adjust_lowmem_bounds(void) */ void __init paging_init(const struct machine_desc *mdesc) { - void *zero_page; - early_trap_init((void *)vectors_base); mpu_setup(); - /* allocate the zero page. 
*/ - zero_page = (void *)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - bootmem_init(); - - empty_zero_page = virt_to_page(zero_page); - flush_dcache_page(empty_zero_page); } /* -- cgit v1.2.3 From 60b35af0a6aa6b92dab6dd6d2cb0b39647d07436 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:39 +0200 Subject: arm64: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. While on it rename zone_sizes_init() to dma_limits_init() to better reflect what that function does. Link: https://lkml.kernel.org/r/20260111082105.290734-6-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/init.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 524d34a0e921..06815d34cc11 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -118,7 +118,21 @@ static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; } -static void __init zone_sizes_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + phys_addr_t __maybe_unused dma32_phys_limit = + max_zone_phys(DMA_BIT_MASK(32)); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = PFN_DOWN(max_zone_phys(zone_dma_limit)); +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); +#endif + max_zone_pfns[ZONE_NORMAL] = max_pfn; +} + +static void __init dma_limits_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; phys_addr_t __maybe_unused acpi_zone_dma_limit; @@ -139,17 +153,15 @@ static void __init zone_sizes_init(void) if (memblock_start_of_DRAM() < U32_MAX) zone_dma_limit = min(zone_dma_limit, U32_MAX); arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); if (!arm64_dma_phys_limit) arm64_dma_phys_limit = dma32_phys_limit; #endif if (!arm64_dma_phys_limit) arm64_dma_phys_limit = PHYS_MASK + 1; - max_zone_pfns[ZONE_NORMAL] = max_pfn; + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); } @@ -319,7 +331,7 @@ void __init bootmem_init(void) * done after the fixed reservations */ sparse_init(); - zone_sizes_init(); + dma_limits_init(); /* * Reserve the CMA area after arm64_dma_phys_limit was initialised. 
-- cgit v1.2.3 From 37318eb97f2374f89be6a1ca1515004847e3cc2a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:40 +0200 Subject: csky: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Guo Ren Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/csky/kernel/setup.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index e0d6ca86ea8c..8968815d93e6 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -51,6 +51,14 @@ disable: } #endif +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif +} + static void __init csky_memblock_init(void) { unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET); @@ -83,12 +91,9 @@ static void __init csky_memblock_init(void) setup_initrd(); #endif - max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - mmu_init(min_low_pfn, max_low_pfn); #ifdef CONFIG_HIGHMEM - max_zone_pfn[ZONE_HIGHMEM] = max_pfn; highstart_pfn = max_low_pfn; highend_pfn = max_pfn; @@ -97,6 +102,7 @@ static void __init csky_memblock_init(void) dma_contiguous_reserve(0); + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } -- cgit v1.2.3 From 934afdf7f4cc243c5b00352a0f8a54d2de283fe9 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:41 +0200 Subject: hexagon: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/hexagon/mm/init.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 34eb9d424b96..e2c9487d8d34 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -54,6 +54,18 @@ void sync_icache_dcache(pte_t pte) __vmcache_idsync(addr, PAGE_SIZE); } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + /* + * This is not particularly well documented anywhere, but + * give ZONE_NORMAL all the memory, including the big holes + * left by the kernel+bootmem_map which are already left as reserved + * in the bootmem_map; free_area_init should see those bits and + * adjust accordingly. + */ + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + /* * In order to set up page allocator "nodes", * somebody has to call free_area_init() for UMA. @@ -65,16 +77,7 @@ static void __init paging_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - /* - * This is not particularly well documented anywhere, but - * give ZONE_NORMAL all the memory, including the big holes - * left by the kernel+bootmem_map which are already left as reserved - * in the bootmem_map; free_area_init should see those bits and - * adjust accordingly. - */ - - max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ /* -- cgit v1.2.3 From 63cadcb731c914c85577512b0688fd62350644a8 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:42 +0200 Subject: loongarch: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/loongarch/mm/init.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 0946662afdd6..17235f87eafb 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -60,15 +60,19 @@ int __ref page_is_ram(unsigned long pfn) return memblock_is_memory(addr) && !memblock_is_reserved(addr); } -void __init paging_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + +void __init paging_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); } -- cgit v1.2.3 From 41b08a7abf890deeaa7a71d1cbad9879b605d8ea Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:43 +0200 Subject: m68k: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Since all variants of m68k add all memory to ZONE_DMA, it is possible to use unified implementation for arch_zone_limits_init() that sets the end of ZONE_DMA to memblock_end_of_DRAM(). Link: https://lkml.kernel.org/r/20260111082105.290734-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/m68k/mm/init.c | 7 ++++++- arch/m68k/mm/mcfmmu.c | 2 +- arch/m68k/mm/motorola.c | 2 +- arch/m68k/mm/sun3mmu.c | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 488411af1b3f..6b1d9d2434b5 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -40,6 +40,11 @@ void *empty_zero_page; EXPORT_SYMBOL(empty_zero_page); +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_DMA] = PFN_DOWN(memblock_end_of_DRAM()); +} + #ifdef CONFIG_MMU int m68k_virt_to_node_shift; @@ -69,7 +74,7 @@ void __init paging_init(void) high_memory = (void *) end_mem; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 19a75029036c..24a6f7bbd1ce 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -73,7 +73,7 @@ void __init paging_init(void) } current->mm = NULL; - max_zone_pfn[ZONE_DMA] = PFN_DOWN(_ramend); + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 62283bc2ed79..d6ccd23caf61 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -517,6 +517,6 @@ void __init paging_init(void) if (node_present_pages(i)) node_set_state(i, N_NORMAL_MEMORY); - max_zone_pfn[ZONE_DMA] = memblock_end_of_DRAM(); + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index 1ecf6bdd08bf..fdd69cc4240c 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -82,7 +82,7 @@ void __init paging_init(void) current->mm = NULL; /* memory sizing is a hack stolen from motorola.c.. hope it works for us */ - max_zone_pfn[ZONE_DMA] = ((unsigned long)high_memory) >> PAGE_SHIFT; + arch_zone_limits_init(max_zone_pfn); /* I really wish I knew why the following change made things better... -- Sam */ free_area_init(max_zone_pfn); -- cgit v1.2.3 From 2ce38c9ae840ca7ddf401aec4310042581d64975 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:44 +0200 Subject: microblaze: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-11-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/microblaze/mm/init.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 31d475cdb1c5..54da60b81094 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -54,6 +54,16 @@ static void __init highmem_init(void) } #endif /* CONFIG_HIGHMEM */ +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_DMA] = max_low_pfn; + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#else + max_zone_pfns[ZONE_DMA] = max_pfn; +#endif +} + /* * paging_init() sets up the page tables - in fact we've already done this. */ @@ -71,13 +81,8 @@ static void __init paging_init(void) #ifdef CONFIG_HIGHMEM highmem_init(); - - zones_size[ZONE_DMA] = max_low_pfn; - zones_size[ZONE_HIGHMEM] = max_pfn; -#else - zones_size[ZONE_DMA] = max_pfn; #endif - + arch_zone_limits_init(zones_size); /* We don't have holes in memory map */ free_area_init(zones_size); } -- cgit v1.2.3 From f61385e29444d4d2dca06766575da72cd814edf2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:45 +0200 Subject: mips: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-12-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/mips/loongson64/numa.c | 9 +++++++-- arch/mips/mm/init.c | 14 +++++++++----- arch/mips/sgi-ip27/ip27-memory.c | 7 ++++++- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 95d5f553ce19..f72a58f87878 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -154,13 +154,18 @@ static __init void prom_meminit(void) } } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + void __init paging_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, }; pagetable_init(); - zones_size[ZONE_DMA32] = MAX_DMA32_PFN; - zones_size[ZONE_NORMAL] = max_low_pfn; + arch_zone_limits_init(zones_size); free_area_init(zones_size); } diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 8986048f9b11..269bf6335ac4 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -394,12 +394,8 @@ void maar_init(void) } #ifndef CONFIG_NUMA -void __init paging_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - pagetable_init(); - #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; #endif @@ -417,7 +413,15 @@ void __init paging_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_low_pfn; } #endif +} + +void __init paging_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + pagetable_init(); + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); } diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 2b3e46e2e607..babeb0e07687 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -406,11 +406,16 @@ void __init prom_meminit(void) } } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + void __init paging_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, }; pagetable_init(); - zones_size[ZONE_NORMAL] = max_low_pfn; + arch_zone_limits_init(zones_size); free_area_init(zones_size); } -- cgit v1.2.3 From 3b1b0e5797bd325d0b40d82fe0f535badc51d1da Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:46 +0200 Subject: nios2: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. 
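All of the per-architecture patches in this series follow the same shape, which the nios2 hunk below shows in its simplest form: the architecture fills in max_zone_pfn[] in a new arch_zone_limits_init(), and for now its paging_init() keeps calling free_area_init() itself. The condensed sketch below is illustrative only; the zones actually populated differ per architecture, and the later patches that move the free_area_init() call into core MM are not part of this excerpt.

/* arch side: report the highest PFN of each populated zone */
void __init arch_zone_limits_init(unsigned long *max_zone_pfn)
{
	max_zone_pfn[ZONE_NORMAL] = max_low_pfn;	/* arch-specific limits */
}

/* transitional arch code: still drives free_area_init() until core MM takes over */
void __init paging_init(void)
{
	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };

	arch_zone_limits_init(max_zone_pfn);
	free_area_init(max_zone_pfn);
}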
Link: https://lkml.kernel.org/r/20260111082105.290734-13-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dinh Nguyen Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/nios2/mm/init.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 94efa3de3933..2cb666a65d9e 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -38,6 +38,11 @@ pgd_t *pgd_current; +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + /* * paging_init() continues the virtual memory environment setup which * was begun by the code in arch/head.S. @@ -51,8 +56,7 @@ void __init paging_init(void) pagetable_init(); pgd_current = swapper_pg_dir; - max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - + arch_zone_limits_init(max_zone_pfn); /* pass the memory from the bootmem allocator to the main allocator */ free_area_init(max_zone_pfn); -- cgit v1.2.3 From 1d28b1142383416d95c14c51ab3c865fdb6770cd Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:47 +0200 Subject: openrisc: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-14-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Stafford Horne Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/openrisc/mm/init.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 9382d9a0ec78..67de93e7a685 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -39,15 +39,19 @@ int mem_init_done; -static void __init zone_sizes_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - /* * We use only ZONE_NORMAL */ - max_zone_pfn[ZONE_NORMAL] = max_low_pfn; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + +static void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } -- cgit v1.2.3 From 950696afe400d2f97df21ee6924cd297a994bf59 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:48 +0200 Subject: parisc: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-15-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Helge Deller Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/parisc/mm/init.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 14270715d754..dc5bd3efe738 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -693,12 +693,16 @@ static void __init fixmap_init(void) } while (addr < end); } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_NORMAL] = PFN_DOWN(memblock_end_of_DRAM()); +} + static void __init parisc_bootmem_free(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; - max_zone_pfn[0] = memblock_end_of_DRAM(); - + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } -- cgit v1.2.3 From 27bebe446f8d00444bb6bdc5cd57062e64ff3c36 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:49 +0200 Subject: powerpc: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-16-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Ritesh Harjani (IBM) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/mem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3ddbfdbfa941..03c05ec56041 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -221,13 +221,23 @@ static int __init mark_nonram_nosave(void) * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by * ZONE_DMA. 
*/ -static unsigned long max_zone_pfns[MAX_NR_ZONES]; +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = min((zone_dma_limit >> PAGE_SHIFT) + 1, max_low_pfn); +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif +} /* * paging_init() sets up the page tables - in fact we've already done this. */ void __init paging_init(void) { + unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 }; unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); int zone_dma_bits; @@ -259,15 +269,7 @@ void __init paging_init(void) zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = min(max_low_pfn, - 1UL << (zone_dma_bits - PAGE_SHIFT)); -#endif - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = max_pfn; -#endif - + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); mark_nonram_nosave(); -- cgit v1.2.3 From db8cdb0ad603105d987e6c9b04d28254c9d46120 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:50 +0200 Subject: riscv: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-17-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/mm/init.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index addb8a9305be..97e8661fbcff 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -79,15 +79,19 @@ uintptr_t _dtb_early_pa __initdata; phys_addr_t dma32_phys_limit __initdata; -static void __init zone_sizes_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { - unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, }; - #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + +static void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, }; + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); } -- cgit v1.2.3 From 76c4c463bbc0836c46d84775e7ca5bf6e22e8618 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:51 +0200 Subject: s390: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-18-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/mm/init.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index e4953453d254..1c11ad84dddb 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -86,6 +86,12 @@ static void __init setup_zero_pages(void) zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + /* * paging_init() sets up the page tables */ @@ -97,8 +103,7 @@ void __init paging_init(void) sparse_init(); zone_dma_limit = DMA_BIT_MASK(31); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); } -- cgit v1.2.3 From 8bfa6c2259f494e18b06add7e822c1c2982d0c03 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:52 +0200 Subject: sh: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-19-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sh/mm/init.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 99e302eeeec1..5e7e63642611 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -264,6 +264,11 @@ static void __init early_reserve_mem(void) reserve_crashkernel(); } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +} + void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; @@ -322,7 +327,7 @@ void __init paging_init(void) kmap_coherent_init(); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); } -- cgit v1.2.3 From 6ad7ea22cf6f3d635f9a7a25273b0d3f43b4b600 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:53 +0200 Subject: sparc: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-20-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Andreas Larsson Cc: Alexander Gordeev Cc: Alex Shi Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sparc/mm/init_64.c | 6 ++++++ arch/sparc/mm/srmmu.c | 12 ++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index df9f7c444c39..fbaad449dfc9 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2279,6 +2279,11 @@ static void __init reduce_memory(phys_addr_t limit_ram) memblock_enforce_memory_limit(limit_ram); } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_NORMAL] = last_valid_pfn; +} + void __init paging_init(void) { unsigned long end_pfn, shift, phys_base; @@ -2461,6 +2466,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_NORMAL] = end_pfn; + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index f8fb4911d360..81e90151db90 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -884,6 +884,13 @@ static void __init map_kernel(void) void (*poke_srmmu)(void) = NULL; +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_DMA] = max_low_pfn; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; +} + void __init srmmu_paging_init(void) { int i; @@ -967,10 +974,7 @@ void __init srmmu_paging_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - max_zone_pfn[ZONE_DMA] = max_low_pfn; - max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - max_zone_pfn[ZONE_HIGHMEM] = highend_pfn; - + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } } -- cgit v1.2.3 From 531de7f02d51b3198245d30364b50fde3dfaea06 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:54 +0200 Subject: um: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-21-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/um/kernel/mem.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 39c4a7e21c6f..2ac4e9debedd 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -84,6 +84,11 @@ void __init mem_init(void) kmalloc_ok = 1; } +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) +{ + max_zone_pfns[ZONE_NORMAL] = high_physmem >> PAGE_SHIFT; +} + void __init paging_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; @@ -94,7 +99,7 @@ void __init paging_init(void) panic("%s: Failed to allocate %lu bytes align=%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - max_zone_pfn[ZONE_NORMAL] = high_physmem >> PAGE_SHIFT; + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); } -- cgit v1.2.3 From 34f6b9c6e417dcde58bd5b4284e3f2b7689522c7 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:55 +0200 Subject: x86: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-22-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/mm/init.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8bf6ad4b9400..e7ef605a18d6 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -997,12 +997,8 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init zone_sizes_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif @@ -1013,7 +1009,15 @@ void __init zone_sizes_init(void) #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif +} + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + arch_zone_limits_init(max_zone_pfns); free_area_init(max_zone_pfns); } -- cgit v1.2.3 From 2d3c8c5f33e0b90d8a9f6a41014a11355a20f3b1 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:56 +0200 Subject: xtensa: introduce arch_zone_limits_init() Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later MM core will use this function as an architecture specific callback during nodes and zones initialization and thus there won't be a need to call free_area_init() from every architecture. Link: https://lkml.kernel.org/r/20260111082105.290734-23-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/xtensa/mm/init.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index cc52733a0649..60299f359a3c 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -116,15 +116,19 @@ static void __init print_vm_layout(void) (unsigned long)(__bss_stop - __bss_start) >> 10); } -void __init zones_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { - /* All pages are DMA-able, so we put them all in the DMA zone. */ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { - [ZONE_NORMAL] = max_low_pfn, + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - [ZONE_HIGHMEM] = max_pfn, + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - }; +} + +void __init zones_init(void) +{ + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; + + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); print_vm_layout(); } -- cgit v1.2.3 From d49004c5f0c140bb83c87fab46dcf449cf00eb24 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:57 +0200 Subject: arch, mm: consolidate initialization of nodes, zones and memory map To initialize node, zone and memory map data structures every architecture calls free_area_init() during setup_arch() and passes it an array of zone limits. Besides code duplication, it creates "interesting" ordering cases between allocation and initialization of hugetlb and the memory map. Some architectures allocate hugetlb pages very early in setup_arch() in certain cases, some only create hugetlb CMA areas in setup_arch() and sometimes hugetlb allocations happen in mm_core_init(). With the arch_zone_limits_init() helper now available on all architectures, it is no longer necessary to call free_area_init() from architecture setup code. Rather core MM initialization can call arch_zone_limits_init() in a single place. This makes it possible to unify the ordering of hugetlb vs memory map allocation and initialization. Remove the call to free_area_init() from architecture specific code and place it in a new mm_core_init_early() function that is called immediately after setup_arch(). After this refactoring it is possible to consolidate hugetlb allocations and eliminate differences in ordering of hugetlb and memory map initialization among different architectures. As the first step of this consolidation move hugetlb_bootmem_alloc() to mm_core_init_early(). Link: https://lkml.kernel.org/r/20260111082105.290734-24-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S.
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 9 +-------- arch/arc/mm/init.c | 5 ----- arch/arm/mm/init.c | 16 ---------------- arch/arm64/mm/init.c | 4 ---- arch/csky/kernel/setup.c | 4 ---- arch/hexagon/mm/init.c | 12 ------------ arch/loongarch/include/asm/pgtable.h | 2 -- arch/loongarch/kernel/setup.c | 2 -- arch/loongarch/mm/init.c | 8 -------- arch/m68k/mm/init.c | 3 --- arch/m68k/mm/mcfmmu.c | 3 --- arch/m68k/mm/motorola.c | 6 +----- arch/m68k/mm/sun3mmu.c | 9 --------- arch/microblaze/mm/init.c | 7 ------- arch/mips/loongson64/numa.c | 4 ---- arch/mips/mm/init.c | 5 ----- arch/mips/sgi-ip27/ip27-memory.c | 4 ---- arch/nios2/mm/init.c | 6 ------ arch/openrisc/mm/init.c | 10 ---------- arch/parisc/mm/init.c | 9 --------- arch/powerpc/mm/mem.c | 4 ---- arch/riscv/mm/init.c | 9 --------- arch/s390/mm/init.c | 5 ----- arch/sh/mm/init.c | 5 ----- arch/sparc/mm/init_64.c | 11 ----------- arch/sparc/mm/srmmu.c | 7 ------- arch/um/kernel/mem.c | 5 ----- arch/x86/mm/init.c | 10 ---------- arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 -- arch/x86/mm/mm_internal.h | 1 - arch/xtensa/mm/init.c | 4 ---- include/linux/mm.h | 4 ++-- init/main.c | 1 + mm/mm_init.c | 18 ++++++++++-------- 35 files changed, 15 insertions(+), 200 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index cd0cb1abde5f..9531cbc761c0 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -220,17 +220,10 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) } /* - * paging_init() sets up the memory map. + * paging_init() initializes the kernel's ZERO_PGE. */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - - /* Initialize mem_map[]. */ - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); - - /* Initialize the kernel's ZERO_PGE. 
*/ memset(absolute_pointer(ZERO_PGE), 0, PAGE_SIZE); } diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index ff7974d38011..a5e92f46e5d1 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -102,8 +102,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) */ void __init setup_arch_memory(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - setup_initial_init_mm(_text, _etext, _edata, _end); /* first page of system - kernel .vector starts here */ @@ -158,9 +156,6 @@ void __init setup_arch_memory(void) arch_pfn_offset = min(min_low_pfn, min_high_pfn); kmap_init(); #endif /* CONFIG_HIGHMEM */ - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } void __init arch_mm_preinit(void) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index bdcc3639681f..a8f7b4084715 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -118,15 +118,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) #endif } -static void __init zone_sizes_init(unsigned long min, unsigned long max_low, - unsigned long max_high) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - #ifdef CONFIG_HAVE_ARCH_PFN_VALID int pfn_valid(unsigned long pfn) { @@ -222,13 +213,6 @@ void __init bootmem_init(void) * done after the fixed reservations */ sparse_init(); - - /* - * Now free the memory - free_area_init needs - * the sparse mem_map arrays initialized by sparse_init() - * for memmap_init_zone(), otherwise all PFNs are invalid. - */ - zone_sizes_init(min_low_pfn, max_low_pfn, max_pfn); } /* diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 06815d34cc11..3641e88ea871 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -134,7 +134,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) static void __init dma_limits_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; phys_addr_t __maybe_unused acpi_zone_dma_limit; phys_addr_t __maybe_unused dt_zone_dma_limit; phys_addr_t __maybe_unused dma32_phys_limit = @@ -160,9 +159,6 @@ static void __init dma_limits_init(void) #endif if (!arm64_dma_phys_limit) arm64_dma_phys_limit = PHYS_MASK + 1; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } int pfn_is_map_memory(unsigned long pfn) diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 8968815d93e6..4bf3c01ead3a 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -63,7 +63,6 @@ static void __init csky_memblock_init(void) { unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET); unsigned long sseg_size = PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET); - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; signed long size; memblock_reserve(__pa(_start), _end - _start); @@ -101,9 +100,6 @@ static void __init csky_memblock_init(void) memblock_set_current_limit(PFN_PHYS(max_low_pfn)); dma_contiguous_reserve(0); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } void __init setup_arch(char **cmdline_p) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index e2c9487d8d34..07086dbd33fd 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -66,20 +66,8 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -/* - * In order to set up page allocator "nodes", - * somebody has to call free_area_init() for UMA. 
- * - * In this mode, we only have one pg_data_t - * structure: contig_mem_data. - */ static void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ - /* * Set the init_mm descriptors "context" value to point to the * initial kernel segment table's physical address. diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index f41a648a3d9e..c33b3bcb733e 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -353,8 +353,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } -extern void paging_init(void); - #define pte_none(pte) (!(pte_val(pte) & ~_PAGE_GLOBAL)) #define pte_present(pte) (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_no_exec(pte) (pte_val(pte) & _PAGE_NO_EXEC) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 20cb6f306456..708ac025db71 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -621,8 +621,6 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); #endif - paging_init(); - #ifdef CONFIG_KASAN kasan_init(); #endif diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 17235f87eafb..c331bf69d2ec 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -68,14 +68,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - void __ref free_initmem(void) { free_initmem_default(POISON_FREE_INITMEM); diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 6b1d9d2434b5..53b71f786c27 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -69,13 +69,10 @@ void __init paging_init(void) * page_alloc get different views of the world. 
*/ unsigned long end_mem = memory_end & PAGE_MASK; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; high_memory = (void *) end_mem; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } #endif /* CONFIG_MMU */ diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 24a6f7bbd1ce..3418fd864237 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -39,7 +39,6 @@ void __init paging_init(void) pte_t *pg_table; unsigned long address, size; unsigned long next_pgtable; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; int i; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); @@ -73,8 +72,6 @@ void __init paging_init(void) } current->mm = NULL; - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word) diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index d6ccd23caf61..127a3fa69f4c 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -429,7 +429,6 @@ DECLARE_VM_GET_PAGE_PROT */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long min_addr, max_addr; unsigned long addr; int i; @@ -511,12 +510,9 @@ void __init paging_init(void) set_fc(USER_DATA); #ifdef DEBUG - printk ("before free_area_init\n"); + printk ("before node_set_state\n"); #endif for (i = 0; i < m68k_num_memory; i++) if (node_present_pages(i)) node_set_state(i, N_NORMAL_MEMORY); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index fdd69cc4240c..c801677f7df8 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -41,7 +41,6 @@ void __init paging_init(void) unsigned long address; unsigned long next_pgtable; unsigned long bootmem_end; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long size; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); @@ -80,14 +79,6 @@ void __init paging_init(void) mmu_emu_init(bootmem_end); current->mm = NULL; - - /* memory sizing is a hack stolen from motorola.c.. hope it works for us */ - arch_zone_limits_init(max_zone_pfn); - - /* I really wish I knew why the following change made things better... 
-- Sam */ - free_area_init(max_zone_pfn); - - } static const pgprot_t protection_map[16] = { diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 54da60b81094..848cdee1380c 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -69,22 +69,15 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ static void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; int idx; /* Setup fixmaps */ for (idx = 0; idx < __end_of_fixed_addresses; idx++) clear_fixmap(idx); - /* Clean every zones */ - memset(zones_size, 0, sizeof(zones_size)); - #ifdef CONFIG_HIGHMEM highmem_init(); #endif - arch_zone_limits_init(zones_size); - /* We don't have holes in memory map */ - free_area_init(zones_size); } void __init setup_memory(void) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index f72a58f87878..2cd95020df08 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -162,11 +162,7 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - pagetable_init(); - arch_zone_limits_init(zones_size); - free_area_init(zones_size); } /* All PCI device belongs to logical Node-0 */ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 269bf6335ac4..2575cba856d3 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -417,12 +417,7 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - pagetable_init(); - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } #ifdef CONFIG_64BIT diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index babeb0e07687..082651facf4f 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -413,9 +413,5 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - pagetable_init(); - arch_zone_limits_init(zones_size); - free_area_init(zones_size); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 2cb666a65d9e..6b22f1995c16 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -51,15 +51,9 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - pagetable_init(); pgd_current = swapper_pg_dir; - arch_zone_limits_init(max_zone_pfn); - /* pass the memory from the bootmem allocator to the main allocator */ - free_area_init(max_zone_pfn); - flush_dcache_range((unsigned long)empty_zero_page, (unsigned long)empty_zero_page + PAGE_SIZE); } diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 67de93e7a685..78fb0734cdbc 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -47,14 +47,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - extern const char _s_kernel_ro[], _e_kernel_ro[]; /* @@ -145,8 +137,6 @@ void __init paging_init(void) map_ram(); - zone_sizes_init(); - /* self modifying code ;) */ /* Since the old TLB miss handler has been running up until now, * the kernel pages are still all RW, so we can still modify the diff --git 
a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index dc5bd3efe738..ce6f09ab7a90 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -698,14 +698,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = PFN_DOWN(memblock_end_of_DRAM()); } -static void __init parisc_bootmem_free(void) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - void __init paging_init(void) { setup_bootmem(); @@ -716,7 +708,6 @@ void __init paging_init(void) flush_tlb_all_local(NULL); sparse_init(); - parisc_bootmem_free(); } static void alloc_btlb(unsigned long start, unsigned long end, int *slot, diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 03c05ec56041..b716c9cd141c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -237,7 +237,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 }; unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); int zone_dma_bits; @@ -269,9 +268,6 @@ void __init paging_init(void) zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); - mark_nonram_nosave(); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 97e8661fbcff..79b4792578c4 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -87,14 +87,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - #if defined(CONFIG_MMU) && defined(CONFIG_DEBUG_VM) #define LOG2_SZ_1K ilog2(SZ_1K) @@ -1443,7 +1435,6 @@ void __init misc_mem_init(void) /* The entire VMEMMAP region has been populated. 
Flush TLB for this region */ local_flush_tlb_kernel_range(VMEMMAP_START, VMEMMAP_END); #endif - zone_sizes_init(); arch_reserve_crashkernel(); memblock_dump_all(); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 1c11ad84dddb..9ec608b5cbb1 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -97,14 +97,9 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - vmem_map_init(); sparse_init(); zone_dma_limit = DMA_BIT_MASK(31); - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } void mark_rodata_ro(void) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 5e7e63642611..3edee854b755 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -271,7 +271,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; unsigned long vaddr, end; sh_mv.mv_mem_init(); @@ -325,10 +324,6 @@ void __init paging_init(void) page_table_range_init(vaddr, end, swapper_pg_dir); kmap_coherent_init(); - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } unsigned int mem_init_done = 0; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index fbaad449dfc9..931f872ce84a 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2459,17 +2459,6 @@ void __init paging_init(void) kernel_physical_mapping_init(); - { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - - max_zone_pfns[ZONE_NORMAL] = end_pfn; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); - } - printk("Booting Linux...\n"); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 81e90151db90..1b24c5e8d73d 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -970,13 +970,6 @@ void __init srmmu_paging_init(void) flush_tlb_all(); sparc_context_init(num_contexts); - - { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); - } } void mmu_info(struct seq_file *m) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 2ac4e9debedd..89c8c8b94a79 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -91,16 +91,11 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!empty_zero_page) panic("%s: Failed to allocate %lu bytes align=%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e7ef605a18d6..e52a262d3207 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1011,16 +1011,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) #endif } -void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a34fff6ab2b..b55172118c91 100644 --- a/arch/x86/mm/init_32.c +++ 
b/arch/x86/mm/init_32.c @@ -655,7 +655,6 @@ void __init paging_init(void) */ olpc_dt_build_devicetree(); sparse_init(); - zone_sizes_init(); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9983017ecbe0..4daa40071c9f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -843,8 +843,6 @@ void __init paging_init(void) */ node_clear_state(0, N_MEMORY); node_clear_state(0, N_NORMAL_MEMORY); - - zone_sizes_init(); } #define PAGE_UNUSED 0xFD diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 097aadc250f7..7c4a41235323 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -17,7 +17,6 @@ unsigned long kernel_physical_mapping_init(unsigned long start, unsigned long kernel_physical_mapping_change(unsigned long start, unsigned long end, unsigned long page_size_mask); -void zone_sizes_init(void); extern int after_bootmem; diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 60299f359a3c..fe83a68335da 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -126,10 +126,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init zones_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); print_vm_layout(); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 477339b7a032..aacabf8a0b58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -45,6 +45,7 @@ struct pt_regs; struct folio_batch; void arch_mm_preinit(void); +void mm_core_init_early(void); void mm_core_init(void); void init_mm_internals(void); @@ -3540,7 +3541,7 @@ static inline unsigned long get_num_physpages(void) } /* - * Using memblock node mappings, an architecture may initialise its + * FIXME: Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * @@ -3555,7 +3556,6 @@ static inline unsigned long get_num_physpages(void) * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ -void free_area_init(unsigned long *max_zone_pfn); void arch_zone_limits_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, diff --git a/init/main.c b/init/main.c index b84818ad9685..445b5643ecec 100644 --- a/init/main.c +++ b/init/main.c @@ -1025,6 +1025,7 @@ void start_kernel(void) page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); + mm_core_init_early(); /* Static keys and static calls are needed by LSMs */ jump_label_init(); static_call_init(); diff --git a/mm/mm_init.c b/mm/mm_init.c index 0927bedb1254..6fb4415c0d1c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1807,7 +1807,6 @@ static void __init set_high_memory(void) /** * free_area_init - Initialise all pg_data_t and zone data - * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. * Using the page ranges provided by memblock_set_node(), the size of each @@ -1818,17 +1817,14 @@ static void __init set_high_memory(void) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. 
*/ -void __init free_area_init(unsigned long *max_zone_pfn) +static void __init free_area_init(void) { + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; unsigned long start_pfn, end_pfn; int i, nid, zone; bool descending; - /* Record where the zone boundaries are */ - memset(arch_zone_lowest_possible_pfn, 0, - sizeof(arch_zone_lowest_possible_pfn)); - memset(arch_zone_highest_possible_pfn, 0, - sizeof(arch_zone_highest_possible_pfn)); + arch_zone_limits_init(max_zone_pfn); start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); @@ -2678,13 +2674,19 @@ void __init __weak mem_init(void) { } +void __init mm_core_init_early(void) +{ + hugetlb_bootmem_alloc(); + + free_area_init(); +} + /* * Set up kernel memory allocators */ void __init mm_core_init(void) { arch_mm_preinit(); - hugetlb_bootmem_alloc(); /* Initializations relying on SMP setup */ BUILD_BUG_ON(MAX_ZONELISTS > 2); -- cgit v1.2.3 From 4267739cabb82da75780c4699fe8208821929944 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:58 +0200 Subject: arch, mm: consolidate initialization of SPARSE memory model Every architecture calls sparse_init() during setup_arch() although the data structures created by sparse_init() are not used until the initialization of the core MM. Beside the code duplication, calling sparse_init() from architecture specific code causes ordering differences of vmemmap and HVO initialization on different architectures. Move the call to sparse_init() from architecture specific code to free_area_init() to ensure that vmemmap and HVO initialization order is always the same. Link: https://lkml.kernel.org/r/20260111082105.290734-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/mm/memory-model.rst | 3 --- Documentation/translations/zh_CN/mm/memory-model.rst | 2 -- arch/alpha/kernel/setup.c | 1 - arch/arm/mm/init.c | 6 ------ arch/arm64/mm/init.c | 6 ------ arch/csky/kernel/setup.c | 2 -- arch/loongarch/kernel/setup.c | 8 -------- arch/mips/kernel/setup.c | 11 ----------- arch/parisc/mm/init.c | 2 -- arch/powerpc/include/asm/setup.h | 4 ++++ arch/powerpc/mm/mem.c | 5 ----- arch/powerpc/mm/numa.c | 2 -- arch/riscv/mm/init.c | 1 - arch/s390/mm/init.c | 1 - arch/sh/mm/init.c | 2 -- arch/sparc/mm/init_64.c | 2 -- arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 -- include/linux/mmzone.h | 2 -- mm/internal.h | 6 ++++++ mm/mm_init.c | 1 + 21 files changed, 11 insertions(+), 59 deletions(-) diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst index 7957122039e8..199b11328f4f 100644 --- a/Documentation/mm/memory-model.rst +++ b/Documentation/mm/memory-model.rst @@ -97,9 +97,6 @@ sections: `mem_section` objects and the number of rows is calculated to fit all the memory sections. -The architecture setup code should call sparse_init() to -initialize the memory sections and the memory maps. - With SPARSEMEM there are two possible ways to convert a PFN to the corresponding `struct page` - a "classic sparse" and "sparse vmemmap". The selection is made at build time and it is determined by diff --git a/Documentation/translations/zh_CN/mm/memory-model.rst b/Documentation/translations/zh_CN/mm/memory-model.rst index 77ec149a970c..c0c5d8ecd880 100644 --- a/Documentation/translations/zh_CN/mm/memory-model.rst +++ b/Documentation/translations/zh_CN/mm/memory-model.rst @@ -83,8 +83,6 @@ SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用me 每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象,行数的计算是为了适应所有的 内存区。 -架构设置代码应该调用sparse_init()来初始化内存区和内存映射。 - 通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和 "sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的 值决定。 diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index bebdffafaee8..f0af444a69a4 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -607,7 +607,6 @@ setup_arch(char **cmdline_p) /* Find our memory. */ setup_memory(kernel_end); memblock_set_bottom_up(true); - sparse_init(); /* First guess at cpu cache sizes. Do this before init_arch. 
*/ determine_cpu_caches(cpu->type); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index a8f7b4084715..0cc1bf04686d 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -207,12 +207,6 @@ void __init bootmem_init(void) early_memtest((phys_addr_t)min_low_pfn << PAGE_SHIFT, (phys_addr_t)max_low_pfn << PAGE_SHIFT); - - /* - * sparse_init() tries to allocate memory from memblock, so must be - * done after the fixed reservations - */ - sparse_init(); } /* diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 3641e88ea871..9d271aff7652 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -321,12 +321,6 @@ void __init bootmem_init(void) #endif kvm_hyp_reserve(); - - /* - * sparse_init() tries to allocate memory from memblock, so must be - * done after the fixed reservations - */ - sparse_init(); dma_limits_init(); /* diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 4bf3c01ead3a..45c98dcf7f50 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -123,8 +123,6 @@ void __init setup_arch(char **cmdline_p) setup_smp(); #endif - sparse_init(); - fixaddr_init(); #ifdef CONFIG_HIGHMEM diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 708ac025db71..d6a1ff0e16f1 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -402,14 +402,6 @@ static void __init arch_mem_init(char **cmdline_p) check_kernel_sections_mem(); - /* - * In order to reduce the possibility of kernel panic when failed to - * get IO TLB memory under CONFIG_SWIOTLB, it is better to allocate - * low memory as small as possible before swiotlb_init(), so make - * sparse_init() using top-down allocation. - */ - memblock_set_bottom_up(false); - sparse_init(); memblock_set_bottom_up(true); swiotlb_init(true, SWIOTLB_VERBOSE); diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 11b9b6b63e19..d36d89d01fa4 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -614,7 +614,6 @@ static void __init bootcmdline_init(void) * kernel but generic memory management system is still entirely uninitialized. * * o bootmem_init() - * o sparse_init() * o paging_init() * o dma_contiguous_reserve() * @@ -665,16 +664,6 @@ static void __init arch_mem_init(char **cmdline_p) mips_parse_crashkernel(); device_tree_init(); - /* - * In order to reduce the possibility of kernel panic when failed to - * get IO TLB memory under CONFIG_SWIOTLB, it is better to allocate - * low memory as small as possible before plat_swiotlb_setup(), so - * make sparse_init() using top-down allocation. 
- */ - memblock_set_bottom_up(false); - sparse_init(); - memblock_set_bottom_up(true); - plat_swiotlb_setup(); dma_contiguous_reserve(PFN_PHYS(max_low_pfn)); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index ce6f09ab7a90..6a39e031e5ff 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -706,8 +706,6 @@ void __init paging_init(void) fixmap_init(); flush_cache_all_local(); /* start with known state */ flush_tlb_all_local(NULL); - - sparse_init(); } static void alloc_btlb(unsigned long start, unsigned long end, int *slot, diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 50a92b24628d..6d60ea4868ab 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -20,7 +20,11 @@ extern void reloc_got2(unsigned long); void check_for_initrd(void); void mem_topology_setup(void); +#ifdef CONFIG_NUMA void initmem_init(void); +#else +static inline void initmem_init(void) {} +#endif void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index b716c9cd141c..3789a51bdaae 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -182,11 +182,6 @@ void __init mem_topology_setup(void) memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); } -void __init initmem_init(void) -{ - sparse_init(); -} - /* mark pages that don't exist as nosave */ static int __init mark_nonram_nosave(void) { diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 603a0f652ba6..f4cf3ae036de 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1213,8 +1213,6 @@ void __init initmem_init(void) setup_node_data(nid, start_pfn, end_pfn); } - sparse_init(); - /* * We need the numa_cpu_lookup_table to be accurate for all CPUs, * even before we online them, so that we can use cpu_to_{node,mem} diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 79b4792578c4..11ac4041afc0 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1430,7 +1430,6 @@ void __init misc_mem_init(void) { early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); arch_numa_init(); - sparse_init(); #ifdef CONFIG_SPARSEMEM_VMEMMAP /* The entire VMEMMAP region has been populated. Flush TLB for this region */ local_flush_tlb_kernel_range(VMEMMAP_START, VMEMMAP_END); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 9ec608b5cbb1..3c20475cbee2 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -98,7 +98,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { vmem_map_init(); - sparse_init(); zone_dma_limit = DMA_BIT_MASK(31); } diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3edee854b755..464a3a63e2fa 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -227,8 +227,6 @@ static void __init do_init_bootmem(void) node_set_online(0); plat_mem_setup(); - - sparse_init(); } static void __init early_reserve_mem(void) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 931f872ce84a..4f7bdb18774b 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1615,8 +1615,6 @@ static unsigned long __init bootmem_init(unsigned long phys_base) /* XXX cpu notifier XXX */ - sparse_init(); - return end_pfn; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b55172118c91..0908c44d51e6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -654,7 +654,6 @@ void __init paging_init(void) * NOTE: at this point the bootmem allocator is fully available. 
*/ olpc_dt_build_devicetree(); - sparse_init(); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4daa40071c9f..df2261fa4f98 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -833,8 +833,6 @@ void __init initmem_init(void) void __init paging_init(void) { - sparse_init(); - /* * clear the default setting with node 0 * note: don't use nodes_clear here, that is really clearing when diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc5d6c88d2f0..eb3815fc94ad 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2286,9 +2286,7 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define pfn_to_nid(pfn) (0) #endif -void sparse_init(void); #else -#define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) diff --git a/mm/internal.h b/mm/internal.h index 9ee336aa0365..ecb6020cf313 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -852,6 +852,12 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int, bool); +#ifdef CONFIG_SPARSEMEM +void sparse_init(void); +#else +static inline void sparse_init(void) {} +#endif /* CONFIG_SPARSEMEM */ + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 6fb4415c0d1c..31246fe5c361 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1825,6 +1825,7 @@ static void __init free_area_init(void) bool descending; arch_zone_limits_init(max_zone_pfn); + sparse_init(); start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); -- cgit v1.2.3 From 5dea39496c681a2de1683cb808f525d6d7115753 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:59 +0200 Subject: mips: drop paging_init() All three variants of paging_init() on MIPS are wrappers for pagetable_init(). Instead of having three identical wrappers, call pagetable_init() directly from setup_arch() and remove the unnecessary paging_init() functions. Link: https://lkml.kernel.org/r/20260111082105.290734-26-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgalloc.h | 2 -- arch/mips/include/asm/pgtable.h | 2 +- arch/mips/kernel/setup.c | 4 ++-- arch/mips/loongson64/numa.c | 5 ----- arch/mips/mm/init.c | 5 ----- arch/mips/sgi-ip27/ip27-memory.c | 5 ----- 6 files changed, 3 insertions(+), 20 deletions(-) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 7a04381efa0b..6efd4a58bf10 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -101,6 +101,4 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) #endif /* __PAGETABLE_PUD_FOLDED */ -extern void pagetable_init(void); - #endif /* _ASM_PGALLOC_H */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 9c06a612d33a..fa7b935f947c 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -56,7 +56,7 @@ extern unsigned long zero_page_mask; (virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask)))) #define __HAVE_COLOR_ZERO_PAGE -extern void paging_init(void); +extern void pagetable_init(void); /* * Conversion functions: convert a page and protection to a page entry, diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index d36d89d01fa4..7622aad0f0b3 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -614,7 +614,7 @@ static void __init bootcmdline_init(void) * kernel but generic memory management system is still entirely uninitialized. * * o bootmem_init() - * o paging_init() + * o pagetable_init() * o dma_contiguous_reserve() * * At this stage the bootmem allocator is ready to use. 
@@ -778,7 +778,7 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); cpu_cache_init(); - paging_init(); + pagetable_init(); memblock_dump_all(); diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 2cd95020df08..16ffb32cca50 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -160,11 +160,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -void __init paging_init(void) -{ - pagetable_init(); -} - /* All PCI device belongs to logical Node-0 */ int pcibus_to_node(struct pci_bus *bus) { diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 2575cba856d3..4f6449ad02ca 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -415,11 +415,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) #endif } -void __init paging_init(void) -{ - pagetable_init(); -} - #ifdef CONFIG_64BIT static struct kcore_list kcore_kseg0; #endif diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 082651facf4f..4317f5ae1fd1 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -410,8 +410,3 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } - -void __init paging_init(void) -{ - pagetable_init(); -} -- cgit v1.2.3 From 6632314fddc4f5c9d3c1a6800fc2a62e6a5155e8 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:21:00 +0200 Subject: x86: don't reserve hugetlb memory in setup_arch() Commit 665eaf313314 ("x86/setup: call hugetlb_bootmem_alloc early") added an early call to hugetlb_bootmem_alloc() to setup_arch() to allow HVO style pre-initialization of vmemmap on x86. With the ordering of hugetlb reservation vs memory map initialization sorted out in core MM this no longer needs to be an architecture specific quirk. Drop the call to hugetlb_bootmem_alloc() from x86::setup_arch(). Link: https://lkml.kernel.org/r/20260111082105.290734-27-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/kernel/setup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1b2edd07a3e1..e2318fa9b1bb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1191,7 +1191,6 @@ void __init setup_arch(char **cmdline_p) if (boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - hugetlb_bootmem_alloc(); } /* -- cgit v1.2.3 From 9fac145b6d3fe570277438f8d860eabf229dc545 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:21:01 +0200 Subject: mm, arch: consolidate hugetlb CMA reservation Every architecture that supports the hugetlb_cma command line parameter reserves CMA areas for hugetlb during setup_arch(). This obfuscates the ordering of hugetlb CMA initialization with respect to the rest of the core MM initialization. Introduce an arch_hugetlb_cma_order() callback to allow architectures to report the desired order-per-bit of CMA areas and provide a weak implementation of arch_hugetlb_cma_order() for architectures that don't support hugetlb with CMA. Use this callback in hugetlb_cma_reserve() instead of passing the order as a parameter, and call hugetlb_cma_reserve() from mm_core_init_early() rather than having it spread over architecture-specific code. Link: https://lkml.kernel.org/r/20260111082105.290734-28-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S.
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/driver-api/cxl/linux/early-boot.rst | 2 +- arch/arm64/include/asm/hugetlb.h | 2 -- arch/arm64/mm/hugetlbpage.c | 10 +++------- arch/arm64/mm/init.c | 9 --------- arch/powerpc/include/asm/hugetlb.h | 5 ----- arch/powerpc/kernel/setup-common.c | 1 - arch/powerpc/mm/hugetlbpage.c | 11 ++++------- arch/riscv/mm/hugetlbpage.c | 8 ++++++++ arch/riscv/mm/init.c | 2 -- arch/s390/kernel/setup.c | 2 -- arch/s390/mm/hugetlbpage.c | 8 ++++++++ arch/x86/kernel/setup.c | 4 ---- arch/x86/mm/hugetlbpage.c | 8 ++++++++ include/linux/hugetlb.h | 6 ++++-- mm/hugetlb_cma.c | 19 ++++++++++++++----- mm/mm_init.c | 1 + 16 files changed, 51 insertions(+), 47 deletions(-) diff --git a/Documentation/driver-api/cxl/linux/early-boot.rst b/Documentation/driver-api/cxl/linux/early-boot.rst index a7fc6fc85fbe..414481f33819 100644 --- a/Documentation/driver-api/cxl/linux/early-boot.rst +++ b/Documentation/driver-api/cxl/linux/early-boot.rst @@ -125,7 +125,7 @@ The contiguous memory allocator (CMA) enables reservation of contiguous memory regions on NUMA nodes during early boot. However, CMA cannot reserve memory on NUMA nodes that are not online during early boot. :: - void __init hugetlb_cma_reserve(int order) { + void __init hugetlb_cma_reserve(void) { if (!node_online(nid)) /* do not allow reservations */ } diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 44c1f757bfcf..e6f8ff3cc630 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -56,8 +56,6 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void __init arm64_hugetlb_cma_reserve(void); - #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 1d90a7e75333..f8dd58ab67a8 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -36,16 +36,12 @@ * huge pages could still be served from those areas. 
*/ #ifdef CONFIG_CMA -void __init arm64_hugetlb_cma_reserve(void) +unsigned int arch_hugetlb_cma_order(void) { - int order; - if (pud_sect_supported()) - order = PUD_SHIFT - PAGE_SHIFT; - else - order = CONT_PMD_SHIFT - PAGE_SHIFT; + return PUD_SHIFT - PAGE_SHIFT; - hugetlb_cma_reserve(order); + return CONT_PMD_SHIFT - PAGE_SHIFT; } #endif /* CONFIG_CMA */ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9d271aff7652..96711b8578fd 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -311,15 +311,6 @@ void __init bootmem_init(void) arch_numa_init(); - /* - * must be done after arch_numa_init() which calls numa_init() to - * initialize node_online_map that gets used in hugetlb_cma_reserve() - * while allocating required CMA size across online nodes. - */ -#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) - arm64_hugetlb_cma_reserve(); -#endif - kvm_hyp_reserve(); dma_limits_init(); diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 86326587e58d..6d32a4299445 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -68,7 +68,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -void gigantic_hugetlb_cma_reserve(void) __init; #include #else /* ! CONFIG_HUGETLB_PAGE */ @@ -77,10 +76,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, { } -static inline void __init gigantic_hugetlb_cma_reserve(void) -{ -} - static inline void __init hugetlbpage_init_defaultsize(void) { } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index c8c42b419742..cb5b73adc250 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -1003,7 +1003,6 @@ void __init setup_arch(char **cmdline_p) fadump_cma_init(); kdump_cma_reserve(); kvm_cma_reserve(); - gigantic_hugetlb_cma_reserve(); early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d3c1b749dcfc..558fafb82b8a 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -200,18 +200,15 @@ static int __init hugetlbpage_init(void) arch_initcall(hugetlbpage_init); -void __init gigantic_hugetlb_cma_reserve(void) +unsigned int __init arch_hugetlb_cma_order(void) { - unsigned long order = 0; - if (radix_enabled()) - order = PUD_SHIFT - PAGE_SHIFT; + return PUD_SHIFT - PAGE_SHIFT; else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift) /* * For pseries we do use ibm,expected#pages for reserving 16G pages. 
*/ - order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; + return mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; - if (order) - hugetlb_cma_reserve(order); + return 0; } diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 375dd96bb4a0..a6d217112cf4 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -447,3 +447,11 @@ static __init int gigantic_pages_init(void) } arch_initcall(gigantic_pages_init); #endif + +unsigned int __init arch_hugetlb_cma_order(void) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return PUD_SHIFT - PAGE_SHIFT; + + return 0; +} diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 11ac4041afc0..848efeb9e163 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -311,8 +311,6 @@ static void __init setup_bootmem(void) memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); dma_contiguous_reserve(dma32_phys_limit); - if (IS_ENABLED(CONFIG_64BIT)) - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); } #ifdef CONFIG_RELOCATABLE diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index c1fe0b53c5ac..b60284328fe3 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -963,8 +963,6 @@ void __init setup_arch(char **cmdline_p) setup_uv(); dma_contiguous_reserve(ident_map_size); vmcp_cma_reserve(); - if (cpu_has_edat2()) - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d42e61c7594e..d93417d1e53c 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -255,3 +255,11 @@ bool __init arch_hugetlb_valid_size(unsigned long size) else return false; } + +unsigned int __init arch_hugetlb_cma_order(void) +{ + if (cpu_has_edat2()) + return PUD_SHIFT - PAGE_SHIFT; + + return 0; +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e2318fa9b1bb..e1efe3975aa0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1189,10 +1189,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(); dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); - if (boot_cpu_has(X86_FEATURE_GBPAGES)) { - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - } - /* * Reserve memory for crash kernel after SRAT is parsed so that it * won't consume hotpluggable memory. 
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 58f7f2bd535d..3b26621c9128 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -42,3 +42,11 @@ static __init int gigantic_pages_init(void) arch_initcall(gigantic_pages_init); #endif #endif + +unsigned int __init arch_hugetlb_cma_order(void) +{ + if (boot_cpu_has(X86_FEATURE_GBPAGES)) + return PUD_SHIFT - PAGE_SHIFT; + + return 0; +} diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 694f6e83c637..00e6a73e7bba 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -281,6 +281,8 @@ void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); +unsigned int arch_hugetlb_cma_order(void); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) @@ -1322,9 +1324,9 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h, } #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) -extern void __init hugetlb_cma_reserve(int order); +extern void __init hugetlb_cma_reserve(void); #else -static inline __init void hugetlb_cma_reserve(int order) +static inline __init void hugetlb_cma_reserve(void) { } #endif diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index e8e4dc7182d5..b1eb5998282c 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -134,12 +134,24 @@ static int __init cmdline_parse_hugetlb_cma_only(char *p) early_param("hugetlb_cma_only", cmdline_parse_hugetlb_cma_only); -void __init hugetlb_cma_reserve(int order) +unsigned int __weak arch_hugetlb_cma_order(void) { - unsigned long size, reserved, per_node; + return 0; +} + +void __init hugetlb_cma_reserve(void) +{ + unsigned long size, reserved, per_node, order; bool node_specific_cma_alloc = false; int nid; + if (!hugetlb_cma_size) + return; + + order = arch_hugetlb_cma_order(); + if (!order) + return; + /* * HugeTLB CMA reservation is required for gigantic * huge pages which could not be allocated via the @@ -149,9 +161,6 @@ void __init hugetlb_cma_reserve(int order) VM_WARN_ON(order <= MAX_PAGE_ORDER); cma_reserve_called = true; - if (!hugetlb_cma_size) - return; - hugetlb_bootmem_set_nodes(); for (nid = 0; nid < MAX_NUMNODES; nid++) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 31246fe5c361..0cfbdef91d72 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2677,6 +2677,7 @@ void __init __weak mem_init(void) void __init mm_core_init_early(void) { + hugetlb_cma_reserve(); hugetlb_bootmem_alloc(); free_area_init(); -- cgit v1.2.3 From 7a9c0bf0aec621bba6d224e8c08713cf2cbcca0f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:21:02 +0200 Subject: mm/hugetlb: drop hugetlb_cma_check() hugetlb_cma_check() was required when the ordering of hugetlb_cma_reserve() and hugetlb_bootmem_alloc() was architecture dependent. Since hugetlb_cma_reserve() is always called before hugetlb_bootmem_alloc(), there is no need to check whether hugetlb_cma_reserve() was already called. Drop the unneeded hugetlb_cma_check() function. Link: https://lkml.kernel.org/r/20260111082105.290734-29-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Muchun Song Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S.
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/hugetlb.c | 1 - mm/hugetlb_cma.c | 16 +++------------- mm/hugetlb_cma.h | 5 ----- 3 files changed, 3 insertions(+), 19 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1832da0f623..fe4b9f2ebdb6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4159,7 +4159,6 @@ static int __init hugetlb_init(void) } } - hugetlb_cma_check(); hugetlb_init_hstates(); gather_bootmem_prealloc(); report_hugepages(); diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index b1eb5998282c..f5e79103e110 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -85,9 +85,6 @@ hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact) return m; } - -static bool cma_reserve_called __initdata; - static int __init cmdline_parse_hugetlb_cma(char *p) { int nid, count = 0; @@ -149,8 +146,10 @@ void __init hugetlb_cma_reserve(void) return; order = arch_hugetlb_cma_order(); - if (!order) + if (!order) { + pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); return; + } /* * HugeTLB CMA reservation is required for gigantic @@ -159,7 +158,6 @@ void __init hugetlb_cma_reserve(void) * breaking this assumption. */ VM_WARN_ON(order <= MAX_PAGE_ORDER); - cma_reserve_called = true; hugetlb_bootmem_set_nodes(); @@ -253,14 +251,6 @@ void __init hugetlb_cma_reserve(void) hugetlb_cma_size = 0; } -void __init hugetlb_cma_check(void) -{ - if (!hugetlb_cma_size || cma_reserve_called) - return; - - pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); -} - bool hugetlb_cma_exclusive_alloc(void) { return hugetlb_cma_only; diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h index 2c2ec8a7e134..78186839df3a 100644 --- a/mm/hugetlb_cma.h +++ b/mm/hugetlb_cma.h @@ -8,7 +8,6 @@ struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask); struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact); -void hugetlb_cma_check(void); bool hugetlb_cma_exclusive_alloc(void); unsigned long hugetlb_cma_total_size(void); void hugetlb_cma_validate_params(void); @@ -31,10 +30,6 @@ struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, return NULL; } -static inline void hugetlb_cma_check(void) -{ -} - static inline bool hugetlb_cma_exclusive_alloc(void) { return false; -- cgit v1.2.3 From 743758ccf8bede3e7c38f3f7d3f5131aa0a7b4a6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:21:03 +0200 Subject: Revert "mm/hugetlb: deal with multiple calls to hugetlb_bootmem_alloc" This reverts commit d58b2498200724e4f8c12d71a5953da03c8c8bdf. hugetlb_bootmem_alloc() is called only once, no need to check if it was called already at its entry. 
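For orientation, the ordering that these patches converge on can be summarised in a small standalone sketch (plain C; the function names come from the series, the empty bodies are placeholders so the sketch compiles on its own):

    /* Sketch of the consolidated early-init ordering assumed by the series. */
    static void hugetlb_cma_reserve(void)   { /* no-op when arch_hugetlb_cma_order() == 0 */ }
    static void hugetlb_bootmem_alloc(void) { /* boot-time gigantic hugetlb pages */ }
    static void free_area_init(void)        { /* zone limits, sparse_init(), memmap */ }

    static void mm_core_init_early(void)
    {
            hugetlb_cma_reserve();     /* always runs before the bootmem allocation */
            hugetlb_bootmem_alloc();   /* single call site, so no "already called" guard */
            free_area_init();          /* sparse_init() now runs after both */
    }

    int main(void)
    {
            mm_core_init_early();
            return 0;
    }

With exactly one caller and a fixed order in core MM code, both the hugetlb_cma_check() warning and the __hugetlb_bootmem_allocated guard reduce to dead code, which is what this patch and the previous one remove.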
Other checks performed during HVO initialization are also no longer necessary because sparse_init() that calls hugetlb_vmemmap_init_early() and hugetlb_vmemmap_init_late() is always called after hugetlb_bootmem_alloc(). Link: https://lkml.kernel.org/r/20260111082105.290734-30-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Muchun Song Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ------ mm/hugetlb.c | 12 ------------ mm/hugetlb_vmemmap.c | 11 ----------- 3 files changed, 29 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 00e6a73e7bba..94a03591990c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -176,7 +176,6 @@ extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); -bool hugetlb_bootmem_allocated(void); extern nodemask_t hugetlb_bootmem_nodes; void hugetlb_bootmem_set_nodes(void); @@ -1306,11 +1305,6 @@ static inline bool hugetlbfs_pagecache_present( static inline void hugetlb_bootmem_alloc(void) { } - -static inline bool hugetlb_bootmem_allocated(void) -{ - return false; -} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fe4b9f2ebdb6..04385a0122de 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4486,21 +4486,11 @@ void __init hugetlb_bootmem_set_nodes(void) } } -static bool __hugetlb_bootmem_allocated __initdata; - -bool __init hugetlb_bootmem_allocated(void) -{ - return __hugetlb_bootmem_allocated; -} - void __init hugetlb_bootmem_alloc(void) { struct hstate *h; int i; - if (__hugetlb_bootmem_allocated) - return; - hugetlb_bootmem_set_nodes(); for (i = 0; i < MAX_NUMNODES; i++) @@ -4514,8 +4504,6 @@ void __init hugetlb_bootmem_alloc(void) if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } - - __hugetlb_bootmem_allocated = true; } /* diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9d01f883fd71..a9280259e12a 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -794,14 +794,6 @@ void __init hugetlb_vmemmap_init_early(int nid) struct huge_bootmem_page *m = NULL; void *map; - /* - * Noting to do if bootmem pages were not allocated - * early in boot, or if HVO wasn't enabled in the - * first place. 
- */ - if (!hugetlb_bootmem_allocated()) - return; - if (!READ_ONCE(vmemmap_optimize_enabled)) return; @@ -847,9 +839,6 @@ void __init hugetlb_vmemmap_init_late(int nid) struct hstate *h; void *map; - if (!hugetlb_bootmem_allocated()) - return; - if (!READ_ONCE(vmemmap_optimize_enabled)) return; -- cgit v1.2.3 From 0bec75167d9c491a5a01c6ca85303a58c5b95165 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 1 Jan 2026 14:55:51 -0800 Subject: memcg-v1: remove folio_memcg_lock() doc reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a29c0e4b2e86 ("memcg-v1: remove memcg move locking code") removed folio_memcg_lock(). Delete the final lingering documentation reference. Link: https://lkml.kernel.org/r/20260101225552.3423108-1-gthelen@google.com Fixes: a29c0e4b2e86 ("memcg-v1: remove memcg move locking code") Signed-off-by: Greg Thelen Acked-by: Shakeel Butt Acked-by: SeongJae Park Cc: Tejun Heo Cc: Johannes Weiner Cc: "Michal Koutný" Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index d6b1db8cc7eb..7db63c002922 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -311,9 +311,8 @@ Lock order is as follows:: folio_lock mm->page_table_lock or split pte_lock - folio_memcg_lock (memcg->move_lock) - mapping->i_pages lock - lruvec->lru_lock. + mapping->i_pages lock + lruvec->lru_lock. Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by lruvec->lru_lock; the folio LRU flag is cleared before -- cgit v1.2.3 From 542eda1a832947e0c44c9432972788587aaca95f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:37 +0000 Subject: mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add asserts Patch series "mm: clean up anon_vma implementation", v3. The anon_vma logic is hugely confusing and, much like a bundle of wires entangled with one another, pulling on one thread seems only to lead to more entanglement elsewhere. There is a mish-mash of the core implementation, how that implementation is invoked, how helper functions are invoked and concepts such as adjacent anon_vma merge and anon_vma object reuse. This series tries to improve the situation somewhat. It starts by establishing some invariants in the core anon_vma_clone() and unlink_anon_vmas() functions, largely expressed via VM_WARN_ON_ONCE() asserts. These act as some form of self-documentation as to the conditions we find ourselves in when invoking these functions. We also add kdoc comments for anon_vma_clone() and unlink_anon_vmas(). We then update anon_vma_fork() to avoid passing a partially set up (and thus invalid) VMA to unlink_anon_vmas() - functions which are used both for partially set up and valid data types has historically been the source of a lot of confusion and bugs. We then makes use of the established known conditions to directly skip unfaulted VMAs (rather than implicitly via an empty vma->anon_vma_chain list). We remove the confusing anon_vma_merge() function (we already have a concept of anon_vma merge in that we merge anon_vma's that would otherwise be compatible except for attributes that mprotect() could change - which anon_vma_merge() has nothing to do with). 
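For readers following the series, a reduced view of the objects involved may help; the field names below match those used in the diffs, but this is a simplified sketch, not the kernel's actual definitions:

    /* Simplified sketch of the anon rmap objects (only the fields discussed
     * in this series are shown).
     */
    struct list_head { struct list_head *next, *prev; };

    struct vm_area_struct;                  /* VMA: owns a list of AVCs (vma->anon_vma_chain) */

    struct anon_vma {
            struct anon_vma *root;          /* the root's rwsem locks the whole tree */
            struct anon_vma *parent;
            unsigned long num_children;
            unsigned long num_active_vmas;  /* VMAs whose vma->anon_vma points here */
            /* refcount, rwsem, rb_root interval tree of AVCs, ... */
    };

    struct anon_vma_chain {                 /* "AVC": one VMA <-> anon_vma edge */
            struct vm_area_struct *vma;
            struct anon_vma *anon_vma;
            struct list_head same_vma;      /* entry in vma->anon_vma_chain */
            /* rb node in anon_vma->rb_root, ... */
    };

anon_vma_clone() duplicates the AVC list of a source VMA onto a destination VMA, unlink_anon_vmas() tears that list down again, and anon_vma_fork() additionally gives the child VMA its own anon_vma.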
We make the anon_vma functions internal to mm as they do not need to be used by any other part of the kernel, which allows for future abstraction without concern about this. We then reduce the time over which we hold the anon rmap lock in anon_vma_clone(), as it turns out we can allocate anon_vma_chain objects without holding this lock, since the objects are not yet accessible from the rmap. This should reduce anon_vma lock contention. This additionally allows us to remove a confusing GFP_NOWAIT, GFP_KERNEL allocation fallback strategy. Finally, we explicitly indicate which operation is being performed upon anon_vma_clone(), and separate out fork-only logic to make it very clear that anon_vma reuse only occurs on fork. This patch (of 9): Add kdoc comments and describe exactly what these functions are used for in detail, pointing out importantly that the anon_vma_clone() !dst->anon_vma && src->anon_vma dance is ONLY for fork. Both are confusing functions that will be refactored in a subsequent patch but the first stage is establishing documentation and some invariants. Add some basic CONFIG_DEBUG_VM asserts that help document expected state, specifically: anon_vma_clone() - mmap write lock held. - We do nothing if src VMA is not faulted. - The destination VMA has no anon_vma_chain yet. - We are always operating on the same active VMA (i.e. vma->anon_vma). - If not forking, must operate on the same mm_struct. unlink_anon_vmas() - mmap lock held (write lock except when freeing page tables). - That unfaulted VMAs are no-ops. We are presented with a special case when anon_vma_clone() fails to allocate memory, where we have a VMA with partially set up anon_vma state. Since we hold the exclusive mmap write lock, and since we are cloning from a source VMA which consequently can't also have its anon_vma state modified, we know no anon_vma referenced can be empty. This allows us to significantly simplify this case and just remove anon_vma_chain objects associated with the VMA, so we add a specific partial cleanup path for this scenario. This also allows us to drop the hack of setting vma->anon_vma to NULL before unlinking anon_vma state in this scenario. Link: https://lkml.kernel.org/r/cover.1768746221.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/8644e89369be0cc89d7ac57443dff9e822803c91.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 107 insertions(+), 26 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7b9879ef442d..2e0e1a373437 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -257,30 +257,62 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) up_write(&root->rwsem); } -/* - * Attach the anon_vmas from src to dst. - * Returns 0 on success, -ENOMEM on failure. - * - * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), - * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, - * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to - * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before - * call, we can identify this case by checking (!dst->anon_vma && - * src->anon_vma). 
- * - * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find - * and reuse existing anon_vma which has no vmas and only one child anon_vma. - * This prevents degradation of anon_vma hierarchy to endless linear chain in - * case of constantly forking task. On the other hand, an anon_vma with more - * than one child isn't reused even if there was no alive vma, thus rmap - * walker has a good chance of avoiding scanning the whole hierarchy when it - * searches where page is mapped. +static void check_anon_vma_clone(struct vm_area_struct *dst, + struct vm_area_struct *src) +{ + /* The write lock must be held. */ + mmap_assert_write_locked(src->vm_mm); + /* If not a fork (implied by dst->anon_vma) then must be on same mm. */ + VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm); + + /* If we have anything to do src->anon_vma must be provided. */ + VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain)); + VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma); + /* We are establishing a new anon_vma_chain. */ + VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain)); + /* + * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma + * must be the same across dst and src. + */ + VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma); +} + +static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); + +/** + * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to + * all of the anon_vma objects contained within @src anon_vma_chain's. + * @dst: The destination VMA with an empty anon_vma_chain. + * @src: The source VMA we wish to duplicate. + * + * This is the heart of the VMA side of the anon_vma implementation - we invoke + * this function whenever we need to set up a new VMA's anon_vma state. + * + * This is invoked for: + * + * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we + * clone @src into @dst. + * - VMA split. + * - VMA (m)remap. + * - Fork of faulted VMA. + * + * In all cases other than fork this is simply a duplication. Fork additionally + * adds a new active anon_vma. + * + * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an + * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them + * but do have a single child. This is to avoid waste of memory when repeatedly + * forking. + * + * Returns: 0 on success, -ENOMEM on failure. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; + check_anon_vma_clone(dst, src); + list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; @@ -314,14 +346,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return 0; enomem_failure: - /* - * dst->anon_vma is dropped here otherwise its num_active_vmas can - * be incorrectly decremented in unlink_anon_vmas(). - * We can safely do this because callers of anon_vma_clone() don't care - * about dst->anon_vma if anon_vma_clone() failed. - */ - dst->anon_vma = NULL; - unlink_anon_vmas(dst); + cleanup_partial_anon_vmas(dst); return -ENOMEM; } @@ -392,11 +417,67 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) return -ENOMEM; } +/* + * In the unfortunate case of anon_vma_clone() failing to allocate memory we + * have to clean things up. + * + * On clone we hold the exclusive mmap write lock, so we can't race + * unlink_anon_vmas(). 
Since we're cloning, we know we can't have empty + * anon_vma's, since existing anon_vma's are what we're cloning from. + * + * So this function needs only traverse the anon_vma_chain and free each + * allocated anon_vma_chain. + */ +static void cleanup_partial_anon_vmas(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc, *next; + struct anon_vma *root = NULL; + + /* + * We exclude everybody else from being able to modify anon_vma's + * underneath us. + */ + mmap_assert_locked(vma->vm_mm); + + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = avc->anon_vma; + + /* All anon_vma's share the same root. */ + if (!root) { + root = anon_vma->root; + anon_vma_lock_write(root); + } + + anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } + + if (root) + anon_vma_unlock_write(root); +} + +/** + * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing + * anon_vma_chain objects. + * @vma: The VMA whose links to anon_vma objects is to be severed. + * + * As part of the process anon_vma_chain's are freed, + * anon_vma->num_children,num_active_vmas is updated as required and, if the + * relevant anon_vma references no further VMAs, its reference count is + * decremented. + */ void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; struct anon_vma *root = NULL; + /* Always hold mmap lock, read-lock on unmap possibly. */ + mmap_assert_locked(vma->vm_mm); + + /* Unfaulted is a no-op. */ + VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain)); + /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. -- cgit v1.2.3 From 91901a441fa19d56bdc2c45b76f4165585af773b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:38 +0000 Subject: mm/rmap: eliminate partial anon_vma tear-down in anon_vma_fork() We have spun a web of unnecessary headaches for ourselves in anon_vma_fork() with the introduction of the anon_vma reuse logic, as introduced by commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy"). When we clone anon_vma's linked to a VMA via vma->anon_vma_chain, we check each anon_vma for specific conditions, and if met we set vma->anon_vma to this anon_vma to indicate we will reuse it rather than allocating a new one. It triggers upon the first ancestor anon_vma found that possesses at most 1 child, and no active VMAs. This was implemented such that if you continually fork and free VMAs, you would achieve anon_vma reuse rather than continually allocating unnecessary new anon_vma's. This however brings an unfortunate situation should a memory allocation fail during this process. anon_vma_fork(): 1. Clones the anon_vma. 2. If no reuse (i.e. !vma->anon_vma), tries to allocate anon_vma, AVC. 3. If 2 fails, we are forced to unwind step 1 by invoking unlink_anon_vmas(vma). This means that we send a partially set up (i.e. invalid) VMA to unlink_anon_vmas(). Doing this is dangerous and confusing - it is reasonable for kernel developers to assume unlink_anon_vmas() is called on a correctly established vma, and thus confusion arises if logic is implemented there to account for invalid VMAs, and further development might introduce subtle bugs. It is especially problematic in the anon rmap implementation which is essentially a broken abstraction. 
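The reworked flow is easier to read outside of the diff; the sketch below condenses the hunk that follows and uses the kernel helpers named there (it is a restatement, not standalone code):

    /* anon_vma_fork() after this patch, condensed. */
    if (!pvma->anon_vma)
            return 0;                       /* parent is unfaulted: nothing to do */
    vma->anon_vma = NULL;                   /* drop inherited value; reuse or allocate */

    anon_vma = anon_vma_alloc();            /* optimistic up-front allocation */
    if (!anon_vma)
            return -ENOMEM;
    avc = anon_vma_chain_alloc(GFP_KERNEL);
    if (!avc) {
            put_anon_vma(anon_vma);
            return -ENOMEM;
    }

    rc = anon_vma_clone(vma, pvma);         /* attach to the parent's anon_vmas */
    if (rc || vma->anon_vma) {              /* error, or an existing anon_vma was reused */
            put_anon_vma(anon_vma);
            anon_vma_chain_free(avc);
            return rc;
    }

    anon_vma->num_active_vmas = 1;          /* not yet visible to rmap: no lock needed */
    /* ... link the new anon_vma and AVC as in the diff below ... */

Because the allocations happen before anon_vma_clone(), a failure can no longer leave the child VMA holding half-initialised anon_vma state that unlink_anon_vmas() would have to unwind.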
The patch solves the issue by simply trying to allocate the anon_vma and AVC ahead of time - i.e. optimising for the usual case - and freeing them should reuse occur or an error arise in anon_vma_clone(). This is not egregious performance-wise, as this function is called on the fork path which already performs a great number of allocations, and thus it is already a slow-path in this respect. It is additionally not egregious in terms of memory usage - the allocations are too-small-to-fail anyway unless, for instance, a fatal signal may have arisen, and any OOM for such tiny allocations that may arise would indicate the system is under so much memory pressure that the associated process is not long for this world anyway. We also update anon_vma->num_active_vmas to 1 directly rather than incrementing the newly allocated anon_vma's active VMA count - this makes it clear that this detached anon_vma can have only 1 num_active_vma at this point. Finally we eliminate the out_error and out_error_free_anon_vma labels which makes the logic much easier to follow. We also correct a small comment typo. Link: https://lkml.kernel.org/r/9923da5f8b095dd1e8d677692dcaf95859de0ef5.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 2e0e1a373437..fe2fd9ab0dea 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -359,7 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; - int error; + int rc; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) @@ -368,27 +368,35 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ vma->anon_vma = NULL; + anon_vma = anon_vma_alloc(); + if (!anon_vma) + return -ENOMEM; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) { + put_anon_vma(anon_vma); + return -ENOMEM; + } + /* * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - error = anon_vma_clone(vma, pvma); - if (error) - return error; - - /* An existing anon_vma has been reused, all done then. */ - if (vma->anon_vma) - return 0; + rc = anon_vma_clone(vma, pvma); + /* An error arose or an existing anon_vma was reused, all done then. */ + if (rc || vma->anon_vma) { + put_anon_vma(anon_vma); + anon_vma_chain_free(avc); + return rc; + } - /* Then add our own anon_vma. */ - anon_vma = anon_vma_alloc(); - if (!anon_vma) - goto out_error; - anon_vma->num_active_vmas++; - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto out_error_free_anon_vma; + /* + * OK no reuse, so add our own anon_vma. + * + * Since it is not linked anywhere we can safely manipulate anon_vma + * fields without a lock. + */ + anon_vma->num_active_vmas = 1; /* * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. 
@@ -409,12 +417,6 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) anon_vma_unlock_write(anon_vma); return 0; - - out_error_free_anon_vma: - put_anon_vma(anon_vma); - out_error: - unlink_anon_vmas(vma); - return -ENOMEM; } /* -- cgit v1.2.3 From 69e945845585e415fe18afcddbca7cdd215ff3c7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:39 +0000 Subject: mm/rmap: skip unfaulted VMAs on anon_vma clone, unlink For both anon_vma_clone() and unlink_anon_vmas(), if the source VMA or the VMA to be linked are unfaulted (e.g. !vma->anon_vma), then the functions do nothing. Simply exit early in these cases. In the unlink_anon_vmas() case we can also remove a conditional that checks whether vma->anon_vma is set. Link: https://lkml.kernel.org/r/085a25f7528e1c8c687276e9b856e88dc8f105ca.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index fe2fd9ab0dea..3c5fb8fb105f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -313,6 +313,9 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) check_anon_vma_clone(dst, src); + if (!src->anon_vma) + return 0; + list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; @@ -478,7 +481,10 @@ void unlink_anon_vmas(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); /* Unfaulted is a no-op. */ - VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain)); + if (!vma->anon_vma) { + VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain)); + return; + } /* * Unlink each anon_vma chained to the VMA. This list is ordered @@ -502,15 +508,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_del(&avc->same_vma); anon_vma_chain_free(avc); } - if (vma->anon_vma) { - vma->anon_vma->num_active_vmas--; - /* - * vma would still be needed after unlink, and anon_vma will be prepared - * when handle fault. - */ - vma->anon_vma = NULL; - } + vma->anon_vma->num_active_vmas--; + /* + * vma would still be needed after unlink, and anon_vma will be prepared + * when handle fault. + */ + vma->anon_vma = NULL; unlock_anon_vma_root(root); /* -- cgit v1.2.3 From 535f6b8df17d4350dc37b2635b52fa8ab964132c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:40 +0000 Subject: mm/rmap: remove unnecessary root lock dance in anon_vma clone, unmap The root anon_vma of all anon_vma's linked to a VMA must by definition be the same - a VMA and all of its descendants/ancestors must exist in the same CoW chain. Commit bb4aa39676f7 ("mm: avoid repeated anon_vma lock/unlock sequences in anon_vma_clone()") introduced paranoid checking of the root anon_vma remaining the same throughout all AVC's in 2011. I think 15 years later we can safely assume that this is always the case. Additionally, since unfaulted VMAs being cloned from or unlinked are no-op's, we can simply lock the anon_vma's associated with this rather than doing any specific dance around this. This removes unnecessary checks and makes it clear that the root anon_vma is shared between all anon_vma's in a given VMA's anon_vma_chain. 
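In outline, the clone-side locking after this change reduces to the fragment below (taken from the diff that follows; not compilable on its own):

    /* Every anon_vma reachable from one VMA shares a single root, so the
     * root rwsem is taken exactly once around the whole traversal.
     */
    anon_vma_lock_write(src->anon_vma);
    list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
            /* allocate an AVC (dropping the lock only if a GFP_KERNEL retry
             * is needed) and link dst to pavc->anon_vma, as in the diff below */
    }
    anon_vma_unlock_write(src->anon_vma);

unlink_anon_vmas() gets the same treatment: it locks vma->anon_vma once instead of re-deriving the root from each AVC.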
Link: https://lkml.kernel.org/r/838030d2f0772b99fa99ff4b4fd571353f14a1a9.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 51 +++++++++++++++------------------------------------ 1 file changed, 15 insertions(+), 36 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 3c5fb8fb105f..d4d2e7b9fe5f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -231,32 +231,6 @@ int __anon_vma_prepare(struct vm_area_struct *vma) return -ENOMEM; } -/* - * This is a useful helper function for locking the anon_vma root as - * we traverse the vma->anon_vma_chain, looping over anon_vma's that - * have the same vma. - * - * Such anon_vma's should have the same root, so you'd expect to see - * just a single mutex_lock for the whole traversal. - */ -static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) -{ - struct anon_vma *new_root = anon_vma->root; - if (new_root != root) { - if (WARN_ON_ONCE(root)) - up_write(&root->rwsem); - root = new_root; - down_write(&root->rwsem); - } - return root; -} - -static inline void unlock_anon_vma_root(struct anon_vma *root) -{ - if (root) - up_write(&root->rwsem); -} - static void check_anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { @@ -309,26 +283,28 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; - struct anon_vma *root = NULL; check_anon_vma_clone(dst, src); if (!src->anon_vma) return 0; + check_anon_vma_clone(dst, src); + + /* All anon_vma's share the same root. */ + anon_vma_lock_write(src->anon_vma); list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; avc = anon_vma_chain_alloc(GFP_NOWAIT); if (unlikely(!avc)) { - unlock_anon_vma_root(root); - root = NULL; + anon_vma_unlock_write(src->anon_vma); avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto enomem_failure; + anon_vma_lock_write(src->anon_vma); } anon_vma = pavc->anon_vma; - root = lock_anon_vma_root(root, anon_vma); anon_vma_chain_link(dst, avc, anon_vma); /* @@ -345,7 +321,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) } if (dst->anon_vma) dst->anon_vma->num_active_vmas++; - unlock_anon_vma_root(root); + + anon_vma_unlock_write(src->anon_vma); return 0; enomem_failure: @@ -475,17 +452,19 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma) void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; - struct anon_vma *root = NULL; + struct anon_vma *active_anon_vma = vma->anon_vma; /* Always hold mmap lock, read-lock on unmap possibly. */ mmap_assert_locked(vma->vm_mm); /* Unfaulted is a no-op. */ - if (!vma->anon_vma) { + if (!active_anon_vma) { VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain)); return; } + anon_vma_lock_write(active_anon_vma); + /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. 
@@ -493,7 +472,6 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - root = lock_anon_vma_root(root, anon_vma); anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* @@ -509,13 +487,14 @@ void unlink_anon_vmas(struct vm_area_struct *vma) anon_vma_chain_free(avc); } - vma->anon_vma->num_active_vmas--; + active_anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. */ vma->anon_vma = NULL; - unlock_anon_vma_root(root); + anon_vma_unlock_write(active_anon_vma); + /* * Iterate the list once more, it now only contains empty and unlinked -- cgit v1.2.3 From 53eb797ffc3abe30418b19777922b55fb339fc1f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:41 +0000 Subject: mm/rmap: remove anon_vma_merge() function This function is confusing, we already have the concept of anon_vma merge to adjacent VMA's anon_vma's to increase probability of anon_vma compatibility and therefore VMA merge (see is_mergeable_anon_vma() etc.), as well as anon_vma reuse, along side the usual VMA merge logic. We can remove the anon_vma check as it is redundant - a merge would not have been permitted with removal if the anon_vma's were not the same (and in the case of an unfaulted/faulted merge, we would have already set the unfaulted VMA's anon_vma to vp->remove->anon_vma in dup_anon_vma()). Avoid overloading this term when we're very simply unlinking anon_vma state from a removed VMA upon merge. Link: https://lkml.kernel.org/r/56bbe45e309f7af197b1c4f94a9a0c8931ff2d29.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ------- mm/vma.c | 2 +- tools/testing/vma/vma_internal.h | 5 ----- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index daa92a58585d..832bfc0ccfc6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -165,13 +165,6 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma) return __anon_vma_prepare(vma); } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ - VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); - unlink_anon_vmas(next); -} - struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID diff --git a/mm/vma.c b/mm/vma.c index f81a5cfcd7cc..6c458c8656b8 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -381,7 +381,7 @@ again: fput(vp->file); } if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); + unlink_anon_vmas(vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 9f0a9f5ed0fe..93e5792306d9 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1265,11 +1265,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) { } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ -} - static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From 7549e3d20f1aa9a0b8c77f83144dde54ed6ab4fe Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:42 +0000 Subject: mm/rmap: make anon_vma functions internal The bulk of the anon_vma operations are only used by mm, so formalise this by putting the function prototypes and inlines in mm/internal.h. This allows us to make changes without having to worry about the rest of the kernel. Link: https://lkml.kernel.org/r/79ec933c3a9c8bf1f64dab253bbfdae8a01cb921.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 60 ---------------------------------------------------- mm/internal.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 60 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 832bfc0ccfc6..dd764951b03d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -104,68 +104,8 @@ enum ttu_flags { }; #ifdef CONFIG_MMU -static inline void get_anon_vma(struct anon_vma *anon_vma) -{ - atomic_inc(&anon_vma->refcount); -} - -void __put_anon_vma(struct anon_vma *anon_vma); - -static inline void put_anon_vma(struct anon_vma *anon_vma) -{ - if (atomic_dec_and_test(&anon_vma->refcount)) - __put_anon_vma(anon_vma); -} - -static inline void anon_vma_lock_write(struct anon_vma *anon_vma) -{ - down_write(&anon_vma->root->rwsem); -} -static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) -{ - return down_write_trylock(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) -{ - up_write(&anon_vma->root->rwsem); -} - -static inline void anon_vma_lock_read(struct anon_vma *anon_vma) -{ - down_read(&anon_vma->root->rwsem); -} - -static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) -{ - return down_read_trylock(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) -{ - up_read(&anon_vma->root->rwsem); -} - - -/* - * anon_vma helper functions. - */ void anon_vma_init(void); /* create anon_vma_cachep */ -int __anon_vma_prepare(struct vm_area_struct *); -void unlink_anon_vmas(struct vm_area_struct *); -int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); -int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); - -static inline int anon_vma_prepare(struct vm_area_struct *vma) -{ - if (likely(vma->anon_vma)) - return 0; - - return __anon_vma_prepare(vma); -} - -struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID static __always_inline void folio_lock_large_mapcount(struct folio *folio) diff --git a/mm/internal.h b/mm/internal.h index ecb6020cf313..aac4ec53fe15 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -199,6 +199,64 @@ static inline void vma_close(struct vm_area_struct *vma) #ifdef CONFIG_MMU +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->refcount); +} + +void __put_anon_vma(struct anon_vma *anon_vma); + +static inline void put_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_test(&anon_vma->refcount)) + __put_anon_vma(anon_vma); +} + +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) +{ + down_write(&anon_vma->root->rwsem); +} + +static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) +{ + return down_write_trylock(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) +{ + up_write(&anon_vma->root->rwsem); +} + +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ + down_read(&anon_vma->root->rwsem); +} + +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ + up_read(&anon_vma->root->rwsem); +} + +struct anon_vma *folio_get_anon_vma(const 
struct folio *folio); + +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src); +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); +int __anon_vma_prepare(struct vm_area_struct *vma); +void unlink_anon_vmas(struct vm_area_struct *vma); + +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; -- cgit v1.2.3 From 85f03a86318c4172bfda4484cdf588ebab5fa410 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:43 +0000 Subject: mm/mmap_lock: add vma_is_attached() helper This makes it easy to explicitly check for VMA detachment, which is useful for things like asserts. Note that we intentionally do not allow this function to be available should CONFIG_PER_VMA_LOCK not be set - this is because vma_assert_attached() and vma_assert_detached() are no-ops if !CONFIG_PER_VMA_LOCK, so there is no correct state for vma_is_attached() to be in if this configuration option is not specified. Therefore users elsewhere must invoke this function only after checking for CONFIG_PER_VMA_LOCK. We rework the assert functions to utilise this. Link: https://lkml.kernel.org/r/0172d3bf527ca54ba27d8bce8f8476095b241ac7.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index d53f72dba7fe..b50416fbba20 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -251,6 +251,11 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) !__is_vma_write_locked(vma, &mm_lock_seq), vma); } +static inline bool vma_is_attached(struct vm_area_struct *vma) +{ + return refcount_read(&vma->vm_refcnt); +} + /* * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these * assertions should be made either under mmap_write_lock or when the object @@ -258,12 +263,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); + WARN_ON_ONCE(!vma_is_attached(vma)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); + WARN_ON_ONCE(vma_is_attached(vma)); } static inline void vma_mark_attached(struct vm_area_struct *vma) -- cgit v1.2.3 From bfc2b13b05a1343bb60a85d840fd8956731866c5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:44 +0000 Subject: mm/rmap: allocate anon_vma_chain objects unlocked when possible There is no reason to allocate the anon_vma_chain under the anon_vma write lock when cloning - we can in fact assign these to the destination VMA safely as we hold the exclusive mmap lock and therefore preclude anybody else accessing these fields. We only need to take the anon_vma write lock when we link rbtree edges from the anon_vma to the newly established AVCs.
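Concretely, the clone path becomes a two-phase operation; the sketch below condenses the diff that follows (kernel helpers, not standalone code):

    /* Phase 1: allocate and assign AVCs with GFP_KERNEL, no anon_vma lock held. */
    list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
            avc = anon_vma_chain_alloc(GFP_KERNEL);         /* may sleep */
            if (!avc)
                    goto enomem_failure;                    /* only AVCs to free */
            anon_vma_chain_assign(dst, avc, pavc->anon_vma);
    }

    /* Phase 2: take the (shared-root) anon_vma lock only to add rbtree edges. */
    anon_vma_lock_write(src->anon_vma);
    list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
            anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
            /* ... fork-only anon_vma reuse check ... */
    }
    anon_vma_unlock_write(src->anon_vma);

Nothing in phase 1 is visible to rmap yet, which is why the allocations can safely sleep without the lock.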
This also allows us to eliminate the weird GFP_NOWAIT, GFP_KERNEL dance introduced in commit dd34739c03f2 ("mm: avoid anon_vma_chain allocation under anon_vma lock"), further simplifying this logic. This should reduce anon_vma lock contention and clarify exactly where the anon_vma lock is required. We cannot adjust __anon_vma_prepare() in the same way as this is only protected by VMA read lock, so we have to perform the allocation here under the anon_vma write lock and page_table_lock (to protect against racing threads), and we wish to retain the lock ordering. With this change we can simplify cleanup_partial_anon_vmas() even further - since we allocate AVC's without any lock taken and do not insert anything into the interval tree until after the allocations are tried, we can remove all logic pertaining to this and just free up AVC's only. Link: https://lkml.kernel.org/r/624bf1ac0bde4871fcfca2c8c8e294b6d8f7ae7b.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 83 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index d4d2e7b9fe5f..a5ce9163454a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -146,14 +146,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } -static void anon_vma_chain_link(struct vm_area_struct *vma, - struct anon_vma_chain *avc, - struct anon_vma *anon_vma) +static void anon_vma_chain_assign(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) { avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -210,7 +209,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); + anon_vma_chain_assign(vma, avc, anon_vma); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; @@ -291,21 +291,33 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) check_anon_vma_clone(dst, src); - /* All anon_vma's share the same root. */ + /* + * Allocate AVCs. We don't need an anon_vma lock for this as we + * are not updating the anon_vma rbtree nor are we changing + * anon_vma statistics. + * + * Either src, dst have the same mm for which we hold an exclusive mmap + * write lock, or we are forking and we hold it on src->vm_mm and dst is + * not yet accessible to other threads so there's no possibility of the + * unlinked AVC's being observed yet. + */ + list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto enomem_failure; + + anon_vma_chain_assign(dst, avc, pavc->anon_vma); + } + + /* + * Now link the anon_vma's back to the newly inserted AVCs. + * Note that all anon_vma's share the same root. 
+ */ anon_vma_lock_write(src->anon_vma); - list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma; - - avc = anon_vma_chain_alloc(GFP_NOWAIT); - if (unlikely(!avc)) { - anon_vma_unlock_write(src->anon_vma); - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto enomem_failure; - anon_vma_lock_write(src->anon_vma); - } - anon_vma = pavc->anon_vma; - anon_vma_chain_link(dst, avc, anon_vma); + list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = avc->anon_vma; + + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); /* * Reuse existing anon_vma if it has no vma and only one @@ -321,7 +333,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) } if (dst->anon_vma) dst->anon_vma->num_active_vmas++; - anon_vma_unlock_write(src->anon_vma); return 0; @@ -391,8 +402,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; + anon_vma_chain_assign(vma, avc, anon_vma); + /* Now let rmap see it. */ anon_vma_lock_write(anon_vma); - anon_vma_chain_link(vma, avc, anon_vma); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); @@ -403,40 +416,18 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * In the unfortunate case of anon_vma_clone() failing to allocate memory we * have to clean things up. * - * On clone we hold the exclusive mmap write lock, so we can't race - * unlink_anon_vmas(). Since we're cloning, we know we can't have empty - * anon_vma's, since existing anon_vma's are what we're cloning from. - * - * So this function needs only traverse the anon_vma_chain and free each - * allocated anon_vma_chain. + * Since we allocate anon_vma_chain's before we insert them into the interval + * trees, we simply have to free up the AVC's and remove the entries from the + * VMA's anon_vma_chain. */ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; - struct anon_vma *root = NULL; - - /* - * We exclude everybody else from being able to modify anon_vma's - * underneath us. - */ - mmap_assert_locked(vma->vm_mm); list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma = avc->anon_vma; - - /* All anon_vma's share the same root. */ - if (!root) { - root = anon_vma->root; - anon_vma_lock_write(root); - } - - anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); list_del(&avc->same_vma); anon_vma_chain_free(avc); } - - if (root) - anon_vma_unlock_write(root); } /** -- cgit v1.2.3 From d17f02417a337de0a0c6e763e938ee5e41a97c3d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:45 +0000 Subject: mm/rmap: separate out fork-only logic on anon_vma_clone() Specify which operation is being performed to anon_vma_clone(), which allows us to do checks specific to each operation type, as well as to separate out and make clear that the anon_vma reuse logic is absolutely specific to fork only. This opens the door to further refactorings and refinements later as we have more information to work with. Link: https://lkml.kernel.org/r/cf7da7a2d973cdc72a1b80dd9a73260519e8fa9f.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 11 +++++- mm/rmap.c | 74 +++++++++++++++++++++++++++------------- mm/vma.c | 6 ++-- tools/testing/vma/vma_internal.h | 11 +++++- 4 files changed, 74 insertions(+), 28 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index aac4ec53fe15..5585059f0209 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -244,7 +244,16 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) struct anon_vma *folio_get_anon_vma(const struct folio *folio); -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src); +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation); int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); int __anon_vma_prepare(struct vm_area_struct *vma); void unlink_anon_vmas(struct vm_area_struct *vma); diff --git a/mm/rmap.c b/mm/rmap.c index a5ce9163454a..6ddbf58111ff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -232,12 +232,13 @@ int __anon_vma_prepare(struct vm_area_struct *vma) } static void check_anon_vma_clone(struct vm_area_struct *dst, - struct vm_area_struct *src) + struct vm_area_struct *src, + enum vma_operation operation) { /* The write lock must be held. */ mmap_assert_write_locked(src->vm_mm); - /* If not a fork (implied by dst->anon_vma) then must be on same mm. */ - VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm); + /* If not a fork then must be on same mm. */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm); /* If we have anything to do src->anon_vma must be provided. */ VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain)); @@ -249,6 +250,40 @@ static void check_anon_vma_clone(struct vm_area_struct *dst, * must be the same across dst and src. */ VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma); + /* + * Essentially equivalent to above - if not a no-op, we should expect + * dst->anon_vma to be set for everything except a fork. + */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma && + !dst->anon_vma); + /* For the anon_vma to be compatible, it can only be singular. */ + VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED && + !list_is_singular(&src->anon_vma_chain)); +#ifdef CONFIG_PER_VMA_LOCK + /* Only merging an unfaulted VMA leaves the destination attached. */ + VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED && + vma_is_attached(dst)); +#endif +} + +static void maybe_reuse_anon_vma(struct vm_area_struct *dst, + struct anon_vma *anon_vma) +{ + /* If already populated, nothing to do.*/ + if (dst->anon_vma) + return; + + /* + * We reuse an anon_vma if any linking VMAs were unmapped and it has + * only a single child at most. + */ + if (anon_vma->num_active_vmas > 0) + return; + if (anon_vma->num_children > 1) + return; + + dst->anon_vma = anon_vma; + anon_vma->num_active_vmas++; } static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); @@ -258,6 +293,7 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); * all of the anon_vma objects contained within @src anon_vma_chain's. * @dst: The destination VMA with an empty anon_vma_chain. 
* @src: The source VMA we wish to duplicate. + * @operation: The type of operation which resulted in the clone. * * This is the heart of the VMA side of the anon_vma implementation - we invoke * this function whenever we need to set up a new VMA's anon_vma state. @@ -280,17 +316,17 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); * * Returns: 0 on success, -ENOMEM on failure. */ -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { struct anon_vma_chain *avc, *pavc; + struct anon_vma *active_anon_vma = src->anon_vma; - check_anon_vma_clone(dst, src); + check_anon_vma_clone(dst, src, operation); - if (!src->anon_vma) + if (!active_anon_vma) return 0; - check_anon_vma_clone(dst, src); - /* * Allocate AVCs. We don't need an anon_vma lock for this as we * are not updating the anon_vma rbtree nor are we changing @@ -318,22 +354,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) struct anon_vma *anon_vma = avc->anon_vma; anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); - - /* - * Reuse existing anon_vma if it has no vma and only one - * anon_vma child. - * - * Root anon_vma is never reused: - * it has self-parent reference and at least one child. - */ - if (!dst->anon_vma && src->anon_vma && - anon_vma->num_children < 2 && - anon_vma->num_active_vmas == 0) - dst->anon_vma = anon_vma; + if (operation == VMA_OP_FORK) + maybe_reuse_anon_vma(dst, anon_vma); } - if (dst->anon_vma) + + if (operation != VMA_OP_FORK) dst->anon_vma->num_active_vmas++; - anon_vma_unlock_write(src->anon_vma); + + anon_vma_unlock_write(active_anon_vma); return 0; enomem_failure: @@ -372,7 +400,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - rc = anon_vma_clone(vma, pvma); + rc = anon_vma_clone(vma, pvma, VMA_OP_FORK); /* An error arose or an existing anon_vma was reused, all done then. */ if (rc || vma->anon_vma) { put_anon_vma(anon_vma); diff --git a/mm/vma.c b/mm/vma.c index 6c458c8656b8..3dbe414eff89 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -530,7 +530,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_vmi; - err = anon_vma_clone(new, vma); + err = anon_vma_clone(new, vma, VMA_OP_SPLIT); if (err) goto out_free_mpol; @@ -628,7 +628,7 @@ static int dup_anon_vma(struct vm_area_struct *dst, vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; - ret = anon_vma_clone(dst, src); + ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED); if (ret) return ret; @@ -1901,7 +1901,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - if (anon_vma_clone(new_vma, vma)) + if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 93e5792306d9..7fa56dcc53a6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -600,6 +600,14 @@ struct mmap_action { bool hide_from_rmap_until_complete :1; }; +/* Operations which modify VMAs. 
*/ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru return 0; } -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { /* For testing purposes. We indicate that an anon_vma has been cloned. */ if (src->anon_vma != NULL) { -- cgit v1.2.3 From 66987218154918a6341a3e3eeeee58110a69e0bb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 6 Jan 2026 12:52:36 +0100 Subject: mm/page_alloc: ignore the exact initial compaction result Patch series "tweaks for __alloc_pages_slowpath()", v3. This patch (of 3): For allocations that are of costly order and __GFP_NORETRY (and can perform compaction) we attempt direct compaction first. If that fails, we continue with a single round of direct reclaim+compaction (as for other __GFP_NORETRY allocations, except the compaction is of lower priority), with two exceptions that fail immediately: - __GFP_THISNODE is specified, to prevent zone_reclaim_mode-like behavior for e.g. THP page faults - compaction failed because it was deferred (i.e. has been failing recently so further attempts are not done for a while) or skipped, which means there are insufficient free base pages to defragment to begin with Upon closer inspection, the second condition has a somewhat flawed reasoning. If there are not enough base pages and reclaim could create them, we instead fail. When there are enough base pages and compaction has already ran and failed, we proceed and hope that reclaim and the subsequent compaction attempt will succeed. But it's unclear why they should and whether it will be as inexpensive as intended. It might make therefore more sense to just fail unconditionally after the initial compaction attempt. However that would change the semantics of __GFP_NORETRY to attempt reclaim at least once. Alternatively we can remove the compaction result checks and proceed with the single reclaim and (lower priority) compaction attempt, leaving only the __GFP_THISNODE exception for failing immediately. Link: https://lkml.kernel.org/r/20260106-thp-thisnode-tweak-v3-0-f5d67c21a193@suse.cz Link: https://lkml.kernel.org/r/20260106-thp-thisnode-tweak-v3-1-f5d67c21a193@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Brendan Jackman Cc: David Hildenbrand (Red Hat) Cc: David Rientjes Cc: Johannes Weiner Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc3ee3102b19..8e6d2e61374a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4797,44 +4797,22 @@ restart: * includes some THP page fault allocations */ if (costly_order && (gfp_mask & __GFP_NORETRY)) { - /* - * If allocating entire pageblock(s) and compaction - * failed because all zones are below low watermarks - * or is prohibited because it recently failed at this - * order, fail immediately unless the allocator has - * requested compaction and reclaim retry. 
- * - * Reclaim is - potentially very expensive because zones are far - below their low watermarks or this is part of very - bursty high order allocations, - not guaranteed to help because isolate_freepages() - may not iterate over freed pages as part of its - linear scan, and - unlikely to make entire pageblocks free on its - own. - */ - if (compact_result == COMPACT_SKIPPED || - compact_result == COMPACT_DEFERRED) - goto nopage; - /* * THP page faults may attempt local node only first, * but are then allowed to only compact, not reclaim, * see alloc_pages_mpol(). * - * Compaction can fail for other reasons than those - * checked above and we don't want such THP allocations - * to put reclaim pressure on a single node in a - * situation where other nodes might have plenty of - * available memory. + * Compaction has failed above and we don't want such + * THP allocations to put reclaim pressure on a single + * node in a situation where other nodes might have + * plenty of available memory. */ if (gfp_mask & __GFP_THISNODE) goto nopage; /* - * Looks like reclaim/compaction is worth trying, but - * sync compaction could be very expensive, so keep + * Proceed with single round of reclaim/compaction, but + * since sync compaction could be very expensive, keep * using async compaction. */ compact_priority = INIT_COMPACT_PRIORITY; -- cgit v1.2.3 From 53a9b4646f67c95df1775aa5f381cb7f42cae957 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 6 Jan 2026 12:52:37 +0100 Subject: mm/page_alloc: refactor the initial compaction handling The initial direct compaction done in some cases in __alloc_pages_slowpath() stands out from the main retry loop of reclaim + compaction. We can simplify this by instead skipping the initial reclaim attempt via a new local variable compact_first, and handle the compact_priority as necessary to match the original behavior. No functional change intended.
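The resulting shape of the slow path is roughly (a condensed sketch of the diff below; the actual allocation attempts and the remaining retry/exit logic are omitted):

	if (can_compact && (costly_order ||
	    (order > 0 && ac->migratetype != MIGRATE_MOVABLE))) {
		compact_first = true;
		compact_priority = INIT_COMPACT_PRIORITY;
	}
retry:
	if (!compact_first) {
		/* direct reclaim, then another allocation attempt */
	}
	/* direct compaction, then another allocation attempt */
	if (compact_first) {
		if (gfp_has_flags(gfp_mask, __GFP_NORETRY | __GFP_THISNODE))
			goto nopage;
		if (!(gfp_mask & __GFP_NORETRY))
			compact_priority = DEF_COMPACT_PRIORITY;
		compact_first = false;
		goto retry;
	}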
Link: https://lkml.kernel.org/r/20260106-thp-thisnode-tweak-v3-2-f5d67c21a193@suse.cz Signed-off-by: Vlastimil Babka Suggested-by: Johannes Weiner Reviewed-by: Joshua Hahn Acked-by: Michal Hocko Cc: Brendan Jackman Cc: David Hildenbrand (Red Hat) Cc: David Rientjes Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++++- mm/page_alloc.c | 100 +++++++++++++++++++++++++--------------------------- 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b155929af5b1..f9fdc99ae594 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -407,9 +407,15 @@ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); +/* A helper for checking if gfp includes all the specified flags */ +static inline bool gfp_has_flags(gfp_t gfp, gfp_t flags) +{ + return (gfp & flags) == flags; +} + static inline bool gfp_has_io_fs(gfp_t gfp) { - return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); + return gfp_has_flags(gfp, __GFP_IO | __GFP_FS); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e6d2e61374a..848c5c93ccb5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4694,7 +4694,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; - bool can_compact = gfp_compaction_allowed(gfp_mask); + bool can_compact = can_direct_reclaim && gfp_compaction_allowed(gfp_mask); bool nofail = gfp_mask & __GFP_NOFAIL; const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; @@ -4707,6 +4707,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool compact_first = false; if (unlikely(nofail)) { /* @@ -4730,6 +4731,19 @@ restart: cpuset_mems_cookie = read_mems_allowed_begin(); zonelist_iter_cookie = zonelist_iter_begin(); + /* + * For costly allocations, try direct compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + */ + if (can_compact && (costly_order || (order > 0 && + ac->migratetype != MIGRATE_MOVABLE))) { + compact_first = true; + compact_priority = INIT_COMPACT_PRIORITY; + } + /* * The fast path uses conservative alloc_flags to succeed only until * kswapd needs to be woken up, and to avoid the cost of setting up @@ -4772,53 +4786,6 @@ restart: if (page) goto got_pg; - /* - * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. For non- - * movable high-order allocations, do that as well, as compaction will - * try prevent permanent fragmentation by migrating from blocks of the - * same migratetype. - * Don't try this for allocations that are allowed to ignore - * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 
- */ - if (can_direct_reclaim && can_compact && - (costly_order || - (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) - && !gfp_pfmemalloc_allowed(gfp_mask)) { - page = __alloc_pages_direct_compact(gfp_mask, order, - alloc_flags, ac, - INIT_COMPACT_PRIORITY, - &compact_result); - if (page) - goto got_pg; - - /* - * Checks for costly allocations with __GFP_NORETRY, which - * includes some THP page fault allocations - */ - if (costly_order && (gfp_mask & __GFP_NORETRY)) { - /* - * THP page faults may attempt local node only first, - * but are then allowed to only compact, not reclaim, - * see alloc_pages_mpol(). - * - * Compaction has failed above and we don't want such - * THP allocations to put reclaim pressure on a single - * node in a situation where other nodes might have - * plenty of available memory. - */ - if (gfp_mask & __GFP_THISNODE) - goto nopage; - - /* - * Proceed with single round of reclaim/compaction, but - * since sync compaction could be very expensive, keep - * using async compaction. - */ - compact_priority = INIT_COMPACT_PRIORITY; - } - } - retry: /* * Deal with possible cpuset update races or zonelist updates to avoid @@ -4862,10 +4829,12 @@ retry: goto nopage; /* Try direct reclaim and then allocating */ - page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, - &did_some_progress); - if (page) - goto got_pg; + if (!compact_first) { + page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, + ac, &did_some_progress); + if (page) + goto got_pg; + } /* Try direct compaction and then allocating */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, @@ -4873,6 +4842,33 @@ retry: if (page) goto got_pg; + if (compact_first) { + /* + * THP page faults may attempt local node only first, but are + * then allowed to only compact, not reclaim, see + * alloc_pages_mpol(). + * + * Compaction has failed above and we don't want such THP + * allocations to put reclaim pressure on a single node in a + * situation where other nodes might have plenty of available + * memory. + */ + if (gfp_has_flags(gfp_mask, __GFP_NORETRY | __GFP_THISNODE)) + goto nopage; + + /* + * For the initial compaction attempt we have lowered its + * priority. Restore it for further retries, if those are + * allowed. With __GFP_NORETRY there will be a single round of + * reclaim and compaction with the lowered priority. + */ + if (!(gfp_mask & __GFP_NORETRY)) + compact_priority = DEF_COMPACT_PRIORITY; + + compact_first = false; + goto retry; + } + /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) goto nopage; -- cgit v1.2.3 From 2c4c3e29897d43c431b1cf9432fb66977f262ac2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 6 Jan 2026 12:52:38 +0100 Subject: mm/page_alloc: simplify __alloc_pages_slowpath() flow The actions done before entering the main retry loop include waking up kswapds and an allocation attempt with the precise alloc_flags. Then in the loop we keep waking up kswapds, and we retry the allocation with flags potentially further adjusted by being allowed to use reserves (due to e.g. becoming an OOM killer victim). We can adjust the retry loop to keep only one instance of waking up kswapds and allocation attempt. Introduce the can_retry_reserves variable for retrying once when we become eligible for reserves. 
It is still useful not to evaluate reserve_flags immediately for the first allocation attempt, because it's better to first try to succeed in a non-preferred zone above the min watermark before allocating immediately from the preferred zone below min watermark. Additionally move the cpuset update checks introduced by e05741fb10c3 ("mm/page_alloc.c: avoid infinite retries caused by cpuset race") further down the retry loop. It's enough to do the checks only before reaching any potentially infinite 'goto retry;' loop. There should be no meaningful functional changes. The change of the exact moments at which the retry for reserves and the cpuset updates are checked should not result in different outcomes modulo races with concurrent allocator activity. Link: https://lkml.kernel.org/r/20260106-thp-thisnode-tweak-v3-3-f5d67c21a193@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Joshua Hahn Cc: Brendan Jackman Cc: David Hildenbrand (Red Hat) Cc: David Rientjes Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 848c5c93ccb5..f7d777921f05 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4708,6 +4708,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; bool compact_first = false; + bool can_retry_reserves = true; if (unlikely(nofail)) { /* @@ -4775,6 +4776,8 @@ restart: goto nopage; } +retry: + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); @@ -4786,19 +4789,6 @@ restart: if (page) goto got_pg; -retry: - /* - * Deal with possible cpuset update races or zonelist updates to avoid - * infinite retries. - */ - if (check_retry_cpuset(cpuset_mems_cookie, ac) || - check_retry_zonelist(zonelist_iter_cookie)) - goto restart; - - /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ - if (alloc_flags & ALLOC_KSWAPD) - wake_all_kswapds(order, gfp_mask, ac); - reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | @@ -4813,12 +4803,18 @@ retry: ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); - } - /* Attempt with potentially adjusted zonelist and alloc_flags */ - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - if (page) - goto got_pg; + /* + * The first time we adjust anything due to being allowed to + * ignore memory policies or watermarks, retry immediately. This + * allows us to keep the first allocation attempt optimistic so + * it can succeed in a zone that is still above watermarks. + */ + if (can_retry_reserves) { + can_retry_reserves = false; + goto retry; + } + } /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) @@ -4881,6 +4877,15 @@ retry: !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * infinite retries. No "goto retry;" can be placed above this check + * unless it can execute just once. 
+ */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; -- cgit v1.2.3 From e77786b4682e69336e3de3eaeb12ec994027f611 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:09 -0800 Subject: memcg: introduce private id API for in-kernel users Patch series "memcg: separate private and public ID namespaces". The memory cgroup subsystem maintains a private ID infrastructure that is decoupled from the cgroup IDs. This private ID system exists because some kernel objects (like swap entries and shadow entries in the workingset code) can outlive the cgroup they were associated with. The motivation is best described in commit 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs"). Unfortunately, some in-kernel users (DAMON, LRU gen debugfs interface, shrinker debugfs) started exposing these private IDs to userspace. This is problematic because: 1. The private IDs are internal implementation details that could change 2. Userspace already has access to cgroup IDs through the cgroup filesystem 3. Using different ID namespaces in different interfaces is confusing This series cleans up the memcg ID infrastructure by: 1. Explicitly marking the private ID APIs with "private" in their names to make it clear they are for internal use only (swap/workingset) 2. Making the public cgroup ID APIs (mem_cgroup_id/mem_cgroup_get_from_id) unconditionally available 3. Converting DAMON, LRU gen, and shrinker debugfs interfaces to use the public cgroup IDs instead of the private IDs 4. Removing the now-unused wrapper functions and renaming the public APIs for clarity After this series: - mem_cgroup_private_id() / mem_cgroup_from_private_id() are used for internal kernel objects that outlive their cgroup (swap, workingset) - mem_cgroup_id() / mem_cgroup_get_from_id() return the public cgroup ID (from cgroup_id()) for use in userspace-facing interfaces This patch (of 8): The memory cgroup maintains a private ID infrastructure decoupled from the cgroup IDs for swapout records and shadow entries. The main motivation of this private ID infra is best described in the commit 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs"). Unfortunately some users have started exposing these private IDs to the userspace where they should have used the cgroup IDs which are already exposed to the userspace. Let's rename the memcg ID APIs to explicitly mark them private. No functional change is intended. 
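After the rename, the swap and workingset paths read, for example (taken from the conversions in the diff below):

	/* recording ownership at swapout / swap-charge time: */
	swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry);

	/* looking the memcg back up from a swap entry or shadow entry: */
	memcg = mem_cgroup_from_private_id(id);

mem_cgroup_id() and mem_cgroup_from_id() are kept for now as thin wrappers around the private variants; later patches in the series convert or remove their remaining users.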
Link: https://lkml.kernel.org/r/20251225232116.294540-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20251225232116.294540-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 24 +++++++++++++++++--- mm/list_lru.c | 2 +- mm/memcontrol-v1.c | 6 ++--- mm/memcontrol-v1.h | 4 ++-- mm/memcontrol.c | 55 +++++++++++++++++++++++++--------------------- mm/workingset.c | 8 +++---- 6 files changed, 61 insertions(+), 38 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fd400082313a..1c4224bcfb23 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -65,7 +65,7 @@ struct mem_cgroup_reclaim_cookie { #define MEM_CGROUP_ID_SHIFT 16 -struct mem_cgroup_id { +struct mem_cgroup_private_id { int id; refcount_t ref; }; @@ -191,7 +191,7 @@ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. Used to ID objects that outlive the cgroup */ - struct mem_cgroup_id id; + struct mem_cgroup_private_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ @@ -821,13 +821,19 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } +struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id); + +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + return mem_cgroup_private_id(memcg); +} struct mem_cgroup *mem_cgroup_from_id(unsigned short id); #ifdef CONFIG_SHRINKER_DEBUG @@ -1290,6 +1296,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) +{ + return 0; +} + +static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) +{ + WARN_ON_ONCE(id); + /* XXX: This should always return root_mem_cgroup */ + return NULL; +} + #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { diff --git a/mm/list_lru.c b/mm/list_lru.c index 37b642f6cbda..13b9f66d950e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -369,7 +369,7 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); - memcg = mem_cgroup_from_id(index); + memcg = mem_cgroup_from_private_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 0b50cb122ff3..0e3d972fad33 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -635,14 +635,14 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) * have an ID allocated to it anymore, charge the closest online * ancestor for the swap instead and transfer the memory+swap charge. 
*/ - swap_memcg = mem_cgroup_id_get_online(memcg); + swap_memcg = mem_cgroup_private_id_get_online(memcg); nr_entries = folio_nr_pages(folio); /* Get references for the tail pages, too */ if (nr_entries > 1) - mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); + mem_cgroup_private_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); + swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index e92b21af92b1..49933925b4ba 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -28,8 +28,8 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); int memory_stat_show(struct seq_file *m, void *v); -void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); -struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg); +void mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, unsigned int n); +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg); /* Cgroup v1-specific declarations */ #ifdef CONFIG_MEMCG_V1 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 75fc22a33b28..25ad8433df2e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3554,38 +3554,38 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) */ #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) -static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids); +static DEFINE_XARRAY_ALLOC1(mem_cgroup_private_ids); -static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +static void mem_cgroup_private_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { - xa_erase(&mem_cgroup_ids, memcg->id.id); + xa_erase(&mem_cgroup_private_ids, memcg->id.id); memcg->id.id = 0; } } -void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, +void __maybe_unused mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, unsigned int n) { refcount_add(n, &memcg->id.ref); } -static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) +static void mem_cgroup_private_id_put_many(struct mem_cgroup *memcg, unsigned int n) { if (refcount_sub_and_test(n, &memcg->id.ref)) { - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); /* Memcg ID pins CSS */ css_put(&memcg->css); } } -static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +static inline void mem_cgroup_private_id_put(struct mem_cgroup *memcg) { - mem_cgroup_id_put_many(memcg, 1); + mem_cgroup_private_id_put_many(memcg, 1); } -struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { /* @@ -3604,15 +3604,20 @@ struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) } /** - * mem_cgroup_from_id - look up a memcg from a memcg id + * mem_cgroup_from_private_id - look up a memcg from a memcg id * @id: the memcg id to look up * * Caller must hold rcu_read_lock(). 
*/ -struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); - return xa_load(&mem_cgroup_ids, id); + return xa_load(&mem_cgroup_private_ids, id); +} + +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + return mem_cgroup_from_private_id(id); } #ifdef CONFIG_SHRINKER_DEBUG @@ -3711,7 +3716,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg) return ERR_PTR(-ENOMEM); - error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL, + error = xa_alloc(&mem_cgroup_private_ids, &memcg->id.id, NULL, XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL); if (error) goto fail; @@ -3771,7 +3776,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) lru_gen_init_memcg(memcg); return memcg; fail: - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); __mem_cgroup_free(memcg); return ERR_PTR(error); } @@ -3854,7 +3859,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) css_get(css); /* - * Ensure mem_cgroup_from_id() works once we're fully online. + * Ensure mem_cgroup_from_private_id() works once we're fully online. * * We could do this earlier and require callers to filter with * css_tryget_online(). But right now there are no users that @@ -3863,13 +3868,13 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) * publish it here at the end of onlining. This matches the * regular ID destruction during offlining. */ - xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); + xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; offline_kmem: memcg_offline_kmem(memcg); remove_id: - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); return -ENOMEM; } @@ -3892,7 +3897,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) drain_all_stock(memcg); - mem_cgroup_id_put(memcg); + mem_cgroup_private_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -4779,7 +4784,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, id = lookup_swap_cgroup_id(entry); rcu_read_lock(); - memcg = mem_cgroup_from_id(id); + memcg = mem_cgroup_from_private_id(id); if (!memcg || !css_tryget_online(&memcg->css)) memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); @@ -5174,22 +5179,22 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) return 0; } - memcg = mem_cgroup_id_get_online(memcg); + memcg = mem_cgroup_private_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); - mem_cgroup_id_put(memcg); + mem_cgroup_private_id_put(memcg); return -ENOMEM; } /* Get references for the tail pages, too */ if (nr_pages > 1) - mem_cgroup_id_get_many(memcg, nr_pages - 1); + mem_cgroup_private_id_get_many(memcg, nr_pages - 1); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); + swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry); return 0; } @@ -5206,7 +5211,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) id = swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); - memcg = mem_cgroup_from_id(id); + memcg = mem_cgroup_from_private_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { if (do_memsw_account()) @@ -5215,7 +5220,7 @@ void 
__mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); - mem_cgroup_id_put_many(memcg, nr_pages); + mem_cgroup_private_id_put_many(memcg, nr_pages); } rcu_read_unlock(); } diff --git a/mm/workingset.c b/mm/workingset.c index e9f05634747a..13422d304715 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -254,7 +254,7 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); + return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset); } /* @@ -271,7 +271,7 @@ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); - memcg = mem_cgroup_from_id(memcg_id); + memcg = mem_cgroup_from_private_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); @@ -395,7 +395,7 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); @@ -456,7 +456,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * would be better if the root_mem_cgroup existed in all * configurations instead. */ - eviction_memcg = mem_cgroup_from_id(memcgid); + eviction_memcg = mem_cgroup_from_private_id(memcgid); if (!mem_cgroup_tryget(eviction_memcg)) eviction_memcg = NULL; rcu_read_unlock(); -- cgit v1.2.3 From 1d89d7fd592e2490cadd13c253d7b1b9f6116be8 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:10 -0800 Subject: memcg: expose mem_cgroup_ino() and mem_cgroup_get_from_ino() unconditionally Remove the CONFIG_SHRINKER_DEBUG guards around mem_cgroup_ino() and mem_cgroup_get_from_ino(). These APIs provide a way to get a memcg's cgroup inode number and to look up a memcg from an inode number respectively. Making these functions unconditionally available allows other in-kernel users to leverage them without requiring CONFIG_SHRINKER_DEBUG to be enabled. No functional change for existing users. Link: https://lkml.kernel.org/r/20251225232116.294540-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ---- mm/memcontrol.c | 2 -- 2 files changed, 6 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1c4224bcfb23..77f32be26ea8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -836,14 +836,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); -#ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? 
cgroup_ino(memcg->css.cgroup) : 0; } struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1308,7 +1306,6 @@ static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return NULL; } -#ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; @@ -1318,7 +1315,6 @@ static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 25ad8433df2e..e85816960e38 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3620,7 +3620,6 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_private_id(id); } -#ifdef CONFIG_SHRINKER_DEBUG struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { struct cgroup *cgrp; @@ -3641,7 +3640,6 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) return memcg; } -#endif static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) { -- cgit v1.2.3 From 510e12900298ca6dbc474f5f418b21532f2ad101 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:11 -0800 Subject: memcg: mem_cgroup_get_from_ino() returns NULL on error Change mem_cgroup_get_from_ino() to return NULL on error instead of ERR_PTR values. This simplifies the API: NULL indicates failure, and a valid pointer indicates success with a CSS reference held that the caller must release via mem_cgroup_put(). Link: https://lkml.kernel.org/r/20251225232116.294540-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++---- mm/shrinker_debug.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e85816960e38..92beb74482fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3624,17 +3624,15 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { struct cgroup *cgrp; struct cgroup_subsys_state *css; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; cgrp = cgroup_get_from_id(ino); if (IS_ERR(cgrp)) - return ERR_CAST(cgrp); + return NULL; css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); if (css) memcg = container_of(css, struct mem_cgroup, css); - else - memcg = ERR_PTR(-ENOENT); cgroup_put(cgrp); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 20eaee3e97f7..8aaeb8f5c3af 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -130,7 +130,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, if (shrinker->flags & SHRINKER_MEMCG_AWARE) { memcg = mem_cgroup_get_from_ino(ino); - if (!memcg || IS_ERR(memcg)) + if (!memcg) return -ENOENT; if (!mem_cgroup_online(memcg)) { -- cgit v1.2.3 From ea73e364716023b1a47d58b9f12e7c92f3b1e6a7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:12 -0800 Subject: memcg: use cgroup_id() instead of cgroup_ino() for memcg ID Switch mem_cgroup_ino() from using cgroup_ino() to cgroup_id(). The cgroup_ino() returns the kernfs inode number while cgroup_id() returns the kernfs node ID. For 64-bit systems, they are the same. 
Also cgroup_get_from_id() expects 64-bit node ID which is called by mem_cgroup_get_from_ino(). Change the type from unsigned long to u64 to match cgroup_id()'s return type, and update the format specifiers accordingly. Note that the names mem_cgroup_ino() and mem_cgroup_get_from_ino() are now misnomers since they deal with cgroup IDs rather than inode numbers. A follow-up patch will rename them. Link: https://lkml.kernel.org/r/20251225232116.294540-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 +++++----- mm/memcontrol.c | 2 +- mm/shrinker_debug.c | 7 ++++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 77f32be26ea8..c823150ec288 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -836,12 +836,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); -static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) { - return memcg ? cgroup_ino(memcg->css.cgroup) : 0; + return memcg ? cgroup_id(memcg->css.cgroup) : 0; } -struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); +struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino); static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1306,12 +1306,12 @@ static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return NULL; } -static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } -static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +static inline struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino) { return NULL; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92beb74482fa..1ff2f9bd820c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3620,7 +3620,7 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_private_id(id); } -struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino) { struct cgroup *cgrp; struct cgroup_subsys_state *css; diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 8aaeb8f5c3af..7ef16a0b2959 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -70,7 +70,7 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) memcg_aware ? 
memcg : NULL, count_per_node); if (total) { - seq_printf(m, "%lu", mem_cgroup_ino(memcg)); + seq_printf(m, "%llu", mem_cgroup_ino(memcg)); for_each_node(nid) seq_printf(m, " %lu", count_per_node[nid]); seq_putc(m, '\n'); @@ -106,7 +106,8 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, size_t size, loff_t *pos) { struct shrinker *shrinker = file->private_data; - unsigned long nr_to_scan = 0, ino, read_len; + unsigned long nr_to_scan = 0, read_len; + u64 ino; struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; @@ -119,7 +120,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EFAULT; kbuf[read_len] = '\0'; - if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3) + if (sscanf(kbuf, "%llu %d %lu", &ino, &nid, &nr_to_scan) != 3) return -EINVAL; if (nid < 0 || nid >= nr_node_ids) -- cgit v1.2.3 From 5866891a7ab1348686da70f70e925964d9227bf5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:13 -0800 Subject: mm/damon: use cgroup ID instead of private memcg ID DAMON was using the internal private memcg ID which is meant for tracking kernel objects that outlive their cgroup. Switch to using the public cgroup ID instead. Link: https://lkml.kernel.org/r/20251225232116.294540-6-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: SeongJae Park Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++-- mm/damon/core.c | 7 ++----- mm/damon/ops-common.c | 2 +- mm/damon/sysfs-schemes.c | 8 ++++---- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index a67292a2f09d..650e7ecfa32b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -203,7 +203,7 @@ struct damos_quota_goal { u64 last_psi_total; struct { int nid; - unsigned short memcg_id; + u64 memcg_id; }; }; struct list_head list; @@ -419,7 +419,7 @@ struct damos_filter { bool matching; bool allow; union { - unsigned short memcg_id; + u64 memcg_id; struct damon_addr_range addr_range; int target_idx; struct damon_size_range sz_range; diff --git a/mm/damon/core.c b/mm/damon/core.c index 7f0028e23f92..3edbff685534 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2094,16 +2094,13 @@ static unsigned long damos_get_node_memcg_used_bp( unsigned long used_pages, numerator; struct sysinfo i; - rcu_read_lock(); - memcg = mem_cgroup_from_id(goal->memcg_id); - if (!memcg || !mem_cgroup_tryget(memcg)) { - rcu_read_unlock(); + memcg = mem_cgroup_get_from_ino(goal->memcg_id); + if (!memcg) { if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) return 0; else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ return 10000; } - rcu_read_unlock(); mem_cgroup_flush_stats(memcg); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid)); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index a218d9922234..dd81db95f901 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -274,7 +274,7 @@ bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio) if (!memcg) matched = false; else - matched = filter->memcg_id == mem_cgroup_id(memcg); + matched = filter->memcg_id == mem_cgroup_ino(memcg); rcu_read_unlock(); break; case DAMOS_FILTER_TYPE_YOUNG: diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 19bc2288cd68..6125f259ecea 100644 --- a/mm/damon/sysfs-schemes.c +++ 
b/mm/damon/sysfs-schemes.c @@ -2494,7 +2494,7 @@ static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, return false; } -static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) +static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) { struct mem_cgroup *memcg; char *path; @@ -2509,11 +2509,11 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; memcg = mem_cgroup_iter(NULL, memcg, NULL)) { - /* skip removed memcg */ - if (!mem_cgroup_id(memcg)) + /* skip offlined memcg */ + if (!mem_cgroup_online(memcg)) continue; if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { - *id = mem_cgroup_id(memcg); + *id = mem_cgroup_ino(memcg); found = true; break; } -- cgit v1.2.3 From 20ccbd89afe425eda2de48a9a701916d98c1f306 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:14 -0800 Subject: mm/vmscan: use cgroup ID instead of private memcg ID in lru_gen interface The LRU gen debugfs interface was using the internal private memcg ID which is meant for tracking kernel objects that outlive their cgroup. Switch to using the public cgroup ID instead. Link: https://lkml.kernel.org/r/20251225232116.294540-7-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/vmscan.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 619691aa4393..b87baf3fc77f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5416,7 +5416,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) if (memcg) cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); #endif - seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); + seq_printf(m, "memcg %llu %s\n", mem_cgroup_ino(memcg), path); } seq_printf(m, " node %5d\n", nid); @@ -5501,7 +5501,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co return -EINTR; } -static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, +static int run_cmd(char cmd, u64 memcg_id, int nid, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long opt) { struct lruvec *lruvec; @@ -5512,19 +5512,12 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, return -EINVAL; if (!mem_cgroup_disabled()) { - rcu_read_lock(); - - memcg = mem_cgroup_from_id(memcg_id); - if (!mem_cgroup_tryget(memcg)) - memcg = NULL; - - rcu_read_unlock(); - + memcg = mem_cgroup_get_from_ino(memcg_id); if (!memcg) return -EINVAL; } - if (memcg_id != mem_cgroup_id(memcg)) + if (memcg_id != mem_cgroup_ino(memcg)) goto done; sc->target_mem_cgroup = memcg; @@ -5591,7 +5584,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, int n; int end; char cmd, swap_string[5]; - unsigned int memcg_id; + u64 memcg_id; unsigned int nid; unsigned long seq; unsigned int swappiness; @@ -5601,7 +5594,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, if (!*cur) continue; - n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, + n = sscanf(cur, "%c %llu %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, &seq, &end, swap_string, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; -- cgit v1.2.3 From 2202e3a8cb80da583670034ee33c995513708949 
Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:15 -0800 Subject: memcg: remove unused mem_cgroup_id() and mem_cgroup_from_id() Now that all callers have been converted to use either: - The private ID APIs (mem_cgroup_private_id/mem_cgroup_from_private_id) for internal kernel objects that outlive their cgroup - The public cgroup ID APIs (mem_cgroup_ino/mem_cgroup_get_from_ino) for external interfaces Remove the unused wrapper functions mem_cgroup_id() and mem_cgroup_from_id() along with their !CONFIG_MEMCG stubs. Link: https://lkml.kernel.org/r/20251225232116.294540-8-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: SeongJae Park Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 18 ------------------ mm/memcontrol.c | 5 ----- 2 files changed, 23 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c823150ec288..3e7d69020b39 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -830,12 +830,6 @@ static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id); -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return mem_cgroup_private_id(memcg); -} -struct mem_cgroup *mem_cgroup_from_id(unsigned short id); - static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_id(memcg->css.cgroup) : 0; @@ -1282,18 +1276,6 @@ static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { } -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return 0; -} - -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - WARN_ON_ONCE(id); - /* XXX: This should always return root_mem_cgroup */ - return NULL; -} - static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ff2f9bd820c..ede39dde05df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3615,11 +3615,6 @@ struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return xa_load(&mem_cgroup_private_ids, id); } -struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - return mem_cgroup_from_private_id(id); -} - struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino) { struct cgroup *cgrp; -- cgit v1.2.3 From 95296536eb19c969e91684287cf3bfcb382221d3 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Dec 2025 15:21:16 -0800 Subject: memcg: rename mem_cgroup_ino() to mem_cgroup_id() Rename mem_cgroup_ino() to mem_cgroup_id() and mem_cgroup_get_from_ino() to mem_cgroup_get_from_id(). These functions now use cgroup IDs (from cgroup_id()) rather than inode numbers, so the names should reflect that. 
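For illustration only (not part of this patch), a consumer of the renamed pair might look like the sketch below; mem_cgroup_get_from_id() and mem_cgroup_id() are the helpers from this series, while the surrounding function and its name are hypothetical:

	/*
	 * Hypothetical sketch: resolve a 64-bit cgroup ID received from an
	 * external interface (debugfs, sysfs, ...) to a memcg, following the
	 * pattern used by the converted callers.
	 */
	static struct mem_cgroup *example_get_memcg(u64 id)
	{
		struct mem_cgroup *memcg;

		memcg = mem_cgroup_get_from_id(id);
		if (!memcg)
			return NULL;

		/* reject stale IDs, as run_cmd() does */
		if (id != mem_cgroup_id(memcg)) {
			mem_cgroup_put(memcg);
			return NULL;
		}

		return memcg;	/* caller drops the reference with mem_cgroup_put() */
	}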
[shakeel.butt@linux.dev: replace ino with id, per SeongJae] Link: https://lkml.kernel.org/r/flkqanhyettp5uq22bjwg37rtmnpeg3mghznsylxcxxgaafpl4@nov2x7tagma7 [akpm@linux-foundation.org: build fix] Link: https://lkml.kernel.org/r/20251225232116.294540-9-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: SeongJae Park Cc: Axel Rasmussen Cc: Dave Chinner Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 ++++---- mm/damon/core.c | 2 +- mm/damon/ops-common.c | 2 +- mm/damon/sysfs-schemes.c | 2 +- mm/memcontrol.c | 4 ++-- mm/shrinker_debug.c | 10 +++++----- mm/vmscan.c | 6 +++--- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3e7d69020b39..ed4764e1a30e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -830,12 +830,12 @@ static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id); -static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_id(struct mem_cgroup *memcg) { return memcg ? cgroup_id(memcg->css.cgroup) : 0; } -struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino); +struct mem_cgroup *mem_cgroup_get_from_id(u64 id); static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1288,12 +1288,12 @@ static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return NULL; } -static inline u64 mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } -static inline struct mem_cgroup *mem_cgroup_get_from_ino(u64 ino) +static inline struct mem_cgroup *mem_cgroup_get_from_id(u64 id) { return NULL; } diff --git a/mm/damon/core.c b/mm/damon/core.c index 3edbff685534..6888917c1a00 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2094,7 +2094,7 @@ static unsigned long damos_get_node_memcg_used_bp( unsigned long used_pages, numerator; struct sysinfo i; - memcg = mem_cgroup_get_from_ino(goal->memcg_id); + memcg = mem_cgroup_get_from_id(goal->memcg_id); if (!memcg) { if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) return 0; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index dd81db95f901..a218d9922234 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -274,7 +274,7 @@ bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio) if (!memcg) matched = false; else - matched = filter->memcg_id == mem_cgroup_ino(memcg); + matched = filter->memcg_id == mem_cgroup_id(memcg); rcu_read_unlock(); break; case DAMOS_FILTER_TYPE_YOUNG: diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 6125f259ecea..419d6e7ee945 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2513,7 +2513,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) if (!mem_cgroup_online(memcg)) continue; if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { - *id = mem_cgroup_ino(memcg); + *id = mem_cgroup_id(memcg); found = true; break; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ede39dde05df..7d6cf47e6d4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3615,13 +3615,13 @@ struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) return xa_load(&mem_cgroup_private_ids, id); } -struct mem_cgroup 
*mem_cgroup_get_from_ino(u64 ino) +struct mem_cgroup *mem_cgroup_get_from_id(u64 id) { struct cgroup *cgrp; struct cgroup_subsys_state *css; struct mem_cgroup *memcg = NULL; - cgrp = cgroup_get_from_id(ino); + cgrp = cgroup_get_from_id(id); if (IS_ERR(cgrp)) return NULL; diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 7ef16a0b2959..affa64437302 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -70,7 +70,7 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) memcg_aware ? memcg : NULL, count_per_node); if (total) { - seq_printf(m, "%llu", mem_cgroup_ino(memcg)); + seq_printf(m, "%llu", mem_cgroup_id(memcg)); for_each_node(nid) seq_printf(m, " %lu", count_per_node[nid]); seq_putc(m, '\n'); @@ -107,7 +107,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, { struct shrinker *shrinker = file->private_data; unsigned long nr_to_scan = 0, read_len; - u64 ino; + u64 id; struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; @@ -120,7 +120,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EFAULT; kbuf[read_len] = '\0'; - if (sscanf(kbuf, "%llu %d %lu", &ino, &nid, &nr_to_scan) != 3) + if (sscanf(kbuf, "%llu %d %lu", &id, &nid, &nr_to_scan) != 3) return -EINVAL; if (nid < 0 || nid >= nr_node_ids) @@ -130,7 +130,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return size; if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - memcg = mem_cgroup_get_from_ino(ino); + memcg = mem_cgroup_get_from_id(id); if (!memcg) return -ENOENT; @@ -138,7 +138,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, mem_cgroup_put(memcg); return -ENOENT; } - } else if (ino != 0) { + } else if (id != 0) { return -EINVAL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index b87baf3fc77f..4aa47ab000c2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5416,7 +5416,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) if (memcg) cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); #endif - seq_printf(m, "memcg %llu %s\n", mem_cgroup_ino(memcg), path); + seq_printf(m, "memcg %llu %s\n", mem_cgroup_id(memcg), path); } seq_printf(m, " node %5d\n", nid); @@ -5512,12 +5512,12 @@ static int run_cmd(char cmd, u64 memcg_id, int nid, unsigned long seq, return -EINVAL; if (!mem_cgroup_disabled()) { - memcg = mem_cgroup_get_from_ino(memcg_id); + memcg = mem_cgroup_get_from_id(memcg_id); if (!memcg) return -EINVAL; } - if (memcg_id != mem_cgroup_ino(memcg)) + if (memcg_id != mem_cgroup_id(memcg)) goto done; sc->target_mem_cgroup = memcg; -- cgit v1.2.3 From 0be909f114c4e82a4fe5964851af1ab8889dc76c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 7 Jan 2026 14:21:44 +0900 Subject: zsmalloc: use actual object size to detect spans Using class->size to detect spanning objects is not entirely correct, because some size classes can hold a range of object sizes of up to class->size bytes in length, due to size-classes merge. Such classes use padding for cases when actually written objects are smaller than class->size. zs_obj_read_begin() can incorrectly hit the slow path and perform memcpy of such objects, basically copying padding bytes. Instead of class->size zs_obj_read_begin() should use the actual compressed object length (both zram and zswap know it) so that it can correctly handle situations when a written object is small enough to fit into the first physical page. 
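To make the padding effect concrete, here is a worked example with made-up but representative numbers (the class size, object length, offset and the 8-byte handle size are assumptions for illustration, not values taken from a real pool):

	/*
	 * Illustrative numbers only:
	 *   PAGE_SIZE         = 4096
	 *   class->size       = 1300   (merged class, i.e. an upper bound)
	 *   actual object len = 1000   (what zram/zswap actually wrote)
	 *   off               = 3000   (object offset within its page)
	 *
	 * Old check:  off + class->size = 4300 > PAGE_SIZE
	 *             -> treated as spanning two pages, memcpy of
	 *                class->size bytes (object plus padding) into
	 *                local_copy.
	 * New check:  off + len (+ ZS_HANDLE_SIZE for !ZsHugePage, assumed
	 *             8 bytes) = 3000 + 1000 + 8 = 4008 <= PAGE_SIZE
	 *             -> object is mapped in place, no copy at all.
	 */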
Link: https://lkml.kernel.org/r/20260107052145.3586917-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Yosry Ahmed [zsmalloc & zswap] Reviewed-by: Nhat Pham Cc: Brian Geffon Cc: Chengming Zhou Cc: Jens Axboe Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 14 ++++++++------ include/linux/zsmalloc.h | 4 ++-- mm/zsmalloc.c | 16 +++++++++++----- mm/zswap.c | 5 +++-- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1d6760b3b557..f92845ef9192 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2062,11 +2062,11 @@ static int read_incompressible_page(struct zram *zram, struct page *page, void *src, *dst; handle = get_slot_handle(zram, index); - src = zs_obj_read_begin(zram->mem_pool, handle, NULL); + src = zs_obj_read_begin(zram->mem_pool, handle, PAGE_SIZE, NULL); dst = kmap_local_page(page); copy_page(dst, src); kunmap_local(dst); - zs_obj_read_end(zram->mem_pool, handle, src); + zs_obj_read_end(zram->mem_pool, handle, PAGE_SIZE, src); return 0; } @@ -2084,11 +2084,12 @@ static int read_compressed_page(struct zram *zram, struct page *page, u32 index) prio = get_slot_comp_priority(zram, index); zstrm = zcomp_stream_get(zram->comps[prio]); - src = zs_obj_read_begin(zram->mem_pool, handle, zstrm->local_copy); + src = zs_obj_read_begin(zram->mem_pool, handle, size, + zstrm->local_copy); dst = kmap_local_page(page); ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst); kunmap_local(dst); - zs_obj_read_end(zram->mem_pool, handle, src); + zs_obj_read_end(zram->mem_pool, handle, size, src); zcomp_stream_put(zstrm); return ret; @@ -2111,9 +2112,10 @@ static int read_from_zspool_raw(struct zram *zram, struct page *page, u32 index) * takes place here, as we read raw compressed data. 
*/ zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); - src = zs_obj_read_begin(zram->mem_pool, handle, zstrm->local_copy); + src = zs_obj_read_begin(zram->mem_pool, handle, size, + zstrm->local_copy); memcpy_to_page(page, 0, src, size); - zs_obj_read_end(zram->mem_pool, handle, src); + zs_obj_read_end(zram->mem_pool, handle, size, src); zcomp_stream_put(zstrm); return 0; diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index f3ccff2d966c..5565c3171007 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -40,9 +40,9 @@ unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, - void *local_copy); + size_t mem_len, void *local_copy); void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, - void *handle_mem); + size_t mem_len, void *handle_mem); void zs_obj_write(struct zs_pool *pool, unsigned long handle, void *handle_mem, size_t mem_len); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 84da164dcbc5..119c196a287a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1065,7 +1065,7 @@ unsigned long zs_get_total_pages(struct zs_pool *pool) EXPORT_SYMBOL_GPL(zs_get_total_pages); void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, - void *local_copy) + size_t mem_len, void *local_copy) { struct zspage *zspage; struct zpdesc *zpdesc; @@ -1087,7 +1087,10 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + mem_len += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { /* this object is contained entirely within a page */ addr = kmap_local_zpdesc(zpdesc); addr += off; @@ -1096,7 +1099,7 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, /* this object spans two pages */ sizes[0] = PAGE_SIZE - off; - sizes[1] = class->size - sizes[0]; + sizes[1] = mem_len - sizes[0]; addr = local_copy; memcpy_from_page(addr, zpdesc_page(zpdesc), @@ -1115,7 +1118,7 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, EXPORT_SYMBOL_GPL(zs_obj_read_begin); void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, - void *handle_mem) + size_t mem_len, void *handle_mem) { struct zspage *zspage; struct zpdesc *zpdesc; @@ -1129,7 +1132,10 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + mem_len += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { if (!ZsHugePage(zspage)) off += ZS_HANDLE_SIZE; handle_mem -= off; diff --git a/mm/zswap.c b/mm/zswap.c index 6bf4f2441914..1f6c007310d8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -937,7 +937,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) u8 *src, *obj; acomp_ctx = acomp_ctx_get_cpu_lock(pool); - obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer); + obj = zs_obj_read_begin(pool->zs_pool, entry->handle, entry->length, + acomp_ctx->buffer); /* zswap entries of length PAGE_SIZE are not compressed. 
*/ if (entry->length == PAGE_SIZE) { @@ -966,7 +967,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) dlen = acomp_ctx->req->dlen; read_done: - zs_obj_read_end(pool->zs_pool, entry->handle, obj); + zs_obj_read_end(pool->zs_pool, entry->handle, entry->length, obj); acomp_ctx_put_unlock(acomp_ctx); if (!decomp_ret && dlen == PAGE_SIZE) -- cgit v1.2.3 From 19c4707b535a31dc8a6009afc249f36db7011ac3 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 7 Jan 2026 14:21:45 +0900 Subject: zsmalloc: simplify read begin/end logic zs_obj_read_begin() currently maps or copies the compressed object with the prefix handle for !ZsHugePage case. Make the logic clearer and more efficient by moving the offset of the object in the page after the prefix handle instead, only copying the actual object and avoiding the need to adjust the returned address to account for the prefix. Adjust the logic to detect spanning objects in zs_obj_read_end() accordingly, slightly simplifying it by avoiding the need to account for the handle in both the offset and the object size. Link: https://lkml.kernel.org/r/20260107052145.3586917-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Co-developed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Cc: Brian Geffon Cc: Chengming Zhou Cc: Jens Axboe Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 119c196a287a..cc3d9501ae21 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1088,7 +1088,7 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, off = offset_in_page(class->size * obj_idx); if (!ZsHugePage(zspage)) - mem_len += ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; if (off + mem_len <= PAGE_SIZE) { /* this object is contained entirely within a page */ @@ -1110,9 +1110,6 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, 0, sizes[1]); } - if (!ZsHugePage(zspage)) - addr += ZS_HANDLE_SIZE; - return addr; } EXPORT_SYMBOL_GPL(zs_obj_read_begin); @@ -1133,11 +1130,9 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, off = offset_in_page(class->size * obj_idx); if (!ZsHugePage(zspage)) - mem_len += ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; if (off + mem_len <= PAGE_SIZE) { - if (!ZsHugePage(zspage)) - off += ZS_HANDLE_SIZE; handle_mem -= off; kunmap_local(handle_mem); } -- cgit v1.2.3 From 35520a712f9956657dfd0eaf4d9e873cd96ec43a Mon Sep 17 00:00:00 2001 From: Aaron Yang Date: Wed, 7 Jan 2026 17:30:38 -0800 Subject: mm/damon/paddr: initialize 'folio' variables to NULL for clarity In damon_pa_mark_accessed_or_deactivate(), damon_pa_pageout(), damon_pa_migrate(), and damon_pa_stat(), the local variable 'folio' is declared but not initialized. Initialize 'folio' to NULL to improve code readability and maintainability. 
Link: https://patch.msgid.link/20260104013255.16962-1-yangqixiao@inspur.com Link: https://lkml.kernel.org/r/20260108013041.80601-1-sj@kernel.org Signed-off-by: Aaron Yang Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 07a8aead439e..7d887a3c0866 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -156,7 +156,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, LIST_HEAD(folio_list); bool install_young_filter = true; struct damos_filter *filter; - struct folio *folio; + struct folio *folio = NULL; /* check access in page level again by default */ damos_for_each_ops_filter(filter, s) { @@ -212,7 +212,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( unsigned long *sz_filter_passed) { phys_addr_t addr, applied = 0; - struct folio *folio; + struct folio *folio = NULL; addr = damon_pa_phys_addr(r->ar.start, addr_unit); while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { @@ -262,7 +262,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, { phys_addr_t addr, applied; LIST_HEAD(folio_list); - struct folio *folio; + struct folio *folio = NULL; addr = damon_pa_phys_addr(r->ar.start, addr_unit); while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { @@ -295,7 +295,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, unsigned long *sz_filter_passed) { phys_addr_t addr; - struct folio *folio; + struct folio *folio = NULL; if (!damos_ops_has_filter(s)) return 0; -- cgit v1.2.3 From 0cc3197bdb7ff590dd7cc1622a7fac66c240bc75 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Fri, 9 Jan 2026 21:31:51 +0800 Subject: mm/early_ioremap: print the starting physical address in __early_ioremap() The debug WARN() printing occurs after the while loop, so the 'phys_addr' reflects the last physical address rather than the actual starting physical address, which is not useful for debugging. To simplify, the WARN() statement could be moved up before the loop instead of introducing a new variable to record the original 'phys_addr' value. Additionally, swap the print order of 'slot_virt[slot]' and 'offset', as this will enhance output readability. Link: https://lkml.kernel.org/r/aa2d44c34f44c31b50285b7592ed4fd78d6f59ba.1767965415.git.houwenlong.hwl@antgroup.com Signed-off-by: Hou Wenlong Reviewed-by: Andrew Morton Reviewed-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/early_ioremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index ff35b84a7b50..3fdde074c9da 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -139,6 +139,9 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) if (WARN_ON(nrpages > NR_FIX_BTMAPS)) return NULL; + WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n", + __func__, &phys_addr, size, slot, slot_virt[slot], offset); + /* * Ok, go for it.. 
*/ @@ -152,8 +155,6 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) --idx; --nrpages; } - WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n", - __func__, &phys_addr, size, slot, offset, slot_virt[slot]); prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); return prev_map[slot]; -- cgit v1.2.3 From 5fd8391cb71982152785edc1e6f48a5c7dcadabc Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 12 Jan 2026 20:24:29 +0800 Subject: mm/early_ioremap: clean up the use of WARN() for debugging Using WARN() for debugging is strange when nothing is wrong, so replace WARN(early_ioremap_debug) with pr_warn() + dump_stack(). Link: https://lkml.kernel.org/r/d4470531ce0c03fd80f9a1be7e8d8ae1bc60fcd1.1768220636.git.houwenlong.hwl@antgroup.com Signed-off-by: Hou Wenlong Suggested-by: Mike Rapoport Acked-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: Hou Wenlong Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/early_ioremap.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 3fdde074c9da..96c29b9dc85d 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -30,6 +30,14 @@ static int __init early_ioremap_debug_setup(char *str) } early_param("early_ioremap_debug", early_ioremap_debug_setup); +#define early_ioremap_dbg(fmt, args...) \ + do { \ + if (unlikely(early_ioremap_debug)) { \ + pr_warn(fmt, ##args); \ + dump_stack(); \ + } \ + } while (0) + static int after_paging_init __initdata; pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, @@ -139,8 +147,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) if (WARN_ON(nrpages > NR_FIX_BTMAPS)) return NULL; - WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n", - __func__, &phys_addr, size, slot, slot_virt[slot], offset); + early_ioremap_dbg("%s(%pa, %08lx) [%d] => %08lx + %08lx\n", + __func__, &phys_addr, size, slot, slot_virt[slot], offset); /* * Ok, go for it.. @@ -185,8 +193,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) __func__, addr, size, slot, prev_size[slot])) return; - WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n", - __func__, addr, size, slot); + early_ioremap_dbg("%s(%p, %08lx) [%d]\n", __func__, addr, size, slot); virt_addr = (unsigned long)addr; if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) -- cgit v1.2.3 From 5747435e0fd474c24530ef1a6822f47e7d264b27 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Mon, 12 Jan 2026 16:06:12 +0530 Subject: mm/vmalloc: prevent RCU stalls in kasan_release_vmalloc_node When CONFIG_PAGE_OWNER is enabled, freeing KASAN shadow pages during vmalloc cleanup triggers expensive stack unwinding that acquires RCU read locks. Processing a large purge_list without rescheduling can cause the task to hold CPU for extended periods (10+ seconds), leading to RCU stalls and potential OOM conditions. The issue manifests in purge_vmap_node() -> kasan_release_vmalloc_node() where iterating through hundreds or thousands of vmap_area entries and freeing their associated shadow pages causes: rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: Tasks blocked on level-0 rcu_node (CPUs 0-1): P6229/1:b..l ... task:kworker/0:17 state:R running task stack:28840 pid:6229 ... 
kasan_release_vmalloc_node+0x1ba/0xad0 mm/vmalloc.c:2299 purge_vmap_node+0x1ba/0xad0 mm/vmalloc.c:2299 Each call to kasan_release_vmalloc() can free many pages, and with page_owner tracking, each free triggers save_stack(), which performs stack unwinding under the RCU read lock. Without yielding, this creates an unbounded RCU critical section. Add periodic cond_resched() calls within the loop to allow: - RCU grace periods to complete - Other tasks to run - Scheduler to preempt when needed The fix uses need_resched() for immediate response under load, with a batch count of 32 as a guaranteed upper bound to prevent worst-case stalls even under light load. Link: https://lkml.kernel.org/r/20260112103612.627247-1-kartikey406@gmail.com Signed-off-by: Deepanshu Kartikey Reported-by: syzbot+d8d4c31d40f868eaea30@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d8d4c31d40f868eaea30 Link: https://lore.kernel.org/all/20260112084723.622910-1-kartikey406@gmail.com/T/ [v1] Suggested-by: Uladzislau Rezki Reviewed-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 32d6ee92d4ff..ca4c65328687 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2273,11 +2273,14 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) reclaim_list_global(&decay_list); } +#define KASAN_RELEASE_BATCH_SIZE 32 + static void kasan_release_vmalloc_node(struct vmap_node *vn) { struct vmap_area *va; unsigned long start, end; + unsigned int batch_count = 0; start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start; end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end; @@ -2287,6 +2290,11 @@ kasan_release_vmalloc_node(struct vmap_node *vn) kasan_release_vmalloc(va->va_start, va->va_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE); + + if (need_resched() || (++batch_count >= KASAN_RELEASE_BATCH_SIZE)) { + cond_resched(); + batch_count = 0; + } } kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH); -- cgit v1.2.3 From 01152bd2e44d6bcecd3573d653221ba3944ed0f1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:31 +0800 Subject: mm: debug_vm_pgtable: add debug_vm_pgtable_free_huge_page() Patch series "mm: hugetlb: allocate frozen gigantic folio", v6. Introduce alloc_contig_frozen_pages() and cma_alloc_frozen_compound(), which avoid atomic operations on the page refcount, then convert hugetlb to allocate frozen gigantic folios with the new helpers, cleaning up alloc_gigantic_folio(). This patch (of 6): Add a new helper to free huge pages, for consistency with debug_vm_pgtable_alloc_huge_page(), and use HPAGE_PUD_ORDER instead of open-coding it. Also move free_contig_range() under CONFIG_CONTIG_ALLOC since all callers are built with CONFIG_CONTIG_ALLOC.
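To make the series' direction concrete, a simplified before/after sketch follows (illustrative only; error handling, the CMA fallback and the exact hugetlb code are omitted, and the "after" shape is an assumption based on the helpers added later in the series):

	/* Before: allocate with an elevated refcount, then freeze it. */
	folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
	if (folio && !folio_ref_freeze(folio, 1))
		folio = NULL;	/* simplified: real code backs out of the race */

	/* After (assumed end state): pages come back frozen, refcount 0. */
	page = alloc_contig_frozen_pages(1 << order, gfp_mask, nid, nodemask);
	folio = page ? page_folio(page) : NULL;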
Link: https://lkml.kernel.org/r/20260109093136.1491549-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Muchun Song Reviewed-by: Sidhartha Kumar Cc: Brendan Jackman Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- mm/debug_vm_pgtable.c | 38 +++++++++++++++++--------------------- mm/page_alloc.c | 2 +- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f9fdc99ae594..627157972f6a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -444,8 +444,8 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_ int nid, nodemask_t *nodemask); #define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) -#endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); +#endif #ifdef CONFIG_CONTIG_ALLOC static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index ae9b9310d96f..83cf07269f13 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -971,22 +971,26 @@ static unsigned long __init get_random_vaddr(void) return random_vaddr; } -static void __init destroy_args(struct pgtable_debug_args *args) +static void __init +debug_vm_pgtable_free_huge_page(struct pgtable_debug_args *args, + unsigned long pfn, int order) { - struct page *page = NULL; +#ifdef CONFIG_CONTIG_ALLOC + if (args->is_contiguous_page) { + free_contig_range(pfn, 1 << order); + return; + } +#endif + __free_pages(pfn_to_page(pfn), order); +} +static void __init destroy_args(struct pgtable_debug_args *args) +{ /* Free (huge) page */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_pud_hugepage() && args->pud_pfn != ULONG_MAX) { - if (args->is_contiguous_page) { - free_contig_range(args->pud_pfn, - (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT))); - } else { - page = pfn_to_page(args->pud_pfn); - __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT); - } - + debug_vm_pgtable_free_huge_page(args, args->pud_pfn, HPAGE_PUD_ORDER); args->pud_pfn = ULONG_MAX; args->pmd_pfn = ULONG_MAX; args->pte_pfn = ULONG_MAX; @@ -995,20 +999,13 @@ static void __init destroy_args(struct pgtable_debug_args *args) if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage() && args->pmd_pfn != ULONG_MAX) { - if (args->is_contiguous_page) { - free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER)); - } else { - page = pfn_to_page(args->pmd_pfn); - __free_pages(page, HPAGE_PMD_ORDER); - } - + debug_vm_pgtable_free_huge_page(args, args->pmd_pfn, HPAGE_PMD_ORDER); args->pmd_pfn = ULONG_MAX; args->pte_pfn = ULONG_MAX; } if (args->pte_pfn != ULONG_MAX) { - page = pfn_to_page(args->pte_pfn); - __free_page(page); + __free_page(pfn_to_page(args->pte_pfn)); args->pte_pfn = ULONG_MAX; } @@ -1242,8 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args) */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_pud_hugepage()) { - page = debug_vm_pgtable_alloc_huge_page(args, - HPAGE_PUD_SHIFT - PAGE_SHIFT); + page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PUD_ORDER); if (page) { args->pud_pfn = page_to_pfn(page); args->pmd_pfn = args->pud_pfn; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f7d777921f05..c0b048584769 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7255,7 +7255,6 @@ retry: } 
return NULL; } -#endif /* CONFIG_CONTIG_ALLOC */ void free_contig_range(unsigned long pfn, unsigned long nr_pages) { @@ -7282,6 +7281,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) WARN(count != 0, "%lu pages are still in use!\n", count); } EXPORT_SYMBOL(free_contig_range); +#endif /* CONFIG_CONTIG_ALLOC */ /* * Effectively disable pcplists for the zone by setting the high limit to 0 -- cgit v1.2.3 From a9deb800b89efb2050453f7178e73b1d8b124e0f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:32 +0800 Subject: mm: page_alloc: add __split_page() Factor out the splitting of non-compound page from make_alloc_exact() and split_page() into a new helper function __split_page(). While at it, convert the VM_BUG_ON_PAGE() into a VM_WARN_ON_PAGE(). Link: https://lkml.kernel.org/r/20260109093136.1491549-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Acked-by: Muchun Song Reviewed-by: Zi Yan Reviewed-by: Sidhartha Kumar Cc: Brendan Jackman Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 10 ++++++++++ mm/page_alloc.c | 21 +++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 14a45979cccc..ab60ffba08f5 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -47,6 +47,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); BUG(); \ } \ } while (0) +#define VM_WARN_ON_PAGE(cond, page) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_page(page, "VM_WARN_ON_PAGE(" __stringify(cond)")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ @@ -122,6 +131,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c0b048584769..3b99296eda5b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3107,6 +3107,15 @@ void free_unref_folios(struct folio_batch *folios) folio_batch_reinit(folios); } +static void __split_page(struct page *page, unsigned int order) +{ + VM_WARN_ON_PAGE(PageCompound(page), page); + + split_page_owner(page, order, 0); + pgalloc_tag_split(page_folio(page), order, 0); + split_page_memcg(page, order); +} + /* * split_page takes a non-compound higher-order page, and splits it into * n (1< Date: Fri, 9 Jan 2026 17:31:33 +0800 Subject: mm: cma: kill cma_pages_valid() Kill cma_pages_valid() which only used in cma_release(), also cleanup code duplication between cma pages valid checking and cma memrange finding. 
Link: https://lkml.kernel.org/r/20260109093136.1491549-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Reviewed-by: Zi Yan Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Brendan Jackman Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/cma.h | 1 - mm/cma.c | 48 +++++++++++------------------------------------- 2 files changed, 11 insertions(+), 38 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 62d9c1cf6326..e5745d2aec55 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -49,7 +49,6 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); diff --git a/mm/cma.c b/mm/cma.c index 813e6dc7b095..fe3a9eaac4e5 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -942,36 +942,6 @@ struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) return page ? page_folio(page) : NULL; } -bool cma_pages_valid(struct cma *cma, const struct page *pages, - unsigned long count) -{ - unsigned long pfn, end; - int r; - struct cma_memrange *cmr; - bool ret; - - if (!cma || !pages || count > cma->count) - return false; - - pfn = page_to_pfn(pages); - ret = false; - - for (r = 0; r < cma->nranges; r++) { - cmr = &cma->ranges[r]; - end = cmr->base_pfn + cmr->count; - if (pfn >= cmr->base_pfn && pfn < end) { - ret = pfn + count <= end; - break; - } - } - - if (!ret) - pr_debug("%s(page %p, count %lu)\n", - __func__, (void *)pages, count); - - return ret; -} - /** * cma_release() - release allocated pages * @cma: Contiguous memory region for which the allocation is performed. @@ -991,23 +961,27 @@ bool cma_release(struct cma *cma, const struct page *pages, pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); - if (!cma_pages_valid(cma, pages, count)) + if (!cma || !pages || count > cma->count) return false; pfn = page_to_pfn(pages); - end_pfn = pfn + count; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; - if (pfn >= cmr->base_pfn && - pfn < (cmr->base_pfn + cmr->count)) { - VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count); - break; + end_pfn = cmr->base_pfn + cmr->count; + if (pfn >= cmr->base_pfn && pfn < end_pfn) { + if (pfn + count <= end_pfn) + break; + + VM_WARN_ON_ONCE(1); } } - if (r == cma->nranges) + if (r == cma->nranges) { + pr_debug("%s(page %p, count %lu, no cma range matches the page range)\n", + __func__, (void *)pages, count); return false; + } free_contig_range(pfn, count); cma_clear_bitmap(cma, cmr, pfn, count); -- cgit v1.2.3 From e0c1326779cc1b8e3a9e30ae273b89202ed4c82c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:34 +0800 Subject: mm: page_alloc: add alloc_contig_frozen_{range,pages}() In order to allocate given range of pages or allocate compound pages without incrementing their refcount, adding two new helper alloc_contig_frozen_{range,pages}() which may be beneficial to some users (eg hugetlb). 
The new alloc_contig_{range,pages} only take !__GFP_COMP gfp now, and the free_contig_range() is refactored to only free non-compound pages, the only caller to free compound pages in cma_free_folio() is changed accordingly, and the free_contig_frozen_range() is provided to match the alloc_contig_frozen_range(), which is used to free frozen pages. Link: https://lkml.kernel.org/r/20260109093136.1491549-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Reviewed-by: Sidhartha Kumar Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/gfp.h | 52 ++++++--------- mm/cma.c | 9 ++- mm/hugetlb.c | 9 ++- mm/internal.h | 13 ++++ mm/page_alloc.c | 186 ++++++++++++++++++++++++++++++++++++++-------------- 5 files changed, 184 insertions(+), 85 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 627157972f6a..6ecf6dda93e0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -436,40 +436,30 @@ typedef unsigned int __bitwise acr_flags_t; #define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA /* The below functions must be run on a range from a single zone. */ -extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, - acr_flags_t alloc_flags, gfp_t gfp_mask); -#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) - -extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask); -#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) - +int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask); +#define alloc_contig_frozen_range(...) \ + alloc_hooks(alloc_contig_frozen_range_noprof(__VA_ARGS__)) + +int alloc_contig_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask); +#define alloc_contig_range(...) \ + alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) + +struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages, + gfp_t gfp_mask, int nid, nodemask_t *nodemask); +#define alloc_contig_frozen_pages(...) \ + alloc_hooks(alloc_contig_frozen_pages_noprof(__VA_ARGS__)) + +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask); +#define alloc_contig_pages(...) \ + alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) + +void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages); void free_contig_range(unsigned long pfn, unsigned long nr_pages); #endif -#ifdef CONFIG_CONTIG_ALLOC -static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, - int nid, nodemask_t *node) -{ - struct page *page; - - if (WARN_ON(!order || !(gfp & __GFP_COMP))) - return NULL; - - page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); - - return page ? page_folio(page) : NULL; -} -#else -static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, - int nid, nodemask_t *node) -{ - return NULL; -} -#endif -/* This should be paired with folio_put() rather than free_contig_range(). */ -#define folio_alloc_gigantic(...) 
alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) - DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) #endif /* __LINUX_GFP_H */ diff --git a/mm/cma.c b/mm/cma.c index fe3a9eaac4e5..0e8c146424fb 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -836,7 +836,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); mutex_lock(&cma->alloc_mutex); - ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); + ret = alloc_contig_frozen_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); if (!ret) break; @@ -904,6 +904,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, trace_cma_alloc_finish(name, page ? page_to_pfn(page) : 0, page, count, align, ret); if (page) { + set_pages_refcounted(page, count); count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { @@ -983,7 +984,11 @@ bool cma_release(struct cma *cma, const struct page *pages, return false; } - free_contig_range(pfn, count); + if (PageHead(pages)) + __free_pages((struct page *)pages, compound_order(pages)); + else + free_contig_range(pfn, count); + cma_clear_bitmap(cma, cmr, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 04385a0122de..762aeebf85d2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1428,12 +1428,17 @@ static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, retry: folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); if (!folio) { + struct page *page; + if (hugetlb_cma_exclusive_alloc()) return NULL; - folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); - if (!folio) + page = alloc_contig_frozen_pages(1 << order, gfp_mask, nid, nodemask); + if (!page) return NULL; + + set_page_refcounted(page); + folio = page_folio(page); } if (folio_ref_freeze(folio, 1)) diff --git a/mm/internal.h b/mm/internal.h index 5585059f0209..0623b865ad1a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -580,6 +580,19 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +static inline void set_pages_refcounted(struct page *page, unsigned long nr_pages) +{ + unsigned long pfn = page_to_pfn(page); + + if (PageHead(page)) { + set_page_refcounted(page); + return; + } + + for (; nr_pages--; pfn++) + set_page_refcounted(pfn_to_page(pfn)); +} + /* * Return true if a folio needs ->release_folio() calling upon it. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b99296eda5b..a0bb57c4e851 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6889,7 +6889,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } -static void split_free_pages(struct list_head *list, gfp_t gfp_mask) +static void split_free_frozen_pages(struct list_head *list, gfp_t gfp_mask) { int order; @@ -6901,11 +6901,10 @@ static void split_free_pages(struct list_head *list, gfp_t gfp_mask) int i; post_alloc_hook(page, order, gfp_mask); - set_page_refcounted(page); if (!order) continue; - split_page(page, order); + __split_page(page, order); /* Add all subpages to the order-0 head, in sequence. 
*/ list_del(&page->lru); @@ -6949,8 +6948,14 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) return 0; } +static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) +{ + for (; nr_pages--; pfn++) + free_frozen_pages(pfn_to_page(pfn), 0); +} + /** - * alloc_contig_range() -- tries to allocate given range of pages + * alloc_contig_frozen_range() -- tries to allocate given range of frozen pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate * @alloc_flags: allocation information @@ -6965,12 +6970,15 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) * pageblocks in the range. Once isolated, the pageblocks should not * be modified by others. * - * Return: zero on success or negative error code. On success all - * pages which PFN is in [start, end) are allocated for the caller and - * need to be freed with free_contig_range(). + * All frozen pages which PFN is in [start, end) are allocated for the + * caller, and they could be freed with free_contig_frozen_range(), + * free_frozen_pages() also could be used to free compound frozen pages + * directly. + * + * Return: zero on success or negative error code. */ -int alloc_contig_range_noprof(unsigned long start, unsigned long end, - acr_flags_t alloc_flags, gfp_t gfp_mask) +int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask) { const unsigned int order = ilog2(end - start); unsigned long outer_start, outer_end; @@ -7086,19 +7094,18 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, } if (!(gfp_mask & __GFP_COMP)) { - split_free_pages(cc.freepages, gfp_mask); + split_free_frozen_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ if (start != outer_start) - free_contig_range(outer_start, start - outer_start); + __free_contig_frozen_range(outer_start, start - outer_start); if (end != outer_end) - free_contig_range(end, outer_end - end); + __free_contig_frozen_range(end, outer_end - end); } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); - set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", @@ -7108,16 +7115,40 @@ done: undo_isolate_page_range(start, end); return ret; } -EXPORT_SYMBOL(alloc_contig_range_noprof); +EXPORT_SYMBOL(alloc_contig_frozen_range_noprof); -static int __alloc_contig_pages(unsigned long start_pfn, - unsigned long nr_pages, gfp_t gfp_mask) +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @alloc_flags: allocation information + * @gfp_mask: GFP mask. + * + * This routine is a wrapper around alloc_contig_frozen_range(), it can't + * be used to allocate compound pages, the refcount of each allocated page + * will be set to one. + * + * All pages which PFN is in [start, end) are allocated for the caller, + * and should be freed with free_contig_range() or by manually calling + * __free_page() on each allocated page. + * + * Return: zero on success or negative error code. 
+ */ +int alloc_contig_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask) { - unsigned long end_pfn = start_pfn + nr_pages; + int ret; - return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE, - gfp_mask); + if (WARN_ON(gfp_mask & __GFP_COMP)) + return -EINVAL; + + ret = alloc_contig_frozen_range_noprof(start, end, alloc_flags, gfp_mask); + if (!ret) + set_pages_refcounted(pfn_to_page(start), end - start); + + return ret; } +EXPORT_SYMBOL(alloc_contig_range_noprof); static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, unsigned long nr_pages, bool skip_hugetlb, @@ -7186,7 +7217,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, } /** - * alloc_contig_pages() -- tries to find and allocate contiguous range of pages + * alloc_contig_frozen_pages() -- tries to find and allocate contiguous range of frozen pages * @nr_pages: Number of contiguous pages to allocate * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some * action and reclaim modifiers are supported. Reclaim modifiers @@ -7194,22 +7225,25 @@ static bool zone_spans_last_pfn(const struct zone *zone, * @nid: Target node * @nodemask: Mask for other possible nodes * - * This routine is a wrapper around alloc_contig_range(). It scans over zones - * on an applicable zonelist to find a contiguous pfn range which can then be - * tried for allocation with alloc_contig_range(). This routine is intended - * for allocation requests which can not be fulfilled with the buddy allocator. + * This routine is a wrapper around alloc_contig_frozen_range(). It scans over + * zones on an applicable zonelist to find a contiguous pfn range which can then + * be tried for allocation with alloc_contig_frozen_range(). This routine is + * intended for allocation requests which can not be fulfilled with the buddy + * allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a * power of two, then allocated range is also guaranteed to be aligned to same * nr_pages (e.g. 1GB request would be aligned to 1GB). * - * Allocated pages can be freed with free_contig_range() or by manually calling - * __free_page() on each allocated page. + * Allocated frozen pages need be freed with free_contig_frozen_range(), + * or by manually calling free_frozen_pages() on each allocated frozen + * non-compound page, for compound frozen pages could be freed with + * free_frozen_pages() directly. * - * Return: pointer to contiguous pages on success, or NULL if not successful. + * Return: pointer to contiguous frozen pages on success, or NULL if not successful. */ -struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages, + gfp_t gfp_mask, int nid, nodemask_t *nodemask) { unsigned long ret, pfn, flags; struct zonelist *zonelist; @@ -7231,13 +7265,15 @@ retry: &skipped_hugetlb)) { /* * We release the zone lock here because - * alloc_contig_range() will also lock the zone - * at some point. If there's an allocation - * spinning on this lock, it may win the race - * and cause alloc_contig_range() to fail... + * alloc_contig_frozen_range() will also lock + * the zone at some point. If there's an + * allocation spinning on this lock, it may + * win the race and cause allocation to fail. 
*/ spin_unlock_irqrestore(&zone->lock, flags); - ret = __alloc_contig_pages(pfn, nr_pages, + ret = alloc_contig_frozen_range_noprof(pfn, + pfn + nr_pages, + ACR_FLAGS_NONE, gfp_mask); if (!ret) return pfn_to_page(pfn); @@ -7260,30 +7296,80 @@ retry: } return NULL; } +EXPORT_SYMBOL(alloc_contig_frozen_pages_noprof); -void free_contig_range(unsigned long pfn, unsigned long nr_pages) +/** + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages + * @nr_pages: Number of contiguous pages to allocate + * @gfp_mask: GFP mask. + * @nid: Target node + * @nodemask: Mask for other possible nodes + * + * This routine is a wrapper around alloc_contig_frozen_pages(), it can't + * be used to allocate compound pages, the refcount of each allocated page + * will be set to one. + * + * Allocated pages can be freed with free_contig_range() or by manually + * calling __free_page() on each allocated page. + * + * Return: pointer to contiguous pages on success, or NULL if not successful. + */ +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { - unsigned long count = 0; - struct folio *folio = pfn_folio(pfn); + struct page *page; - if (folio_test_large(folio)) { - int expected = folio_nr_pages(folio); + if (WARN_ON(gfp_mask & __GFP_COMP)) + return NULL; - if (nr_pages == expected) - folio_put(folio); - else - WARN(true, "PFN %lu: nr_pages %lu != expected %d\n", - pfn, nr_pages, expected); + page = alloc_contig_frozen_pages_noprof(nr_pages, gfp_mask, nid, + nodemask); + if (page) + set_pages_refcounted(page, nr_pages); + + return page; +} +EXPORT_SYMBOL(alloc_contig_pages_noprof); + +/** + * free_contig_frozen_range() -- free the contiguous range of frozen pages + * @pfn: start PFN to free + * @nr_pages: Number of contiguous frozen pages to free + * + * This can be used to free the allocated compound/non-compound frozen pages. + */ +void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) +{ + struct page *first_page = pfn_to_page(pfn); + const unsigned int order = ilog2(nr_pages); + + if (WARN_ON_ONCE(first_page != compound_head(first_page))) + return; + + if (PageHead(first_page)) { + WARN_ON_ONCE(order != compound_order(first_page)); + free_frozen_pages(first_page, order); return; } - for (; nr_pages--; pfn++) { - struct page *page = pfn_to_page(pfn); + __free_contig_frozen_range(pfn, nr_pages); +} +EXPORT_SYMBOL(free_contig_frozen_range); + +/** + * free_contig_range() -- free the contiguous range of pages + * @pfn: start PFN to free + * @nr_pages: Number of contiguous pages to free + * + * This can be only used to free the allocated non-compound pages. 
+ */ +void free_contig_range(unsigned long pfn, unsigned long nr_pages) +{ + if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn)))) + return; - count += page_count(page) != 1; - __free_page(page); - } - WARN(count != 0, "%lu pages are still in use!\n", count); + for (; nr_pages--; pfn++) + __free_page(pfn_to_page(pfn)); } EXPORT_SYMBOL(free_contig_range); #endif /* CONFIG_CONTIG_ALLOC */ -- cgit v1.2.3 From 9bda131c6093e9c4a8739e2eeb65ba4d5fbefc2f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:35 +0800 Subject: mm: cma: add cma_alloc_frozen{_compound}() Introduce cma_alloc_frozen{_compound}() helper to alloc pages without incrementing their refcount, then convert hugetlb cma to use the cma_alloc_frozen_compound() and cma_release_frozen() and remove the unused cma_{alloc,free}_folio(), also move the cma_validate_zones() into mm/internal.h since no outside user. The set_pages_refcounted() is only called to set non-compound pages after above changes, so remove the processing about PageHead. Link: https://lkml.kernel.org/r/20260109093136.1491549-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- include/linux/cma.h | 26 +++---------- mm/cma.c | 107 +++++++++++++++++++++++++++++++++++----------------- mm/hugetlb_cma.c | 24 +++++++----- mm/internal.h | 10 ++--- 4 files changed, 97 insertions(+), 70 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index e5745d2aec55..e2a690f7e77e 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -51,29 +51,15 @@ extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int bool no_warn); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); +struct page *cma_alloc_frozen(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn); +struct page *cma_alloc_frozen_compound(struct cma *cma, unsigned int order); +bool cma_release_frozen(struct cma *cma, const struct page *pages, + unsigned long count); + extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern void cma_reserve_pages_on_error(struct cma *cma); -#ifdef CONFIG_CMA -struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); -bool cma_free_folio(struct cma *cma, const struct folio *folio); -bool cma_validate_zones(struct cma *cma); -#else -static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) -{ - return NULL; -} - -static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) -{ - return false; -} -static inline bool cma_validate_zones(struct cma *cma) -{ - return false; -} -#endif - #endif diff --git a/mm/cma.c b/mm/cma.c index 0e8c146424fb..b80b60ed4927 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -856,8 +856,8 @@ out: return ret; } -static struct page *__cma_alloc(struct cma *cma, unsigned long count, - unsigned int align, gfp_t gfp) +static struct page *__cma_alloc_frozen(struct cma *cma, + unsigned long count, unsigned int align, gfp_t gfp) { struct page *page = NULL; int ret = -ENOMEM, r; @@ -904,7 +904,6 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, trace_cma_alloc_finish(name, page ? 
page_to_pfn(page) : 0, page, count, align, ret); if (page) { - set_pages_refcounted(page, count); count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { @@ -915,6 +914,21 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, return page; } +struct page *cma_alloc_frozen(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) +{ + gfp_t gfp = GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0); + + return __cma_alloc_frozen(cma, count, align, gfp); +} + +struct page *cma_alloc_frozen_compound(struct cma *cma, unsigned int order) +{ + gfp_t gfp = GFP_KERNEL | __GFP_COMP | __GFP_NOWARN; + + return __cma_alloc_frozen(cma, 1 << order, order, gfp); +} + /** * cma_alloc() - allocate pages from contiguous area * @cma: Contiguous memory region for which the allocation is performed. @@ -927,43 +941,27 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, */ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn) -{ - return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); -} - -struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { struct page *page; - if (WARN_ON(!order || !(gfp & __GFP_COMP))) - return NULL; - - page = __cma_alloc(cma, 1 << order, order, gfp); + page = cma_alloc_frozen(cma, count, align, no_warn); + if (page) + set_pages_refcounted(page, count); - return page ? page_folio(page) : NULL; + return page; } -/** - * cma_release() - release allocated pages - * @cma: Contiguous memory region for which the allocation is performed. - * @pages: Allocated pages. - * @count: Number of allocated pages. - * - * This function releases memory allocated by cma_alloc(). - * It returns false when provided pages do not belong to contiguous area and - * true otherwise. - */ -bool cma_release(struct cma *cma, const struct page *pages, - unsigned long count) +static struct cma_memrange *find_cma_memrange(struct cma *cma, + const struct page *pages, unsigned long count) { - struct cma_memrange *cmr; + struct cma_memrange *cmr = NULL; unsigned long pfn, end_pfn; int r; pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); if (!cma || !pages || count > cma->count) - return false; + return NULL; pfn = page_to_pfn(pages); @@ -981,27 +979,66 @@ bool cma_release(struct cma *cma, const struct page *pages, if (r == cma->nranges) { pr_debug("%s(page %p, count %lu, no cma range matches the page range)\n", __func__, (void *)pages, count); - return false; + return NULL; } - if (PageHead(pages)) - __free_pages((struct page *)pages, compound_order(pages)); - else - free_contig_range(pfn, count); + return cmr; +} + +static void __cma_release_frozen(struct cma *cma, struct cma_memrange *cmr, + const struct page *pages, unsigned long count) +{ + unsigned long pfn = page_to_pfn(pages); + + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); + free_contig_frozen_range(pfn, count); cma_clear_bitmap(cma, cmr, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); +} + +/** + * cma_release() - release allocated pages + * @cma: Contiguous memory region for which the allocation is performed. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by cma_alloc(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. 
+ */ +bool cma_release(struct cma *cma, const struct page *pages, + unsigned long count) +{ + struct cma_memrange *cmr; + unsigned long i, pfn; + + cmr = find_cma_memrange(cma, pages, count); + if (!cmr) + return false; + + pfn = page_to_pfn(pages); + for (i = 0; i < count; i++, pfn++) + VM_WARN_ON(!put_page_testzero(pfn_to_page(pfn))); + + __cma_release_frozen(cma, cmr, pages, count); return true; } -bool cma_free_folio(struct cma *cma, const struct folio *folio) +bool cma_release_frozen(struct cma *cma, const struct page *pages, + unsigned long count) { - if (WARN_ON(!folio_test_large(folio))) + struct cma_memrange *cmr; + + cmr = find_cma_memrange(cma, pages, count); + if (!cmr) return false; - return cma_release(cma, &folio->page, folio_nr_pages(folio)); + __cma_release_frozen(cma, cmr, pages, count); + + return true; } int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index f5e79103e110..58ceb6c9e410 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -20,35 +20,39 @@ static unsigned long hugetlb_cma_size __initdata; void hugetlb_cma_free_folio(struct folio *folio) { - int nid = folio_nid(folio); + folio_ref_dec(folio); - WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio)); + WARN_ON_ONCE(!cma_release_frozen(hugetlb_cma[folio_nid(folio)], + &folio->page, folio_nr_pages(folio))); } - struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { int node; - struct folio *folio = NULL; + struct folio *folio; + struct page *page = NULL; if (hugetlb_cma[nid]) - folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); + page = cma_alloc_frozen_compound(hugetlb_cma[nid], order); - if (!folio && !(gfp_mask & __GFP_THISNODE)) { + if (!page && !(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; - folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); - if (folio) + page = cma_alloc_frozen_compound(hugetlb_cma[node], order); + if (page) break; } } - if (folio) - folio_set_hugetlb_cma(folio); + if (!page) + return NULL; + set_page_refcounted(page); + folio = page_folio(page); + folio_set_hugetlb_cma(folio); return folio; } diff --git a/mm/internal.h b/mm/internal.h index 0623b865ad1a..27509a909915 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -584,11 +584,6 @@ static inline void set_pages_refcounted(struct page *page, unsigned long nr_page { unsigned long pfn = page_to_pfn(page); - if (PageHead(page)) { - set_page_refcounted(page); - return; - } - for (; nr_pages--; pfn++) set_page_refcounted(pfn_to_page(pfn)); } @@ -1014,9 +1009,14 @@ void init_cma_reserved_pageblock(struct page *page); struct cma; #ifdef CONFIG_CMA +bool cma_validate_zones(struct cma *cma); void *cma_reserve_early(struct cma *cma, unsigned long size); void init_cma_pageblock(struct page *page); #else +static inline bool cma_validate_zones(struct cma *cma) +{ + return false; +} static inline void *cma_reserve_early(struct cma *cma, unsigned long size) { return NULL; -- cgit v1.2.3 From 14f270761d3374db24c84630f2aa7a3c732fed4a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Jan 2026 17:31:36 +0800 Subject: mm: hugetlb: allocate frozen pages for gigantic allocation alloc_gigantic_folio() allocates a folio with refcount increated and then freeze it, convert to allocate a frozen folio to remove the atomic operation about folio refcount, and saving atomic operation during __update_and_free_hugetlb_folio() too. 
Besides, rename hugetlb_cma_{alloc,free}_folio(), alloc_gigantic_folio() and alloc_buddy_hugetlb_folio() with frozen which make them more self-explanatory. Link: https://lkml.kernel.org/r/20260109093136.1491549-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Reviewed-by: Muchun Song Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Claudiu Beznea Cc: Mark Brown Signed-off-by: Andrew Morton --- mm/hugetlb.c | 75 +++++++++++++++----------------------------------------- mm/hugetlb_cma.c | 9 +++---- mm/hugetlb_cma.h | 10 ++++---- 3 files changed, 28 insertions(+), 66 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 762aeebf85d2..8c197307db0c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -121,16 +121,6 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); -static void hugetlb_free_folio(struct folio *folio) -{ - if (folio_test_hugetlb_cma(folio)) { - hugetlb_cma_free_folio(folio); - return; - } - - folio_put(folio); -} - static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -1417,52 +1407,25 @@ err: return NULL; } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#ifdef CONFIG_CONTIG_ALLOC -static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, +#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && defined(CONFIG_CONTIG_ALLOC) +static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; - bool retried = false; -retry: - folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); - if (!folio) { - struct page *page; - - if (hugetlb_cma_exclusive_alloc()) - return NULL; - - page = alloc_contig_frozen_pages(1 << order, gfp_mask, nid, nodemask); - if (!page) - return NULL; - - set_page_refcounted(page); - folio = page_folio(page); - } - - if (folio_ref_freeze(folio, 1)) + folio = hugetlb_cma_alloc_frozen_folio(order, gfp_mask, nid, nodemask); + if (folio) return folio; - pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); - hugetlb_free_folio(folio); - if (!retried) { - retried = true; - goto retry; - } - return NULL; -} + if (hugetlb_cma_exclusive_alloc()) + return NULL; -#else /* !CONFIG_CONTIG_ALLOC */ -static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, - nodemask_t *nodemask) -{ - return NULL; + folio = (struct folio *)alloc_contig_frozen_pages(1 << order, gfp_mask, + nid, nodemask); + return folio; } -#endif /* CONFIG_CONTIG_ALLOC */ - -#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE || !CONFIG_CONTIG_ALLOC */ +static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; @@ -1592,9 +1555,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, if (unlikely(folio_test_hwpoison(folio))) folio_clear_hugetlb_hwpoison(folio); - folio_ref_unfreeze(folio, 1); - - hugetlb_free_folio(folio); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + if (folio_test_hugetlb_cma(folio)) + hugetlb_cma_free_frozen_folio(folio); + else + free_frozen_pages(&folio->page, folio_order(folio)); } /* @@ -1874,7 +1839,7 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) 
return NULL; } -static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask, +static struct folio *alloc_buddy_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; @@ -1930,10 +1895,10 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, nid = numa_mem_id(); if (order_is_gigantic(order)) - folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask); + folio = alloc_gigantic_frozen_folio(order, gfp_mask, nid, nmask); else - folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask, - node_alloc_noretry); + folio = alloc_buddy_frozen_folio(order, gfp_mask, nid, nmask, + node_alloc_noretry); if (folio) init_new_hugetlb_folio(folio); return folio; diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index 58ceb6c9e410..0ddf9755c090 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -18,16 +18,14 @@ static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; static bool hugetlb_cma_only; static unsigned long hugetlb_cma_size __initdata; -void hugetlb_cma_free_folio(struct folio *folio) +void hugetlb_cma_free_frozen_folio(struct folio *folio) { - folio_ref_dec(folio); - WARN_ON_ONCE(!cma_release_frozen(hugetlb_cma[folio_nid(folio)], &folio->page, folio_nr_pages(folio))); } -struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +struct folio *hugetlb_cma_alloc_frozen_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { int node; struct folio *folio; @@ -50,7 +48,6 @@ struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, if (!page) return NULL; - set_page_refcounted(page); folio = page_folio(page); folio_set_hugetlb_cma(folio); return folio; diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h index 78186839df3a..c619c394b1ae 100644 --- a/mm/hugetlb_cma.h +++ b/mm/hugetlb_cma.h @@ -3,8 +3,8 @@ #define _LINUX_HUGETLB_CMA_H #ifdef CONFIG_CMA -void hugetlb_cma_free_folio(struct folio *folio); -struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, +void hugetlb_cma_free_frozen_folio(struct folio *folio); +struct folio *hugetlb_cma_alloc_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask); struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact); @@ -13,12 +13,12 @@ unsigned long hugetlb_cma_total_size(void); void hugetlb_cma_validate_params(void); bool hugetlb_early_cma(struct hstate *h); #else -static inline void hugetlb_cma_free_folio(struct folio *folio) +static inline void hugetlb_cma_free_frozen_folio(struct folio *folio) { } -static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static inline struct folio *hugetlb_cma_alloc_frozen_folio(int order, + gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } -- cgit v1.2.3 From d60769075013ec7933a715b5b1ac37eb77c33420 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 7 Jan 2026 15:16:42 +0000 Subject: vmalloc: export vrealloc_node_align_noprof MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This symbol is used from the Nova driver, so it needs to be exported to avoid a build failure when building Nova as a module. ERROR: modpost: "vrealloc_node_align_noprof" [drivers/gpu/nova-core/nova_core.ko] undefined! ERROR: modpost: "vrealloc_node_align_noprof" [samples/rust/rust_dma.ko] undefined! This error is only triggered if helpers are inlined into Rust. 
Otherwise, Nova will call the exported symbol rust_helper_vrealloc_node_align() instead. There is no Fixes: tag as that feature is still WIP. I used non-GPL EXPORT_SYMBOL to match the rest of the file, but let me know if I should use EXPORT_SYMBOL_GPL. Link: https://lkml.kernel.org/r/20260107-export-vrealloc_node_align_noprof-v1-1-a581bec13054@google.com Signed-off-by: Alice Ryhl Reviewed-by: Danilo Krummrich Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Gary Guo Cc: Miguel Ojeda Cc: Trevor Gross Cc: Uladzislau Rezki Signed-off-by: Andrew Morton --- mm/vmalloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ca4c65328687..316762e6c9b4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4370,6 +4370,7 @@ need_realloc: return n; } +EXPORT_SYMBOL(vrealloc_node_align_noprof); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) -- cgit v1.2.3 From b19cb086043d30d3e74617f9971f68e7fd233c64 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 13 Jan 2026 20:15:16 +0100 Subject: mm/kasan/kunit: extend vmalloc OOB tests to cover vrealloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the vmalloc_oob() test to validate OOB detection after resizing vmalloc allocations with vrealloc(). The test now verifies that KASAN correctly poisons and unpoisons vmalloc memory when allocations are shrunk and expanded, ensuring OOB accesses are reliably detected after each resize. [ryabinin.a.a@gmail.com: adjust vrealloc() size] Link: https://lkml.kernel.org/r/20260116132822.22227-1-ryabinin.a.a@gmail.com Link: https://lkml.kernel.org/r/20260113191516.31015-2-ryabinin.a.a@gmail.com Signed-off-by: Andrey Ryabinin Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitriy Vyukov Cc: Maciej Żenczykowski Cc: Uladzislau Rezki Cc: Vincenzo Frascino Cc: Maciej Wieczor-Retman Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 50 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 2cafca31b092..b4d157962121 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1840,6 +1840,29 @@ static void vmalloc_helpers_tags(struct kunit *test) vfree(ptr); } +static void vmalloc_oob_helper(struct kunit *test, char *v_ptr, size_t size) +{ + /* + * We have to be careful not to hit the guard page in vmalloc tests. + * The MMU will catch that and crash us. + */ + + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. */ + size = round_up(size, KASAN_GRANULE_SIZE); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size]); +} + static void vmalloc_oob(struct kunit *test) { char *v_ptr, *p_ptr; @@ -1856,24 +1879,21 @@ static void vmalloc_oob(struct kunit *test) OPTIMIZER_HIDE_VAR(v_ptr); - /* - * We have to be careful not to hit the guard page in vmalloc tests. - * The MMU will catch that and crash us. - */ + vmalloc_oob_helper(test, v_ptr, size); - /* Make sure in-bounds accesses are valid. 
*/ - v_ptr[0] = 0; - v_ptr[size - 1] = 0; + size -= KASAN_GRANULE_SIZE + 1; + v_ptr = vrealloc(v_ptr, size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); - /* - * An unaligned access past the requested vmalloc size. - * Only generic KASAN can precisely detect these. - */ - if (IS_ENABLED(CONFIG_KASAN_GENERIC)) - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + OPTIMIZER_HIDE_VAR(v_ptr); - /* An aligned access into the first out-of-bounds granule. */ - KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size + 5]); + vmalloc_oob_helper(test, v_ptr, size); + + size += 2 * KASAN_GRANULE_SIZE + 2; + v_ptr = vrealloc(v_ptr, size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + vmalloc_oob_helper(test, v_ptr, size); /* Check that in-bounds accesses to the physical page are valid. */ page = vmalloc_to_page(v_ptr); -- cgit v1.2.3 From 4835e2871321fd9cf5bc9702dded323e3e3fbc1a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:06 -0800 Subject: mm/damon/core: introduce [in]active memory ratio damos quota goal metric Patch series "mm/damon: advance DAMOS-based LRU sorting". DAMOS_LRU_[DE]PRIO actions were added to DAMOS for more access-aware LRU lists sorting. For simple usage, a specialized kernel module, namely DAMON_LRU_SORT, has also been introduced. After the introduction of the module, DAMON got a few important new features, including the aim-based quota auto-tuning, age tracking, young page filter, and monitoring intervals auto-tuning. Meanwhile, DAMOS-based LRU sorting had no direct updates. Now we show some rooms to advance for DAMOS-based LRU sorting. Firstly, the aim-oriented quota auto-tuning can simplify the LRU sorting parameters tuning. But there is no good auto-tuning target metric for LRU sorting use case. Secondly, the behavior of DAMOS_LRU_[DE]PRIO are not very symmetric. DAMOS_LRU_DEPRIO directly moves the pages to inactive LRU list, while DAMOS_LRU_PRIO only marks the page as accessed, so that the page can not directly but only eventually moved to the active LRU list. Finally, DAMON_LRU_SORT users cannot utilize the modern features that can be useful for them, too. Improve the situation with the following changes. First, introduce a new DAMOS quota auto-tuning target metric for active:inactive memory size ratio. Since LRU sorting is a kind of balancing of active and inactive pages, the active:inactive memory size ratio can be intuitively set. Second, update DAMOS_LRU_[DE]PRIO behaviors to be more intuitive and symmetric, by letting them directly move the pages to [in]active LRU list. Third, update the DAMON_LRU_SORT module user interface to be able to fully utilize the modern features including the [in]active memory size ratio-based quota auto-tuning, young page filter, and monitoring intervals auto-tuning. With these changes, for example, users can now ask DAMON to "find hot/cold memory regions with auto-tuned monitoring intervals, do one more page level access check for found hot/cold memory, and move pages of those to active or inactive LRU lists accordingly, aiming X:Y active to inactive memory ratio." For example, if they know 30% of the memory is better to be protected from reclamation, 30:70 can be set as the target ratio. Test Results ------------ I ran DAMON_LRU_SORT with the features introduced by this series, on a real world server workload. For the active:inactive ratio goal, I set 50:50. 
I confirmed it achieves the target active:inactive ratio, without manual tuning of the monitoring intervals and the hot/coldness thresholds. The baseline system that was not running the DAMON_LRU_SORT was keeping active:inactive ratio of about 1:10. Note that the test didn't show a clear performance difference, though. I believe that was mainly because the workload was not very memory intensive. Also, whether the 50:50 target ratio was optimum is unclear. Nonetheless, the positive performance impact of the basic LRU sorting idea is already confirmed with the initial DAMON_LRU_SORT introduction patch series. The goal of this patch series is simplifying the parameters tuning of DAMOS-based LRU sorting, and the test confirmed the aimed goals are achieved. Patches Sequence ---------------- First three patches extend DAMOS quota auto-tuning to support [in]active memory ratio target metric type. Those (patches 1-3) introduce new metrics, implement DAMON sysfs support, and update the documentation, respectively. Following patch (patch 4) makes DAMOS_LRU_PRIO action to directly move target pages to active LRU list, instead of only marking them accessed. Following seven patches (patches 5-11) updates DAMON_LRU_SORT to support modern DAMON features. Patch 5 makes it uses not only access frequency but also age at under-quota regions prioritization. Patches 6-11 add the support for young page filtering, active:inactive memory ratio based quota auto-tuning, and monitoring intervals auto-tuning, with appropriate document updates. This patch (of 11): DAMOS_LRU_[DE]PRIO are DAMOS actions for making balance of active and inactive memory size. There is no appropriate DAMOS quota auto-tuning target metric for the use case. Add two new DAMOS quota goal metrics for the purpose, namely DAMOS_QUOTA_[IN]ACTIVE_MEM_BP. Those will represent the ratio of [in]active memory to total (inactive + active) memory. Hence, users will be able to ask DAMON to, for example, "find hot and cold memory, and move pages of those to active and inactive LRU lists, adjusting the hot/cold thresholds aiming 50:50 active:inactive memory ratio." Link: https://lkml.kernel.org/r/20260113152717.70459-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260113152717.70459-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++++ mm/damon/core.c | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 650e7ecfa32b..26fb8e90dff6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -155,6 +155,8 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. + * @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio. + * @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. 
@@ -166,6 +168,8 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEM_FREE_BP, DAMOS_QUOTA_NODE_MEMCG_USED_BP, DAMOS_QUOTA_NODE_MEMCG_FREE_BP, + DAMOS_QUOTA_ACTIVE_MEM_BP, + DAMOS_QUOTA_INACTIVE_MEM_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6888917c1a00..729a5f7fac94 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2132,6 +2132,23 @@ static unsigned long damos_get_node_memcg_used_bp( } #endif +/* + * Returns LRU-active or inactive memory to total LRU memory size ratio. + */ +static unsigned int damos_get_in_active_mem_bp(bool active_ratio) +{ + unsigned long active, inactive, total; + + /* This should align with /proc/meminfo output */ + active = global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON) + + global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + inactive = global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON) + + global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + total = active + inactive; + if (active_ratio) + return active * 10000 / total; + return inactive * 10000 / total; +} static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) { @@ -2154,6 +2171,11 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: goal->current_value = damos_get_node_memcg_used_bp(goal); break; + case DAMOS_QUOTA_ACTIVE_MEM_BP: + case DAMOS_QUOTA_INACTIVE_MEM_BP: + goal->current_value = damos_get_in_active_mem_bp( + goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP); + break; default: break; } -- cgit v1.2.3 From fbec8a1e4fa4daf2611c9a3e3b29d03a73acbd0c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:07 -0800 Subject: mm/damon/sysfs-schemes: support DAMOS_QUOTA_[IN]ACTIVE_MEM_BP Add support of DAMOS_QUOTA_[IN]ACTIVE_MEM_BP on DAMON sysfs interface. Users can use [in]active_mem_bp keyword input to the target_metric sysfs file to use the new DAMOS quota auto-tune target metrics. Link: https://lkml.kernel.org/r/20260113152717.70459-3-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 419d6e7ee945..2b05a6477188 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1079,6 +1079,14 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { .metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP, .name = "node_memcg_free_bp", }, + { + .metric = DAMOS_QUOTA_ACTIVE_MEM_BP, + .name = "active_mem_bp", + }, + { + .metric = DAMOS_QUOTA_INACTIVE_MEM_BP, + .name = "inactive_mem_bp", + }, }; static ssize_t target_metric_show(struct kobject *kobj, -- cgit v1.2.3 From 5022134c1b497ab33b5cbd4dc84ef32906b51759 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:08 -0800 Subject: Docs/mm/damon/design: document DAMOS_QUOTA_[IN]ACTIVE_MEM_BP Update design document for newly added DAMOS_QUOTA_[IN]ACTIVE_MEM_BP metrics. Note that API document is automatically updated by kernel-doc comment, and the usage document points to the design document which uses keywords same to that for sysfs inputs. Hence updating only design document is sufficient. 
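To make the new keywords concrete, here is a hedged, illustrative sketch (not part of this patch) of how a kernel-side DAMOS user could attach an active_mem_bp goal to a scheme's quota. It only uses damos_new_quota_goal() and damos_add_quota_goal(), the same calls the DAMON_LRU_SORT changes later in this series use; the function name, the 'scheme' argument and the 3000 bp (30%) target value are assumptions for illustration.

#include <linux/damon.h>

/*
 * Illustrative sketch only: aim the quota auto-tuning at keeping 30% of
 * (active + inactive) LRU memory on the active list.  'scheme' is assumed
 * to be an already-constructed DAMOS_LRU_PRIO scheme.
 */
static int set_active_mem_goal(struct damos *scheme)
{
	struct damos_quota_goal *goal;

	/* 3000 bp == 30% of total LRU memory, i.e. 30/100 * 10,000 */
	goal = damos_new_quota_goal(DAMOS_QUOTA_ACTIVE_MEM_BP, 3000);
	if (!goal)
		return -ENOMEM;
	damos_add_quota_goal(&scheme->quota, goal);
	return 0;
}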
Link: https://lkml.kernel.org/r/20260113152717.70459-4-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 7fd819b8bbf7..0cfd4c25e92d 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -585,6 +585,10 @@ mechanism tries to make ``current_value`` of ``target_metric`` be same to specific NUMA node, in bp (1/10,000). - ``node_memcg_free_bp``: Specific cgroup's node unused memory ratio for a specific NUMA node, in bp (1/10,000). +- ``active_mem_bp``: Active to active + inactive (LRU) memory size ratio in bp + (1/10,000). +- ``inactive_mem_bp``: Inactive to active + inactive (LRU) memory size ratio in + bp (1/10,000). ``nid`` is optionally required for only ``node_mem_used_bp``, ``node_mem_free_bp``, ``node_memcg_used_bp`` and ``node_memcg_free_bp`` to -- cgit v1.2.3 From 80820e69fd1b92288dceeffc0a337883abb5096a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:09 -0800 Subject: mm/damon/paddr: activate DAMOS_LRU_PRIO targets instead of marking accessed DAMOS_LRU_DEPRIOD directly deactivates the pages, while DAMOS_LRU_PRIO calls folio_mark_accessed(), which does incremental activation. The incremental activation was assumed to be useful for making sure the pages of the hot memory region are really hot. After the introduction of DAMOS_LRU_PRIO, the young page filter has added. Users can use the young page filter to make sure the page is eligible to be activated. Meanwhile, the asymmetric behavior of DAMOS_LRU_[DE]PRIO can confuse users. Directly activate given pages for DAMOS_LRU_PRIO, to eliminate the unnecessary incremental activation steps, and be symmetric with DAMOS_LRU_DEPRIO for easier usages. 
Link: https://lkml.kernel.org/r/20260113152717.70459-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 7d887a3c0866..4c2c935d82d6 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -206,9 +206,9 @@ put_folio: return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } -static inline unsigned long damon_pa_mark_accessed_or_deactivate( +static inline unsigned long damon_pa_de_activate( struct damon_region *r, unsigned long addr_unit, - struct damos *s, bool mark_accessed, + struct damos *s, bool activate, unsigned long *sz_filter_passed) { phys_addr_t addr, applied = 0; @@ -227,8 +227,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( else *sz_filter_passed += folio_size(folio) / addr_unit; - if (mark_accessed) - folio_mark_accessed(folio); + if (activate) + folio_activate(folio); else folio_deactivate(folio); applied += folio_nr_pages(folio); @@ -240,20 +240,18 @@ put_folio: return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } -static unsigned long damon_pa_mark_accessed(struct damon_region *r, +static unsigned long damon_pa_activate_pages(struct damon_region *r, unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true, - sz_filter_passed); + return damon_pa_de_activate(r, addr_unit, s, true, sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false, - sz_filter_passed); + return damon_pa_de_activate(r, addr_unit, s, false, sz_filter_passed); } static unsigned long damon_pa_migrate(struct damon_region *r, @@ -327,7 +325,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, case DAMOS_PAGEOUT: return damon_pa_pageout(r, aunit, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, aunit, scheme, + return damon_pa_activate_pages(r, aunit, scheme, sz_filter_passed); case DAMOS_LRU_DEPRIO: return damon_pa_deactivate_pages(r, aunit, scheme, -- cgit v1.2.3 From 57d96d1ad2ccb29f0b8d67acd79e2f978bf165c4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:10 -0800 Subject: mm/damon/lru_sort: consider age for quota prioritization DAMON_LRU_SORT is doing under-quota access pattern based regions prioritization using only access frequency. Age of regions is another useful information for distinguishing hot and cold regions. Use it for prioritization, too. Link: https://lkml.kernel.org/r/20260113152717.70459-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 9388b091deb7..a74c4ec170a9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -71,7 +71,7 @@ static struct damos_quota damon_lru_sort_quota = { /* Within the quota, mark hotter regions accessed first. 
*/ .weight_sz = 0, .weight_nr_accesses = 1, - .weight_age = 0, + .weight_age = 1, }; DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); -- cgit v1.2.3 From 303dbb1f08cfe844d095fe008bd4d04e89d447f1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:11 -0800 Subject: mm/damon/lru_sort: support young page filters DAMON monitors access patterns at the region level, and hence there could be some page level mismatches. A few hot pages could be located in cold regions, and vice versa. Young page filters can be useful for doing additional page level access checks before applying some DAMOS action. DAMON_LRU_SORT is not using young page filters, though. Add a parameter for using it. Link: https://lkml.kernel.org/r/20260113152717.70459-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a74c4ec170a9..f1fdb37b9b47 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -41,6 +41,21 @@ static bool enabled __read_mostly; static bool commit_inputs __read_mostly; module_param(commit_inputs, bool, 0600); +/* + * Filter [non-]young pages accordingly for LRU [de]prioritizations. + * + * If this is set, check page level access (youngness) once again before each + * LRU [de]prioritization operation. LRU prioritization operation is skipped + * if the page has not accessed since the last check (not young). LRU + * deprioritization operation is skipped if the page has accessed since the + * last check (young). The feature is enabled or disabled if this parameter is + * set as ``Y`` or ``N``, respectively. + * + * Disabled by default. + */ +static bool filter_young_pages __read_mostly; +module_param(filter_young_pages, bool, 0600); + /* * Access frequency threshold for hot memory regions identification in permil. * @@ -193,6 +208,28 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } +static int damon_lru_sort_add_filters(struct damos *hot_scheme, + struct damos *cold_scheme) +{ + struct damos_filter *filter; + + if (!filter_young_pages) + return 0; + + /* disallow prioritizing not-young pages */ + filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, false, false); + if (!filter) + return -ENOMEM; + damos_add_filter(hot_scheme, filter); + + /* disabllow de-prioritizing young pages */ + filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true, false); + if (!filter) + return -ENOMEM; + damos_add_filter(cold_scheme, filter); + return 0; +} + static int damon_lru_sort_apply_parameters(void) { struct damon_ctx *param_ctx; @@ -240,6 +277,10 @@ static int damon_lru_sort_apply_parameters(void) damon_set_schemes(param_ctx, &hot_scheme, 1); damon_add_scheme(param_ctx, cold_scheme); + err = damon_lru_sort_add_filters(hot_scheme, cold_scheme); + if (err) + goto out; + err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, -- cgit v1.2.3 From b36aefb866a12e2fbdc76f3cf0be4025b85dcb2c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:12 -0800 Subject: Docs/admin-guide/mm/damon/lru_sort: document filter_young_pages Document the new DAMON_LRU_SORT parameter, filter_young_pages. 
It can be used to use page level access re-check for the LRU sorting. Link: https://lkml.kernel.org/r/20260113152717.70459-8-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/lru_sort.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index 72a943202676..bb222a32aefd 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -79,6 +79,20 @@ of parametrs except ``enabled`` again. Once the re-reading is done, this parameter is set as ``N``. If invalid parameters are found while the re-reading, DAMON_LRU_SORT will be disabled. +filter_young_pages +------------------ + +Filter [non-]young pages accordingly for LRU [de]prioritizations. + +If this is set, check page level access (youngness) once again before each +LRU [de]prioritization operation. LRU prioritization operation is skipped +if the page has not accessed since the last check (not young). LRU +deprioritization operation is skipped if the page has accessed since the +last check (young). The feature is enabled or disabled if this parameter is +set as ``Y`` or ``N``, respectively. + +Disabled by default. + hot_thres_access_freq --------------------- -- cgit v1.2.3 From 40d98d31cd7060228e03303c5c34ae7101020416 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:13 -0800 Subject: mm/damon/lru_sort: support active:inactive memory ratio based auto-tuning Doing DAMOS_LRU_[DE]PRIO with DAMOS_QUOTA_[IN]ACTIVE_MEM_BP based quota auto-tuning can be easy and intuitive, compared to the manual [de]prioritization target access pattern thresholds tuning. For example, users can ask DAMON to "find hot/cold pages and activate/deactivate those aiming 50:50 active:inactive memory size." But DAMON_LRU_SORT has no interface to do that. Add a module parameter for setting the target ratio. [sj@kernel.org: add inactive mem ratio quota goal to cold_scheme] Link: https://lkml.kernel.org/r/20260114055308.79884-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260113152717.70459-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index f1fdb37b9b47..8af97642912a 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -41,6 +41,20 @@ static bool enabled __read_mostly; static bool commit_inputs __read_mostly; module_param(commit_inputs, bool, 0600); +/* + * Desired active to [in]active memory ratio in bp (1/10,000). + * + * While keeping the caps that set by other quotas, DAMON_LRU_SORT + * automatically increases and decreases the effective level of the quota + * aiming the LRU [de]prioritizations of the hot and cold memory resulting in + * this active to [in]active memory ratio. Value zero means disabling this + * auto-tuning feature. + * + * Disabled by default. 
+ */ +static unsigned long active_mem_bp __read_mostly; +module_param(active_mem_bp, ulong, 0600); + /* * Filter [non-]young pages accordingly for LRU [de]prioritizations. * @@ -208,6 +222,26 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } +static int damon_lru_sort_add_quota_goals(struct damos *hot_scheme, + struct damos *cold_scheme) +{ + struct damos_quota_goal *goal; + + if (!active_mem_bp) + return 0; + goal = damos_new_quota_goal(DAMOS_QUOTA_ACTIVE_MEM_BP, active_mem_bp); + if (!goal) + return -ENOMEM; + damos_add_quota_goal(&hot_scheme->quota, goal); + /* aim 0.2 % goal conflict, to keep little ping pong */ + goal = damos_new_quota_goal(DAMOS_QUOTA_INACTIVE_MEM_BP, + 10000 - active_mem_bp + 2); + if (!goal) + return -ENOMEM; + damos_add_quota_goal(&cold_scheme->quota, goal); + return 0; +} + static int damon_lru_sort_add_filters(struct damos *hot_scheme, struct damos *cold_scheme) { @@ -277,6 +311,9 @@ static int damon_lru_sort_apply_parameters(void) damon_set_schemes(param_ctx, &hot_scheme, 1); damon_add_scheme(param_ctx, cold_scheme); + err = damon_lru_sort_add_quota_goals(hot_scheme, cold_scheme); + if (err) + goto out; err = damon_lru_sort_add_filters(hot_scheme, cold_scheme); if (err) goto out; -- cgit v1.2.3 From cdfca22d15ca5f0f6b3ff33a23e1672dccc74eda Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:14 -0800 Subject: Docs/admin-guide/mm/damon/lru_sort: document active_mem_bp parameter Document a newly added DAMON_LRU_SORT parameter for doing auto-tuning aiming an active to inactive memory size ratio. Link: https://lkml.kernel.org/r/20260113152717.70459-10-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/lru_sort.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index bb222a32aefd..6af3ab5579a3 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -79,6 +79,18 @@ of parametrs except ``enabled`` again. Once the re-reading is done, this parameter is set as ``N``. If invalid parameters are found while the re-reading, DAMON_LRU_SORT will be disabled. +active_mem_bp +------------- + +Desired active to [in]active memory ratio in bp (1/10,000). + +While keeping the caps that set by other quotas, DAMON_LRU_SORT automatically +increases and decreases the effective level of the quota aiming the LRU +[de]prioritizations of the hot and cold memory resulting in this active to +[in]active memory ratio. Value zero means disabling this auto-tuning feature. + +Disabled by default. + filter_young_pages ------------------ -- cgit v1.2.3 From 4bdd692291275eaaabe993e1c4a7b5b01cd6dc37 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:15 -0800 Subject: mm/damon/lru_sort: add monitoring intervals auto-tuning parameter DAMON monitoring intervals tuning was crucial for every DAMON use case. Now there are a tuning guideline and an automated intervals tuning feature. DAMON_LRU_SORT is still using manual control of intervals. Add a module parameter for utilizing the auto-tuning feature with a suggested auto-tuning parameters. 
Link: https://lkml.kernel.org/r/20260113152717.70459-11-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8af97642912a..8296f984b428 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -55,6 +55,20 @@ module_param(commit_inputs, bool, 0600); static unsigned long active_mem_bp __read_mostly; module_param(active_mem_bp, ulong, 0600); +/* + * Auto-tune monitoring intervals. + * + * If this parameter is set as ``Y``, DAMON_LRU_SORT automatically tunes + * DAMON's sampling and aggregation intervals. The auto-tuning aims to capture + * meaningful amount of access events in each DAMON-snapshot, while keeping the + * sampling interval 5 milliseconds in minimum, and 10 seconds in maximum. + * Setting this as ``N`` disables the auto-tuning. + * + * Disabled by default. + */ +static bool autotune_monitoring_intervals __read_mostly; +module_param(autotune_monitoring_intervals, bool, 0600); + /* * Filter [non-]young pages accordingly for LRU [de]prioritizations. * @@ -268,6 +282,7 @@ static int damon_lru_sort_apply_parameters(void) { struct damon_ctx *param_ctx; struct damon_target *param_target; + struct damon_attrs attrs; struct damos *hot_scheme, *cold_scheme; unsigned int hot_thres, cold_thres; int err; @@ -290,18 +305,27 @@ static int damon_lru_sort_apply_parameters(void) goto out; } - err = damon_set_attrs(param_ctx, &damon_lru_sort_mon_attrs); + attrs = damon_lru_sort_mon_attrs; + if (autotune_monitoring_intervals) { + attrs.sample_interval = 5000; + attrs.aggr_interval = 100000; + attrs.intervals_goal.access_bp = 40; + attrs.intervals_goal.aggrs = 3; + attrs.intervals_goal.min_sample_us = 5000; + attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000; + } + err = damon_set_attrs(param_ctx, &attrs); if (err) goto out; err = -ENOMEM; - hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * + hot_thres = damon_max_nr_accesses(&attrs) * hot_thres_access_freq / 1000; hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!hot_scheme) goto out; - cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; + cold_thres = cold_min_age / attrs.aggr_interval; cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres); if (!cold_scheme) { damon_destroy_scheme(hot_scheme); -- cgit v1.2.3 From ed581147a417940857eeea609229de0f5de5617f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Jan 2026 07:27:16 -0800 Subject: Docs/admin-guide/mm/damon/lru_sort: document intervals autotuning Document a newly added DAMON_LRU_SORT module parameter for using monitoring intervals auto-tuning feature of DAMON. 
Link: https://lkml.kernel.org/r/20260113152717.70459-12-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/lru_sort.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index 6af3ab5579a3..20a8378d5a94 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -91,6 +91,17 @@ increases and decreases the effective level of the quota aiming the LRU Disabled by default. +Auto-tune monitoring intervals +------------------------------ + +If this parameter is set as ``Y``, DAMON_LRU_SORT automatically tunes DAMON's +sampling and aggregation intervals. The auto-tuning aims to capture meaningful +amount of access events in each DAMON-snapshot, while keeping the sampling +interval 5 milliseconds in minimum, and 10 seconds in maximum. Setting this as +``N`` disables the auto-tuning. + +Disabled by default. + filter_young_pages ------------------ -- cgit v1.2.3 From 79ffad20ebc05eb4e5dc942cdedbfbf0796c18c9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 13 Jan 2026 10:11:50 +0100 Subject: mm: kmsan: add tests for high-order page freeing Add regression tests to verify that KMSAN correctly poisons the full memory range when freeing pages. Specifically, verify that accessing the tail pages of a high-order non-compound allocation triggers a use-after-free report. This ensures that the fix "mm: kmsan: Fix poisoning of high-order non-compound pages" is working as expected. Also add a test for standard order-0 pages for completeness. Link: https://lore.kernel.org/all/20260104134348.3544298-1-ryan.roberts@arm.com/ Link: https://lkml.kernel.org/r/20260113091151.4035013-1-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Ryan Roberts Cc: Dmitriy Vyukov Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 902ec48b1e3e..ba44bf2072bb 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -361,7 +361,7 @@ static void test_init_vmalloc(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } -/* Test case: ensure that use-after-free reporting works. */ +/* Test case: ensure that use-after-free reporting works for kmalloc. */ static void test_uaf(struct kunit *test) { EXPECTATION_USE_AFTER_FREE(expect); @@ -378,6 +378,51 @@ static void test_uaf(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } +static volatile char *test_uaf_pages_helper(int order, int offset) +{ + struct page *page; + volatile char *var; + + /* Memory is initialized up until __free_pages() thanks to __GFP_ZERO. */ + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + var = page_address(page) + offset; + __free_pages(page, order); + + return var; +} + +/* Test case: ensure that use-after-free reporting works for a freed page. */ +static void test_uaf_pages(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile char value; + + kunit_info(test, "use-after-free on a freed page (UMR report)\n"); + /* Allocate a single page, free it, then try to access it. 
*/ + value = *test_uaf_pages_helper(0, 3); + USE(value); + + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that UAF reporting works for high order pages. */ +static void test_uaf_high_order_pages(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile char value; + + kunit_info(test, + "use-after-free on a freed high-order page (UMR report)\n"); + /* + * Create a high-order non-compound page, free it, then try to access + * its tail page. + */ + value = *test_uaf_pages_helper(1, PAGE_SIZE + 3); + USE(value); + + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + /* * Test case: ensure that uninitialized values are propagated through per-CPU * memory. @@ -683,6 +728,8 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_init_kmsan_vmap_vunmap), KUNIT_CASE(test_init_vmalloc), KUNIT_CASE(test_uaf), + KUNIT_CASE(test_uaf_pages), + KUNIT_CASE(test_uaf_high_order_pages), KUNIT_CASE(test_percpu_propagate), KUNIT_CASE(test_printk), KUNIT_CASE(test_init_memcpy), -- cgit v1.2.3 From 737dfe7d95263ae8e47e07a528e3676ffad6f59a Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 13 Jan 2026 10:11:51 +0100 Subject: mm: kmsan: add test_uninit_page Test that pages allocated with alloc_page() are uninitialized by default. Link: https://lkml.kernel.org/r/20260113091151.4035013-2-glider@google.com Signed-off-by: Alexander Potapenko Cc: Dmitriy Vyukov Cc: Marco Elver Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index ba44bf2072bb..81e642db6e23 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -378,6 +378,20 @@ static void test_uaf(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } +static void test_uninit_page(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + struct page *page; + int *ptr; + + kunit_info(test, "uninitialized page allocation (UMR report)\n"); + page = alloc_pages(GFP_KERNEL, 0); + ptr = page_address(page); + USE(*ptr); + __free_pages(page, 0); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + static volatile char *test_uaf_pages_helper(int order, int offset) { struct page *page; @@ -727,6 +741,7 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_uninit_kmsan_check_memory), KUNIT_CASE(test_init_kmsan_vmap_vunmap), KUNIT_CASE(test_init_vmalloc), + KUNIT_CASE(test_uninit_page), KUNIT_CASE(test_uaf), KUNIT_CASE(test_uaf_pages), KUNIT_CASE(test_uaf_high_order_pages), -- cgit v1.2.3 From dc2e4982cb018306f0699cd460a9033467f07be5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 13 Jan 2026 12:46:45 +0900 Subject: zsmalloc: introduce SG-list based object read API Currently, zsmalloc performs address linearization on read (which sometimes requires memcpy() to a local buffer). Not all zsmalloc users need a linear address. For example, Crypto API supports SG-list, performing linearization under the hood, if needed. In addition, some compressors can have native SG-list support, completely avoiding the linearization step. Provide an SG-list based zsmalloc read API: - zs_obj_read_sg_begin() - zs_obj_read_sg_end() This API allows callers to obtain an SG representation of the object (one entry for objects that are contained in a single page and two entries for spanning objects), avoiding the need for a bounce buffer and memcpy. 
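As a usage illustration (hedged; not taken from the patch itself), a caller that still wants a private linear copy could bracket its access with the two new entry points and the generic scatterlist helpers roughly as below. Real users such as zram/zswap would more likely hand the SG list straight to the Crypto API instead of copying; the function name and error handling here are assumptions.

#include <linux/scatterlist.h>
#include <linux/zsmalloc.h>

/*
 * Illustrative sketch: copy a zsmalloc object of 'len' bytes into 'dst'
 * via the SG-list read API.  At most two SG entries are needed because an
 * object spans at most two pages.
 */
static int zs_read_to_buffer(struct zs_pool *pool, unsigned long handle,
			     void *dst, size_t len)
{
	struct scatterlist sg[2];
	size_t copied;

	zs_obj_read_sg_begin(pool, handle, sg, len);
	copied = sg_copy_to_buffer(sg, sg_nents(sg), dst, len);
	zs_obj_read_sg_end(pool, handle);

	return copied == len ? 0 : -EIO;
}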
[senozhatsky@chromium.org: make zs_obj_read_sg_begin() return void, per Yosry] Link: https://lkml.kernel.org/r/20260117024900.792237-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20260113034645.2729998-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Herbert Xu Tested-by: Yosry Ahmed Cc: Herbert Xu Cc: Brian Geffon Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/zsmalloc.h | 4 +++ mm/zsmalloc.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 5565c3171007..478410c880b1 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -22,6 +22,7 @@ struct zs_pool_stats { }; struct zs_pool; +struct scatterlist; struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); @@ -43,6 +44,9 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, size_t mem_len, void *local_copy); void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, size_t mem_len, void *handle_mem); +void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle, + struct scatterlist *sg, size_t mem_len); +void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle); void zs_obj_write(struct zs_pool *pool, unsigned long handle, void *handle_mem, size_t mem_len); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index cc3d9501ae21..dccb88d52c07 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1141,6 +1142,68 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, } EXPORT_SYMBOL_GPL(zs_obj_read_end); +void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle, + struct scatterlist *sg, size_t mem_len) +{ + struct zspage *zspage; + struct zpdesc *zpdesc; + unsigned long obj, off; + unsigned int obj_idx; + struct size_class *class; + + /* Guarantee we can get zspage from handle safely */ + read_lock(&pool->lock); + obj = handle_to_obj(handle); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); + + /* Make sure migration doesn't move any pages in this zspage */ + zspage_read_lock(zspage); + read_unlock(&pool->lock); + + class = zspage_class(pool, zspage); + off = offset_in_page(class->size * obj_idx); + + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + sg_init_table(sg, 1); + sg_set_page(sg, zpdesc_page(zpdesc), mem_len, off); + } else { + size_t sizes[2]; + + /* this object spans two pages */ + sizes[0] = PAGE_SIZE - off; + sizes[1] = mem_len - sizes[0]; + + sg_init_table(sg, 2); + sg_set_page(sg, zpdesc_page(zpdesc), sizes[0], off); + + zpdesc = get_next_zpdesc(zpdesc); + sg = sg_next(sg); + + sg_set_page(sg, zpdesc_page(zpdesc), sizes[1], 0); + } +} +EXPORT_SYMBOL_GPL(zs_obj_read_sg_begin); + +void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle) +{ + struct zspage *zspage; + struct zpdesc *zpdesc; + unsigned long obj; + unsigned int obj_idx; + + obj = handle_to_obj(handle); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); + + zspage_read_unlock(zspage); +} +EXPORT_SYMBOL_GPL(zs_obj_read_sg_end); + void zs_obj_write(struct zs_pool *pool, unsigned long handle, void *handle_mem, size_t mem_len) { -- cgit v1.2.3 From 3d702678f57edc524f73a7865382ae304269f590 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Tue, 
23 Dec 2025 19:05:23 +0800 Subject: mm/mempolicy: fix mpol_rebind_nodemask() for MPOL_F_NUMA_BALANCING

commit bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") adds a new flag MPOL_F_NUMA_BALANCING to enable NUMA balancing for the MPOL_BIND memory policy. When the cpuset of a task changes, the task's mempolicy is rebound by mpol_rebind_nodemask().

When MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are both clear, the rebinding behaviour should be the same whether MPOL_F_NUMA_BALANCING is set or not. So, when an application calls set_mempolicy() with MPOL_F_NUMA_BALANCING set but both MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES cleared, mempolicy.w.cpuset_mems_allowed should be set to the cpuset_current_mems_allowed nodemask. However, in the current implementation, mpol_store_user_nodemask() wrongly returns true, causing mempolicy->w.user_nodemask to be incorrectly set to the user-specified nodemask. Later, when the cpuset of the application changes, mpol_rebind_nodemask() ends up rebinding based on the user-specified nodemask rather than the cpuset_mems_allowed nodemask as intended.

I can reproduce this with the following steps in qemu with 4 NUMA nodes:

1. echo '+cpuset' > /sys/fs/cgroup/cgroup.subtree_control
2. mkdir /sys/fs/cgroup/test
3. ./reproducer &
4. cat /proc/$pid/numa_maps, the task is bound to NUMA node 1
5. echo $pid > /sys/fs/cgroup/test/cgroup.procs
6. cat /proc/$pid/numa_maps, the task is now bound to NUMA node 0

The reproducer code:

        #include <stdio.h>
        #include <stdlib.h>
        #include <numa.h>
        #include <numaif.h>

        int main(void)
        {
                struct bitmask *bmp;
                int ret;

                bmp = numa_parse_nodestring("1");
                ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
                                    bmp->maskp, bmp->size + 1);
                if (ret < 0) {
                        perror("Failed to call set_mempolicy");
                        exit(-1);
                }

                while (1)
                        ;
                return 0;
        }

If set_mempolicy() is called without MPOL_F_NUMA_BALANCING in the reproducer code, the task is still bound to NUMA node 1 after step 5.

To fix this, only set mempolicy->w.user_nodemask to the user-specified nodemask if MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES is present.
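To make the fix concrete, below is a condensed sketch of the decision the patch changes. This is illustration only, not the actual mm/mempolicy.c code: record_rebind_reference() is a hypothetical helper standing in for the logic around mpol_set_nodemask(), and only the flag test mirrors the real one-line change in the diff that follows.

        #define MPOL_USER_NODEMASK_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)

        static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
        {
                /* Was "pol->flags & MPOL_MODE_FLAGS", which also matched
                 * MPOL_F_NUMA_BALANCING and made the policy remember the
                 * user-specified nodemask by mistake. */
                return pol->flags & MPOL_USER_NODEMASK_FLAGS;
        }

        /* Hypothetical helper, only to show where the result is consumed. */
        static void record_rebind_reference(struct mempolicy *pol,
                                            const nodemask_t *user_nodes)
        {
                if (mpol_store_user_nodemask(pol))
                        pol->w.user_nodemask = *user_nodes;
                else
                        pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
        }

With this, a policy created with MPOL_BIND | MPOL_F_NUMA_BALANCING (and neither static nor relative nodes) records cpuset_current_mems_allowed, so a later cpuset change rebinds relative to the cpuset rather than to the nodemask passed to set_mempolicy().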
Link: https://lkml.kernel.org/r/20260120011018.1256654-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20251223110523.1161421-1-tujinjiang@huawei.com Fixes: bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") Signed-off-by: Jinjiang Tu Reviewed-by: Gregory Price Reviewed-by: Huang Ying Acked-by: David Hildenbrand (Red Hat) Cc: Alistair Popple Cc: Byungchul Park Cc: Joshua Hahn Cc: Kefeng Wang Cc: Mathew Brost Cc: Mel Gorman Cc: Rakie Kim Cc: Zi Yan Signed-off-by: Andrew Morton --- include/uapi/linux/mempolicy.h | 3 +++ mm/mempolicy.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8fbbe613611a..6c962d866e86 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -39,6 +39,9 @@ enum { #define MPOL_MODE_FLAGS \ (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING) +/* Whether the nodemask is specified by users */ +#define MPOL_USER_NODEMASK_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) + /* Flags for get_mempolicy */ #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */ #define MPOL_F_ADDR (1<<1) /* look up vma using address */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 68a98ba57882..76da50425712 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -365,7 +365,7 @@ static const struct mempolicy_operations { static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & MPOL_MODE_FLAGS; + return pol->flags & MPOL_USER_NODEMASK_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, -- cgit v1.2.3 From 832d95b5314eea558cf4cc9ca40db10122ce8f63 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 9 Jan 2026 04:13:43 +0000 Subject: migrate: replace RMP_ flags with TTU_ flags Instead of translating between RMP_ and TTU_ flags, remove the RMP_ flags and just use the TTU_ flag space; there's plenty available. Possibly we should rename these to RMAP_ flags, and maybe even pass them in through rmap_walk_arg, but that can be done later. 
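For illustration, the caller-visible effect of dropping the translation, condensed from the hunks in this patch (call-site fragments only, not complete functions):

        /* Before: migration callers used a private flag space, and the
         * hugetlb path had to translate TTU_RMAP_LOCKED into RMP_LOCKED. */
        remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
        remove_migration_ptes(src, !rc ? dst : src, ttu ? RMP_LOCKED : 0);

        /* After: the TTU_ flag space is used directly, so the ttu flags a
         * caller already holds can simply be forwarded. */
        remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags);
        remove_migration_ptes(src, !rc ? dst : src, ttu);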
Link: https://lkml.kernel.org/r/20260109041345.3863089-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: Jann Horn Cc: Joshua Hahn Cc: Lance Yang Cc: Liam Howlett Cc: Matthew Brost Cc: Rakie Kim Cc: Rik van Riel Cc: Vlastimil Babka Cc: Ying Huang Signed-off-by: Andrew Morton --- include/linux/rmap.h | 9 +++------ mm/huge_memory.c | 8 ++++---- mm/migrate.c | 12 ++++++------ 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dd764951b03d..8dc0871e5f00 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -92,6 +92,7 @@ struct anon_vma_chain { }; enum ttu_flags { + TTU_USE_SHARED_ZEROPAGE = 0x2, /* for unused pages of large folios */ TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ @@ -933,12 +934,8 @@ int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -enum rmp_flags { - RMP_LOCKED = 1 << 0, - RMP_USE_SHARED_ZEROPAGE = 1 << 1, -}; - -void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); +void remove_migration_ptes(struct folio *src, struct folio *dst, + enum ttu_flags flags); /* * rmap_walk_control: To control rmap traversing for specific needs diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40cf59301c21..44ff8a648afd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3431,7 +3431,7 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, RMP_LOCKED | flags); + remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags); i += folio_nr_pages(folio); if (i >= nr) break; @@ -3944,7 +3944,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, int old_order = folio_order(folio); struct folio *new_folio, *next; int nr_shmem_dropped = 0; - int remap_flags = 0; + enum ttu_flags ttu_flags = 0; int ret; pgoff_t end = 0; @@ -4064,9 +4064,9 @@ fail: shmem_uncharge(mapping->host, nr_shmem_dropped); if (!ret && is_anon && !folio_is_device_private(folio)) - remap_flags = RMP_USE_SHARED_ZEROPAGE; + ttu_flags = TTU_USE_SHARED_ZEROPAGE; - remap_page(folio, 1 << old_order, remap_flags); + remap_page(folio, 1 << old_order, ttu_flags); /* * Unlock all after-split folios except the one containing diff --git a/mm/migrate.c b/mm/migrate.c index 4688b9e38cd2..4750a2ba15fe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -452,11 +452,12 @@ static bool remove_migration_pte(struct folio *folio, * Get rid of all migration entries and replace them by * references to the indicated page. 
*/ -void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) +void remove_migration_ptes(struct folio *src, struct folio *dst, + enum ttu_flags flags) { struct rmap_walk_arg rmap_walk_arg = { .folio = src, - .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, + .map_unused_to_zeropage = flags & TTU_USE_SHARED_ZEROPAGE, }; struct rmap_walk_control rwc = { @@ -464,9 +465,9 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) .arg = &rmap_walk_arg, }; - VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); + VM_BUG_ON_FOLIO((flags & TTU_USE_SHARED_ZEROPAGE) && (src != dst), src); - if (flags & RMP_LOCKED) + if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(dst, &rwc); else rmap_walk(dst, &rwc); @@ -1521,8 +1522,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, !rc ? dst : src, - ttu ? RMP_LOCKED : 0); + remove_migration_ptes(src, !rc ? dst : src, ttu); if (ttu & TTU_RMAP_LOCKED) i_mmap_unlock_write(mapping); -- cgit v1.2.3
From 9ac4941aceb027809cc32689a2944fa7a69388e4 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Fri, 19 Dec 2025 04:09:33 +1100 Subject: arm64/mm: add addr parameter to __set_ptes_anysz()

Patch series "Support page table check on PowerPC", v18.

Support page table check on PowerPC. Page table check tracks the usage of page table entries at each level to ensure that anonymous mappings have at most one writable consumer, and likewise that file-backed mappings are not simultaneously also anonymous mappings.

In order to support this infrastructure, a number of helpers or stubs must be defined or updated for all powerpc platforms. Additionally, we separate set_pte_at() and set_pte_at_unchecked(), to allow for internal, uninstrumented mappings.

On some PowerPC platforms, implementing {pte,pmd,pud}_user_accessible_page() requires the address. We revert previous changes that removed the address parameter from various interfaces, and add it to some other interfaces, in order to allow this.

For now, we don't allow page table check alongside HUGETLB_PAGE, due to the arch-specific complexity of set_huge_pte_at(). (I'm sure I could figure this out, but I have to get this version on this list before I leave my job.)

This series was initially written by Rohan McLure, who has left IBM and is no longer working on powerpc.

This patch (of 18):

To provide support for page table check on powerpc, we need to reinstate the address parameter in several functions, including page_table_check_{ptes,pmds,puds}_set(). In preparation for this, add the addr parameter to arm64's __set_ptes_anysz() and change its callsites accordingly.
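Condensed, the arm64 change amounts to threading the address through the size-generic helper so the wrappers no longer discard it (a sketch of the signatures touched by this patch; bodies elided):

        /*
         * New shape of the size-generic helper (body elided; later patches in
         * this series forward addr from here into the
         * page_table_check_{ptes,pmds,puds}_set() hooks):
         *
         *   void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr,
         *                         pte_t *ptep, pte_t pte, unsigned int nr,
         *                         unsigned long pgsize);
         */
        static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
                                      pte_t *ptep, pte_t pte, unsigned int nr)
        {
                /* addr was previously __always_unused and dropped here */
                __set_ptes_anysz(mm, addr, ptep, pte, nr, PAGE_SIZE);
        }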
Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-0-755bc151a50b@linux.ibm.com Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-1-755bc151a50b@linux.ibm.com Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Cc: Alexandre Ghiti Cc: Christophe Leroy Cc: Ingo Molnar Cc: Rohan McLure Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 19 ++++++++----------- arch/arm64/mm/hugetlbpage.c | 10 +++++----- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 445e18e92221..52f3ea07427c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -673,8 +673,8 @@ static inline pgprot_t pud_pgprot(pud_t pud) return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); } -static inline void __set_ptes_anysz(struct mm_struct *mm, pte_t *ptep, - pte_t pte, unsigned int nr, +static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr, unsigned long pgsize) { unsigned long stride = pgsize >> PAGE_SHIFT; @@ -709,26 +709,23 @@ static inline void __set_ptes_anysz(struct mm_struct *mm, pte_t *ptep, __set_pte_complete(pte); } -static inline void __set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, +static inline void __set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { - __set_ptes_anysz(mm, ptep, pte, nr, PAGE_SIZE); + __set_ptes_anysz(mm, addr, ptep, pte, nr, PAGE_SIZE); } -static inline void __set_pmds(struct mm_struct *mm, - unsigned long __always_unused addr, +static inline void __set_pmds(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { - __set_ptes_anysz(mm, (pte_t *)pmdp, pmd_pte(pmd), nr, PMD_SIZE); + __set_ptes_anysz(mm, addr, (pte_t *)pmdp, pmd_pte(pmd), nr, PMD_SIZE); } #define set_pmd_at(mm, addr, pmdp, pmd) __set_pmds(mm, addr, pmdp, pmd, 1) -static inline void __set_puds(struct mm_struct *mm, - unsigned long __always_unused addr, +static inline void __set_puds(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr) { - __set_ptes_anysz(mm, (pte_t *)pudp, pud_pte(pud), nr, PUD_SIZE); + __set_ptes_anysz(mm, addr, (pte_t *)pudp, pud_pte(pud), nr, PUD_SIZE); } #define set_pud_at(mm, addr, pudp, pud) __set_puds(mm, addr, pudp, pud, 1) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f8dd58ab67a8..b26cc64a1bae 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -221,8 +221,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, ncontig = num_contig_ptes(sz, &pgsize); if (!pte_present(pte)) { - for (i = 0; i < ncontig; i++, ptep++) - __set_ptes_anysz(mm, ptep, pte, 1, pgsize); + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) + __set_ptes_anysz(mm, addr, ptep, pte, 1, pgsize); return; } @@ -230,7 +230,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (pte_cont(pte) && pte_valid(__ptep_get(ptep))) 
clear_flush(mm, addr, ptep, pgsize, ncontig); - __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); + __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -445,7 +445,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, if (pte_young(orig_pte)) pte = pte_mkyoung(pte); - __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); + __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize); return 1; } @@ -469,7 +469,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); pte = pte_wrprotect(pte); - __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); + __set_ptes_anysz(mm, addr, ptep, pte, ncontig, pgsize); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, -- cgit v1.2.3 From ee329c29fde849a8b541a836de742a454942589e Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Fri, 19 Dec 2025 04:09:34 +1100 Subject: arm64/mm: add addr parameter to __ptep_get_and_clear_anysz() To provide support for page table check on powerpc, we need to reinstate the address parameter in several functions, including page_table_check_{pte,pmd,pud}_clear(). In preparation for this, add the addr parameter to arm64's __ptep_get_and_clear_anysz() and change its callsites accordingly. Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-2-755bc151a50b@linux.ibm.com Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Rohan McLure Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 5 +++-- arch/arm64/mm/hugetlbpage.c | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 52f3ea07427c..29f7ae7011a8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1332,6 +1332,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, + unsigned long address, pte_t *ptep, unsigned long pgsize) { @@ -1359,7 +1360,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - return __ptep_get_and_clear_anysz(mm, ptep, PAGE_SIZE); + return __ptep_get_and_clear_anysz(mm, address, ptep, PAGE_SIZE); } static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, @@ -1398,7 +1399,7 @@ static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - return pte_pmd(__ptep_get_and_clear_anysz(mm, (pte_t *)pmdp, PMD_SIZE)); + return pte_pmd(__ptep_get_and_clear_anysz(mm, address, (pte_t *)pmdp, PMD_SIZE)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 
b26cc64a1bae..a42c05cf5640 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -155,11 +155,12 @@ static pte_t get_clear_contig(struct mm_struct *mm, pte_t pte, tmp_pte; bool present; - pte = __ptep_get_and_clear_anysz(mm, ptep, pgsize); + pte = __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize); present = pte_present(pte); while (--ncontig) { ptep++; - tmp_pte = __ptep_get_and_clear_anysz(mm, ptep, pgsize); + addr += pgsize; + tmp_pte = __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize); if (present) { if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); @@ -203,7 +204,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - __ptep_get_and_clear_anysz(mm, ptep, pgsize); + __ptep_get_and_clear_anysz(mm, addr, ptep, pgsize); if (mm == &init_mm) flush_tlb_kernel_range(saddr, addr); -- cgit v1.2.3 From c4a0c5ff85b7ca0d5fbd71888965f40e55295b19 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:35 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pud[s]_set() This reverts commit 6d144436d954 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pud_set"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. Apply this to __page_table_check_puds_set(), page_table_check_puds_set() and the page_table_check_pud_set() wrapper macro. [ajd@linux.ibm.com: rebase on riscv + arm64 changes, update commit message] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-3-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 3 ++- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 12 ++++++------ mm/page_table_check.c | 4 ++-- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 29f7ae7011a8..87ed9b1c011e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -688,7 +688,8 @@ static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr, break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - page_table_check_puds_set(mm, (pud_t *)ptep, pte_pud(pte), nr); + page_table_check_puds_set(mm, addr, (pud_t *)ptep, + pte_pud(pte), nr); break; #endif default: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 9acd58a67123..07705adee128 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -953,7 +953,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, 
pud_t pud) { - page_table_check_pud_set(mm, pudp, pud); + page_table_check_pud_set(mm, addr, pudp, pud); return __set_pte_at(mm, (pte_t *)pudp, pud_pte(pud)); } @@ -1122,7 +1122,7 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, static inline pud_t pudp_establish(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(vma->vm_mm, pudp, pud); + page_table_check_pud_set(vma->vm_mm, address, pudp, pud); return __pud(atomic_long_xchg((atomic_long_t *)pudp, pud_val(pud))); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2842fa1f7a2c..2b540c563d8d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1221,7 +1221,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, pudp, pud); + page_table_check_pud_set(mm, addr, pudp, pud); native_set_pud(pudp, pud); } @@ -1372,7 +1372,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, static inline pud_t pudp_establish(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(vma->vm_mm, pudp, pud); + page_table_check_pud_set(vma->vm_mm, address, pudp, pud); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pudp, pud); } else { diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 289620d4aad3..0bf18b884a12 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -21,8 +21,8 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, unsigned int nr); -void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, - unsigned int nr); +void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud, unsigned int nr); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); @@ -86,12 +86,12 @@ static inline void page_table_check_pmds_set(struct mm_struct *mm, } static inline void page_table_check_puds_set(struct mm_struct *mm, - pud_t *pudp, pud_t pud, unsigned int nr) + unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_puds_set(mm, pudp, pud, nr); + __page_table_check_puds_set(mm, addr, pudp, pud, nr); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, @@ -137,7 +137,7 @@ static inline void page_table_check_pmds_set(struct mm_struct *mm, } static inline void page_table_check_puds_set(struct mm_struct *mm, - pud_t *pudp, pud_t pud, unsigned int nr) + unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr) { } @@ -150,6 +150,6 @@ static inline void page_table_check_pte_clear_range(struct mm_struct *mm, #endif /* CONFIG_PAGE_TABLE_CHECK */ #define page_table_check_pmd_set(mm, pmdp, pmd) page_table_check_pmds_set(mm, pmdp, pmd, 1) -#define page_table_check_pud_set(mm, pudp, pud) page_table_check_puds_set(mm, pudp, pud, 1) +#define page_table_check_pud_set(mm, addr, pudp, pud) page_table_check_puds_set(mm, addr, pudp, pud, 1) #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 741884645ab0..a48f835216a1 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -243,8 +243,8 @@ 
void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, } EXPORT_SYMBOL(__page_table_check_pmds_set); -void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, - unsigned int nr) +void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud, unsigned int nr) { unsigned long stride = PUD_SIZE >> PAGE_SHIFT; unsigned int i; -- cgit v1.2.3 From 6e2d8f9fc4edcbf9f4dd953e1f41b0ff64867e5b Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:36 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pmd[s]_set() This reverts commit a3b837130b58 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. Apply this to __page_table_check_pmds_set(), page_table_check_pmd_set(), and the page_table_check_pmd_set() wrapper macro. [ajd@linux.ibm.com: rebase on arm64 + riscv changes, update commit message] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-4-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 5 +++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 12 ++++++------ mm/page_table_check.c | 4 ++-- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 87ed9b1c011e..4b580d6246f5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -684,7 +684,8 @@ static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr, page_table_check_ptes_set(mm, ptep, pte, nr); break; case PMD_SIZE: - page_table_check_pmds_set(mm, (pmd_t *)ptep, pte_pmd(pte), nr); + page_table_check_pmds_set(mm, addr, (pmd_t *)ptep, + pte_pmd(pte), nr); break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: @@ -1489,7 +1490,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); } #endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 07705adee128..82b1c79bc2dd 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -946,7 +946,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, pmdp, pmd); + 
page_table_check_pmd_set(mm, addr, pmdp, pmd); return __set_pte_at(mm, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -1023,7 +1023,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2b540c563d8d..7fd876f8d828 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1214,7 +1214,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, pmdp, pmd); + page_table_check_pmd_set(mm, addr, pmdp, pmd); set_pmd(pmdp, pmd); } @@ -1357,7 +1357,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 0bf18b884a12..cf7c28d8d468 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -19,8 +19,8 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr); -void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, - unsigned int nr); +void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd, unsigned int nr); void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -77,12 +77,12 @@ static inline void page_table_check_ptes_set(struct mm_struct *mm, } static inline void page_table_check_pmds_set(struct mm_struct *mm, - pmd_t *pmdp, pmd_t pmd, unsigned int nr) + unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmds_set(mm, pmdp, pmd, nr); + __page_table_check_pmds_set(mm, addr, pmdp, pmd, nr); } static inline void page_table_check_puds_set(struct mm_struct *mm, @@ -132,7 +132,7 @@ static inline void page_table_check_ptes_set(struct mm_struct *mm, } static inline void page_table_check_pmds_set(struct mm_struct *mm, - pmd_t *pmdp, pmd_t pmd, unsigned int nr) + unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { } @@ -149,7 +149,7 @@ static inline void page_table_check_pte_clear_range(struct mm_struct *mm, #endif /* CONFIG_PAGE_TABLE_CHECK */ -#define page_table_check_pmd_set(mm, pmdp, pmd) page_table_check_pmds_set(mm, pmdp, pmd, 1) +#define page_table_check_pmd_set(mm, addr, pmdp, pmd) page_table_check_pmds_set(mm, addr, pmdp, pmd, 1) #define page_table_check_pud_set(mm, addr, pudp, pud) page_table_check_puds_set(mm, addr, pudp, pud, 1) #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/page_table_check.c b/mm/page_table_check.c index a48f835216a1..86dc4e4d1dad 100644 --- a/mm/page_table_check.c 
+++ b/mm/page_table_check.c @@ -225,8 +225,8 @@ static inline void page_table_check_pmd_flags(pmd_t pmd) } } -void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, - unsigned int nr) +void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd, unsigned int nr) { unsigned long stride = PMD_SIZE >> PAGE_SHIFT; unsigned int i; -- cgit v1.2.3 From 0a5ae4483177a621f5498c349d31f24b1ef10739 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:37 +1100 Subject: mm/page_table_check: provide addr parameter to page_table_check_ptes_set() To provide support for powerpc platforms, provide an addr parameter to the __page_table_check_ptes_set() and page_table_check_ptes_set() routines. This parameter is needed on some powerpc platforms which do not encode whether a mapping is for user or kernel in the pte. On such platforms, this can be inferred from the addr parameter. [ajd@linux.ibm.com: rebase on arm64 + riscv changes, update commit message] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-5-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Reviewed-by: Pasha Tatashin Acked-by: Alexandre Ghiti # riscv Signed-off-by: Andrew Donnellan Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 12 +++++++----- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 4 ++-- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4b580d6246f5..d1dd0266bb0c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -681,7 +681,7 @@ static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr, switch (pgsize) { case PAGE_SIZE: - page_table_check_ptes_set(mm, ptep, pte, nr); + page_table_check_ptes_set(mm, addr, ptep, pte, nr); break; case PMD_SIZE: page_table_check_pmds_set(mm, addr, (pmd_t *)ptep, diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 82b1c79bc2dd..574a45a22454 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -627,7 +627,7 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval) static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pteval, nr); + page_table_check_ptes_set(mm, addr, ptep, pteval, nr); for (;;) { __set_pte_at(mm, ptep, pteval); diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index cf7c28d8d468..66e109238416 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,8 +17,8 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void 
__page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, - unsigned int nr); +void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr); void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, @@ -68,12 +68,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } static inline void page_table_check_ptes_set(struct mm_struct *mm, - pte_t *ptep, pte_t pte, unsigned int nr) + unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_ptes_set(mm, ptep, pte, nr); + __page_table_check_ptes_set(mm, addr, ptep, pte, nr); } static inline void page_table_check_pmds_set(struct mm_struct *mm, @@ -127,7 +128,8 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } static inline void page_table_check_ptes_set(struct mm_struct *mm, - pte_t *ptep, pte_t pte, unsigned int nr) + unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2f0dd3a4ace1..496873f44f67 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -429,7 +429,7 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pte, nr); + page_table_check_ptes_set(mm, addr, ptep, pte, nr); for (;;) { set_pte(ptep, pte); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 86dc4e4d1dad..2871d9c45368 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -196,8 +196,8 @@ static void page_table_check_pte_flags(pte_t pte) } } -void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, - unsigned int nr) +void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) { unsigned int i; -- cgit v1.2.3 From 2e6ac078ce5d6a9dc96cab861359faac508eb56d Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:38 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pud_clear() This reverts commit 931c38e16499 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pud_clear"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. 
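As a condensed example of a caller after the revert (taken from the generic pudp_huge_get_and_clear() hunk in this patch; surrounding context trimmed):

        static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                                    unsigned long address, pud_t *pudp)
        {
                pud_t pud = *pudp;

                pud_clear(pudp);
                /* The address the entry was cleared at is now passed along. */
                page_table_check_pud_clear(mm, address, pud);

                return pud;
        }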
[ajd@linux.ibm.com: rebase on arm64 changes] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-6-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 +++++++---- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 +++-- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d1dd0266bb0c..595405e6bfc7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1349,7 +1349,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - page_table_check_pud_clear(mm, pte_pud(pte)); + page_table_check_pud_clear(mm, address, pte_pud(pte)); break; #endif default: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 574a45a22454..e06727c975fe 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1101,7 +1101,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_clear(pudp); #endif - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, address, pud); return pud; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7fd876f8d828..3eb36a36058f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1330,7 +1330,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, addr, pud); return pud; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 66e109238416..808cc3a48c28 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -16,7 +16,8 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); -void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, @@ -59,12 +60,13 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) __page_table_check_pmd_clear(mm, pmd); } -static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) { if 
(static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_clear(mm, pud); + __page_table_check_pud_clear(mm, addr, pud); } static inline void page_table_check_ptes_set(struct mm_struct *mm, @@ -123,7 +125,8 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } -static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 496873f44f67..ed3c28ebeb35 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -801,7 +801,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, address, pud); return pud; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2871d9c45368..2295bc9368ab 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -167,7 +167,8 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) } EXPORT_SYMBOL(__page_table_check_pmd_clear); -void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) { if (&init_mm == mm) return; @@ -253,7 +254,7 @@ void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, return; for (i = 0; i < nr; i++) - __page_table_check_pud_clear(mm, *(pudp + i)); + __page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i)); if (pud_user_accessible_page(pud)) page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } -- cgit v1.2.3 From 649ec9e3d03c4908ef51731cd7b422c4a3e2ccff Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:39 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pmd_clear() This reverts commit 1831414cd729 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. 
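As with the pud variant, a condensed example of a caller after this revert (from the x86 pmdp_huge_get_and_clear() hunk in this patch; context trimmed):

        static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                                    unsigned long addr, pmd_t *pmdp)
        {
                pmd_t pmd = native_pmdp_get_and_clear(pmdp);

                /* addr is reinstated so address-based architectures can use it. */
                page_table_check_pmd_clear(mm, addr, pmd);

                return pmd;
        }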
[ajd@linux.ibm.com: rebase on arm64 changes] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-7-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 +++++++---- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 +++-- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 595405e6bfc7..5abad90913eb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1345,7 +1345,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, page_table_check_pte_clear(mm, pte); break; case PMD_SIZE: - page_table_check_pmd_clear(mm, pte_pmd(pte)); + page_table_check_pmd_clear(mm, address, pte_pmd(pte)); break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index e06727c975fe..6464a2c18ebe 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1007,7 +1007,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_clear(pmdp); #endif - page_table_check_pmd_clear(mm, pmd); + page_table_check_pmd_clear(mm, address, pmd); return pmd; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3eb36a36058f..5a2b2d3a80d8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1319,7 +1319,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, pmd); + page_table_check_pmd_clear(mm, addr, pmd); return pmd; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 808cc3a48c28..3973b69ae294 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -15,7 +15,8 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); -void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, @@ -52,12 +53,13 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) __page_table_check_pte_clear(mm, pte); } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) { if 
(static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_clear(mm, pmd); + __page_table_check_pmd_clear(mm, addr, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, @@ -121,7 +123,8 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ed3c28ebeb35..2d1f7369624c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -788,7 +788,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t pmd = *pmdp; pmd_clear(pmdp); - page_table_check_pmd_clear(mm, pmd); + page_table_check_pmd_clear(mm, address, pmd); return pmd; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2295bc9368ab..e8280b0b6dda 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -156,7 +156,8 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) } EXPORT_SYMBOL(__page_table_check_pte_clear); -void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) { if (&init_mm == mm) return; @@ -238,7 +239,7 @@ void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, page_table_check_pmd_flags(pmd); for (i = 0; i < nr; i++) - __page_table_check_pmd_clear(mm, *(pmdp + i)); + __page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i)); if (pmd_user_accessible_page(pmd)) page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } -- cgit v1.2.3 From d7b4b67eb6b37aef1723a69add88c9a7add81308 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:40 +1100 Subject: mm/page_table_check: reinstate address parameter in [__]page_table_check_pte_clear() This reverts commit aa232204c468 ("mm/page_table_check: remove unused parameter in [__]page_table_check_pte_clear"). Reinstate previously unused parameters for the purpose of supporting powerpc platforms, as many do not encode user/kernel ownership of the page in the pte, but instead in the address of the access. 
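Besides the plain call sites, the batched path needs a per-entry address; condensed from the __page_table_check_ptes_set() hunk in this patch (this is the loop handling mentioned in the rebase note below):

        /* Each entry in a batched set is now checked against its own address
         * rather than a single representative one. */
        for (i = 0; i < nr; i++)
                __page_table_check_pte_clear(mm, addr + PAGE_SIZE * i,
                                             ptep_get(ptep + i));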
[ajd@linux.ibm.com: rebase, fix additional occurrence and loop handling] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-8-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 11 +++++++---- include/linux/pgtable.h | 4 ++-- mm/page_table_check.c | 7 ++++--- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 5abad90913eb..ce64c560e284 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1342,7 +1342,7 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, switch (pgsize) { case PAGE_SIZE: - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, address, pte); break; case PMD_SIZE: page_table_check_pmd_clear(mm, address, pte_pmd(pte)); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 6464a2c18ebe..e3618d789aa4 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -664,7 +664,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, set_pte(ptep, __pte(0)); #endif - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, address, pte); return pte; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5a2b2d3a80d8..6ec6cf7ad2d4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1252,7 +1252,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, addr, pte); return pte; } @@ -1268,7 +1268,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, addr, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 3973b69ae294..12268a32e8be 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -14,7 +14,8 @@ extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); -void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, @@ -45,12 +46,13 @@ static 
inline void page_table_check_free(struct page *page, unsigned int order) __page_table_check_zero(page, order); } -static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_clear(mm, pte); + __page_table_check_pte_clear(mm, addr, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, @@ -119,7 +121,8 @@ static inline void page_table_check_free(struct page *page, unsigned int order) { } -static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2d1f7369624c..827dca25c0bc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -634,7 +634,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, address, pte); return pte; } #endif @@ -693,7 +693,7 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, addr, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH diff --git a/mm/page_table_check.c b/mm/page_table_check.c index e8280b0b6dda..de9e54bd27e6 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -145,7 +145,8 @@ void __page_table_check_zero(struct page *page, unsigned int order) rcu_read_unlock(); } -void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) { if (&init_mm == mm) return; @@ -209,7 +210,7 @@ void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, page_table_check_pte_flags(pte); for (i = 0; i < nr; i++) - __page_table_check_pte_clear(mm, ptep_get(ptep + i)); + __page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } @@ -275,7 +276,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, ptep_get(ptep)); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } -- cgit v1.2.3 From d79f9c9cf703d873849253f82fb9d6e1bd2b36f1 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:41 +1100 Subject: mm: provide address parameter to p{te,md,ud}_user_accessible_page() On several powerpc platforms, a page table entry may not imply whether the relevant mapping is for userspace or kernelspace. Instead, such platforms infer this by the address which is being accessed. Add an additional address argument to each of these routines in order to provide support for page table check on powerpc. 
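Condensed, the consumer side in mm/page_table_check.c simply forwards the address it already has, and architectures that can classify the mapping from the entry alone ignore the new argument (fragments from this patch):

        /* mm/page_table_check.c */
        if (pte_user_accessible_page(pte, addr))
                page_table_check_set(pte_pfn(pte), nr, pte_write(pte));

        /* arm64: the entry bits are sufficient, so addr is unused. */
        static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr)
        {
                return pte_valid(pte) && (pte_user(pte) || pte_user_exec(pte));
        }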
[ajd@linux.ibm.com: rebase on arm64 changes] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-9-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Ingo Molnar # x86 Acked-by: Alexandre Ghiti # riscv Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 6 +++--- arch/riscv/include/asm/pgtable.h | 6 +++--- arch/x86/include/asm/pgtable.h | 6 +++--- mm/page_table_check.c | 12 ++++++------ 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index ce64c560e284..d94445b4f3df 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1265,17 +1265,17 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, #endif #ifdef CONFIG_PAGE_TABLE_CHECK -static inline bool pte_user_accessible_page(pte_t pte) +static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr) { return pte_valid(pte) && (pte_user(pte) || pte_user_exec(pte)); } -static inline bool pmd_user_accessible_page(pmd_t pmd) +static inline bool pmd_user_accessible_page(pmd_t pmd, unsigned long addr) { return pmd_valid(pmd) && !pmd_table(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); } -static inline bool pud_user_accessible_page(pud_t pud) +static inline bool pud_user_accessible_page(pud_t pud, unsigned long addr) { return pud_valid(pud) && !pud_table(pud) && (pud_user(pud) || pud_user_exec(pud)); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index e3618d789aa4..9ecbf0366719 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -958,17 +958,17 @@ static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, } #ifdef CONFIG_PAGE_TABLE_CHECK -static inline bool pte_user_accessible_page(pte_t pte) +static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr) { return pte_present(pte) && pte_user(pte); } -static inline bool pmd_user_accessible_page(pmd_t pmd) +static inline bool pmd_user_accessible_page(pmd_t pmd, unsigned long addr) { return pmd_leaf(pmd) && pmd_user(pmd); } -static inline bool pud_user_accessible_page(pud_t pud) +static inline bool pud_user_accessible_page(pud_t pud, unsigned long addr) { return pud_leaf(pud) && pud_user(pud); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6ec6cf7ad2d4..1662c5a8f445 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1680,17 +1680,17 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void) #endif #ifdef CONFIG_PAGE_TABLE_CHECK -static inline bool pte_user_accessible_page(pte_t pte) +static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr) { return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); } -static inline bool pmd_user_accessible_page(pmd_t pmd) +static inline bool pmd_user_accessible_page(pmd_t pmd, unsigned long addr) { return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) 
&& (pmd_val(pmd) & _PAGE_USER); } -static inline bool pud_user_accessible_page(pud_t pud) +static inline bool pud_user_accessible_page(pud_t pud, unsigned long addr) { return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER); } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index de9e54bd27e6..2708c2b3ac1f 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -151,7 +151,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pte_user_accessible_page(pte)) { + if (pte_user_accessible_page(pte, addr)) { page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); } } @@ -163,7 +163,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pmd_user_accessible_page(pmd)) { + if (pmd_user_accessible_page(pmd, addr)) { page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); } } @@ -175,7 +175,7 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pud_user_accessible_page(pud)) { + if (pud_user_accessible_page(pud, addr)) { page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); } } @@ -211,7 +211,7 @@ void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i)); - if (pte_user_accessible_page(pte)) + if (pte_user_accessible_page(pte, addr)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } EXPORT_SYMBOL(__page_table_check_ptes_set); @@ -241,7 +241,7 @@ void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i)); - if (pmd_user_accessible_page(pmd)) + if (pmd_user_accessible_page(pmd, addr)) page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } EXPORT_SYMBOL(__page_table_check_pmds_set); @@ -257,7 +257,7 @@ void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i)); - if (pud_user_accessible_page(pud)) + if (pud_user_accessible_page(pud, addr)) page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } EXPORT_SYMBOL(__page_table_check_puds_set); -- cgit v1.2.3 From 2f5e576598c915db18b7ccd0003be52458959ce7 Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:42 +1100 Subject: powerpc/mm: implement *_user_accessible_page() for ptes Page table checking depends on architectures providing an implementation of p{te,md,ud}_user_accessible_page. With refactorisations made on powerpc/mm, the pte_access_permitted() and similar methods verify whether a userland page is accessible with the required permissions. Since page table checking is the only user of p{te,md,ud}_user_accessible_page(), implement these for all platforms, using some of the same preliminary checks taken by pte_access_permitted() on that platform. Since commit 8e9bd41e4ce1 ("powerpc/nohash: Replace pte_user() by pte_read()") pte_user() is no longer required to be present on all platforms as it may be equivalent to or implied by pte_read(). Hence implementations of pte_user_accessible_page() are specialised. 
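For huge mappings the same idea is reused by converting the entry and delegating to the PTE helper, with a generic fallback of false where no helper is provided; a condensed sketch of the hunks that follow (see the diff below for the exact placement):

/* Book3S-64: huge entries delegate to the PTE helper. */
#define pmd_user_accessible_page pmd_user_accessible_page
static inline bool pmd_user_accessible_page(pmd_t pmd, unsigned long addr)
{
	return pmd_leaf(pmd) && pte_user_accessible_page(pmd_pte(pmd), addr);
}

/* asm/pgtable.h: default to false when a platform defines no helper. */
#ifndef pmd_user_accessible_page
#define pmd_user_accessible_page(pmd, addr) false
#endif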
[ajd@linux.ibm.com: rebase and clean up] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-10-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Pasha Tatashin Acked-by: Madhavan Srinivasan Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/32/pgtable.h | 5 +++++ arch/powerpc/include/asm/book3s/64/pgtable.h | 17 +++++++++++++++++ arch/powerpc/include/asm/nohash/pgtable.h | 5 +++++ arch/powerpc/include/asm/pgtable.h | 8 ++++++++ 4 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 87dcca962be7..2edca1068b6f 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -437,6 +437,11 @@ static inline bool pte_access_permitted(pte_t pte, bool write) return true; } +static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr) +{ + return pte_present(pte) && !is_kernel_addr(addr); +} + /* Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. * diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index aac8ce30cd3b..2d69a827594f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -539,6 +539,11 @@ static inline bool pte_access_permitted(pte_t pte, bool write) return arch_pte_access_permitted(pte_val(pte), write, 0); } +static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr) +{ + return pte_present(pte) && pte_user(pte); +} + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. 
@@ -909,6 +914,12 @@ static inline bool pud_access_permitted(pud_t pud, bool write) return pte_access_permitted(pud_pte(pud), write); } +#define pud_user_accessible_page pud_user_accessible_page +static inline bool pud_user_accessible_page(pud_t pud, unsigned long addr) +{ + return pud_leaf(pud) && pte_user_accessible_page(pud_pte(pud), addr); +} + #define __p4d_raw(x) ((p4d_t) { __pgd_raw(x) }) static inline __be64 p4d_raw(p4d_t x) { @@ -1074,6 +1085,12 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write) return pte_access_permitted(pmd_pte(pmd), write); } +#define pmd_user_accessible_page pmd_user_accessible_page +static inline bool pmd_user_accessible_page(pmd_t pmd, unsigned long addr) +{ + return pmd_leaf(pmd) && pte_user_accessible_page(pmd_pte(pmd), addr); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 5af168b7f292..9bf3e40f27b6 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -243,6 +243,11 @@ static inline bool pte_access_permitted(pte_t pte, bool write) return true; } +static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr) +{ + return pte_present(pte) && !is_kernel_addr(addr); +} + /* Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. * diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 17fd7ff6e535..859cdbaa54a7 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -202,6 +202,14 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) #endif /* CONFIG_PPC64 */ +#ifndef pmd_user_accessible_page +#define pmd_user_accessible_page(pmd, addr) false +#endif + +#ifndef pud_user_accessible_page +#define pud_user_accessible_page(pud, addr) false +#endif + #endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ -- cgit v1.2.3 From 2360f523a49bdc021cda3cb32a6003193551e0fc Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:43 +1100 Subject: powerpc/mm: use set_pte_at_unchecked() for internal usages In the new set_ptes() API, set_pte_at() (a special case of set_ptes()) is intended to be instrumented by the page table check facility. There are however several other routines that constitute the API for setting page table entries, including set_pmd_at() among others. Such routines are themselves implemented in terms of set_ptes_at(). A future patch providing support for page table checking on powerpc must take care to avoid duplicate calls to page_table_check_p{te,md,ud}_set(). Allow for assignment of pte entries without instrumentation through the set_pte_at_unchecked() routine introduced in this patch. Cause API-facing routines that call set_pte_at() to instead call set_pte_at_unchecked(), which will remain uninstrumented by page table check. set_ptes() is itself implemented by calls to __set_pte_at(), so this eliminates redundant code. 
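Condensed, the pattern the series converges on looks roughly like this (sketch only, with the WARN/trace details elided; the page table check hook shown here is added by a later patch in the series):

void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
	/* Instrument the PMD-level set exactly once... */
	page_table_check_pmd_set(mm, addr, pmdp, pmd);
	/*
	 * ...then write the entry through the unchecked helper, so the
	 * PTE-level hook in set_pte_at() is not triggered a second time.
	 */
	set_pte_at_unchecked(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}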
[ajd@linux.ibm.com: don't change to unchecked for early boot/kernel mappings] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-11-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Acked-by: Madhavan Srinivasan Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Alistair Popple Cc: Christophe Leroy Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Pasha Tatashin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 2 ++ arch/powerpc/mm/book3s64/pgtable.c | 6 +++--- arch/powerpc/mm/book3s64/radix_pgtable.c | 6 +++--- arch/powerpc/mm/pgtable.c | 8 ++++++++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 859cdbaa54a7..dcd3a88caaf6 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -34,6 +34,8 @@ struct mm_struct; void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); #define set_ptes set_ptes +void set_pte_at_unchecked(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index e3485db7de02..97db2f42ea3d 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -127,7 +127,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, WARN_ON(!(pmd_leaf(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); + return set_pte_at_unchecked(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } void set_pud_at(struct mm_struct *mm, unsigned long addr, @@ -144,7 +144,7 @@ void set_pud_at(struct mm_struct *mm, unsigned long addr, WARN_ON(!(pud_leaf(pud))); #endif trace_hugepage_set_pud(addr, pud_val(pud)); - return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); + return set_pte_at_unchecked(mm, addr, pudp_ptep(pudp), pud_pte(pud)); } static void do_serialize(void *arg) @@ -550,7 +550,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, if (radix_enabled()) return radix__ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); - set_pte_at(vma->vm_mm, addr, ptep, pte); + set_pte_at_unchecked(vma->vm_mm, addr, ptep, pte); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 73977dbabcf2..b2541bf33d01 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1606,7 +1606,7 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, (atomic_read(&mm->context.copros) > 0)) radix__flush_tlb_page(vma, addr); - set_pte_at(mm, addr, ptep, pte); + set_pte_at_unchecked(mm, addr, ptep, pte); } int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) @@ -1617,7 +1617,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) if (!radix_enabled()) return 0; - set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud); + set_pte_at_unchecked(&init_mm, 0 /* 
radix unused */, ptep, new_pud); return 1; } @@ -1664,7 +1664,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) if (!radix_enabled()) return 0; - set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd); + set_pte_at_unchecked(&init_mm, 0 /* radix unused */, ptep, new_pmd); return 1; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 56d7e8960e77..7b69cd16e011 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -224,6 +224,14 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, } } +void set_pte_at_unchecked(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + pte = set_pte_filter(pte, addr); + __set_pte_at(mm, addr, ptep, pte, 0); +} + void unmap_kernel_page(unsigned long va) { pmd_t *pmdp = pmd_off_k(va); -- cgit v1.2.3 From 641d47d4c9635987f2329054b1395421716d3fee Mon Sep 17 00:00:00 2001 From: Rohan McLure Date: Fri, 19 Dec 2025 04:09:44 +1100 Subject: powerpc/mm: support page table check On creation and clearing of a page table mapping, instrument such calls by invoking page_table_check_pte_set and page_table_check_pte_clear respectively. These calls serve as a sanity check against illegal mappings. Enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on powerpc, except when HUGETLB_PAGE is enabled (powerpc has some weirdness in how it implements set_huge_pte_at(), which may require some further work). See also: riscv support in commit 3fee229a8eb9 ("riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK") arm64 in commit 42b2547137f5 ("arm64/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK") x86_64 in commit d283d422c6c4 ("x86: mm: add x86_64 support for page table check") [ajd@linux.ibm.com: rebase, add additional instrumentation, misc fixes] Link: https://lkml.kernel.org/r/20251219-pgtable_check_v18rebase-v18-12-755bc151a50b@linux.ibm.com Signed-off-by: Rohan McLure Signed-off-by: Andrew Donnellan Reviewed-by: Christophe Leroy Reviewed-by: Pasha Tatashin Acked-by: Madhavan Srinivasan Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Alistair Popple Cc: "Christophe Leroy (CS GROUP)" Cc: David Hildenbrand Cc: Donet Tom Cc: Guo Weikang Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Magnus Lindholm Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: Nicholas Miehlbradt Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Qi Zheng Cc: "Ritesh Harjani (IBM)" Cc: Sweet Tea Dorminy Cc: Thomas Huth Cc: "Vishal Moola (Oracle)" Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/32/pgtable.h | 7 ++++- arch/powerpc/include/asm/book3s/64/pgtable.h | 45 +++++++++++++++++++++------- arch/powerpc/include/asm/nohash/pgtable.h | 8 ++++- arch/powerpc/mm/book3s64/hash_pgtable.c | 4 +++ arch/powerpc/mm/book3s64/pgtable.c | 19 ++++++++---- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 ++ arch/powerpc/mm/pgtable.c | 4 +++ 8 files changed, 73 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9537a61ebae0..271690445a45 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -172,6 +172,7 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx + select ARCH_SUPPORTS_PAGE_TABLE_CHECK if !HUGETLB_PAGE select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_SUPPORTS_SCHED_SMT if PPC64 && SMP select SCHED_MC if ARCH_SUPPORTS_SCHED_MC diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h index 2edca1068b6f..dcbae8521830 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -202,6 +202,7 @@ void unmap_kernel_page(unsigned long va); #ifndef __ASSEMBLER__ #include #include +#include /* Bits to mask out from a PGD to get to the PUD page */ #define PGD_MASKED_BITS 0 @@ -315,7 +316,11 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - return __pte(pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, 0, 0)); + pte_t old_pte = __pte(pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, 0, 0)); + + page_table_check_pte_clear(mm, addr, old_pte); + + return old_pte; } #define __HAVE_ARCH_PTEP_SET_WRPROTECT diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 2d69a827594f..1a91762b455d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -144,6 +144,8 @@ #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) #ifndef __ASSEMBLER__ +#include + /* * page table defines */ @@ -416,8 +418,11 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); - return __pte(old); + pte_t old_pte = __pte(pte_update(mm, addr, ptep, ~0UL, 0, 0)); + + page_table_check_pte_clear(mm, addr, old_pte); + + return old_pte; } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL @@ -426,11 +431,16 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, pte_t *ptep, int full) { if (full && radix_enabled()) { + pte_t old_pte; + /* * We know that this is a full mm pte clear and * hence can be sure there is no parallel set_pte. 
*/ - return radix__ptep_get_and_clear_full(mm, addr, ptep, full); + old_pte = radix__ptep_get_and_clear_full(mm, addr, ptep, full); + page_table_check_pte_clear(mm, addr, old_pte); + + return old_pte; } return ptep_get_and_clear(mm, addr, ptep); } @@ -1301,19 +1311,34 @@ extern int pudp_test_and_clear_young(struct vm_area_struct *vma, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (radix_enabled()) - return radix__pmdp_huge_get_and_clear(mm, addr, pmdp); - return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); + pmd_t old_pmd; + + if (radix_enabled()) { + old_pmd = radix__pmdp_huge_get_and_clear(mm, addr, pmdp); + } else { + old_pmd = hash__pmdp_huge_get_and_clear(mm, addr, pmdp); + } + + page_table_check_pmd_clear(mm, addr, old_pmd); + + return old_pmd; } #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (radix_enabled()) - return radix__pudp_huge_get_and_clear(mm, addr, pudp); - BUG(); - return *pudp; + pud_t old_pud; + + if (radix_enabled()) { + old_pud = radix__pudp_huge_get_and_clear(mm, addr, pudp); + } else { + BUG(); + } + + page_table_check_pud_clear(mm, addr, old_pud); + + return old_pud; } static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 9bf3e40f27b6..e6da5eaccff6 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -29,6 +29,8 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p #ifndef __ASSEMBLER__ +#include + extern int icache_44x_need_flush; #ifndef pte_huge_size @@ -122,7 +124,11 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 0)); + pte_t old_pte = __pte(pte_update(mm, addr, ptep, ~0UL, 0, 0)); + + page_table_check_pte_clear(mm, addr, old_pte); + + return old_pte; } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 82d31177630b..ac2a24d15d2e 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -230,6 +231,9 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres pmd = *pmdp; pmd_clear(pmdp); + + page_table_check_pmd_clear(vma->vm_mm, address, pmd); + /* * Wait for all pending hash_page to finish. This is needed * in case of subpage collapse. 
When we collapse normal pages diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 97db2f42ea3d..4b09c04654a8 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, WARN_ON(!(pmd_leaf(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); + page_table_check_pmd_set(mm, addr, pmdp, pmd); return set_pte_at_unchecked(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } @@ -144,6 +146,7 @@ void set_pud_at(struct mm_struct *mm, unsigned long addr, WARN_ON(!(pud_leaf(pud))); #endif trace_hugepage_set_pud(addr, pud_val(pud)); + page_table_check_pud_set(mm, addr, pudp, pud); return set_pte_at_unchecked(mm, addr, pudp_ptep(pudp), pud_pte(pud)); } @@ -179,23 +182,27 @@ void serialize_against_pte_lookup(struct mm_struct *mm) pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - unsigned long old_pmd; + pmd_t old_pmd; VM_WARN_ON_ONCE(!pmd_present(*pmdp)); - old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); + old_pmd = __pmd(pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - return __pmd(old_pmd); + page_table_check_pmd_clear(vma->vm_mm, address, old_pmd); + + return old_pmd; } pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { - unsigned long old_pud; + pud_t old_pud; VM_WARN_ON_ONCE(!pud_present(*pudp)); - old_pud = pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID); + old_pud = __pud(pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID)); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); - return __pud(old_pud); + page_table_check_pud_clear(vma->vm_mm, address, old_pud); + + return old_pud; } pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b2541bf33d01..10aced261cff 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1474,6 +1475,8 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre pmd = *pmdp; pmd_clear(pmdp); + page_table_check_pmd_clear(vma->vm_mm, address, pmd); + radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); return pmd; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 7b69cd16e011..a9be337be3e4 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -206,6 +207,9 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, * and not hw_valid ptes. Hence there is no translation cache flush * involved that need to be batched. */ + + page_table_check_ptes_set(mm, addr, ptep, pte, nr); + for (;;) { /* -- cgit v1.2.3 From cbc064e708b687cd2dbc2b788c473e2a34e10f7c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Jan 2026 12:22:13 -0500 Subject: nodemask: propagate boolean for nodes_and{,not} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "nodemask: align nodes_and{,not} with underlying bitmap ops". 
nodes_and{,not} are void, even though the underlying bitmap_and{,not} return a boolean: true if the result bitmap is non-empty. Align the nodemask API, and simplify client code. This patch (of 3): The bitmap functions bitmap_and{,not} return a boolean depending on the emptiness of the result bitmap. The corresponding nodemask helpers ignore the returned value. Propagate the underlying bitmaps' result to nodemask users, as it simplifies user code. Link: https://lkml.kernel.org/r/20260114172217.861204-1-ynorov@nvidia.com Link: https://lkml.kernel.org/r/20260114172217.861204-2-ynorov@nvidia.com Signed-off-by: Yury Norov Reviewed-by: Gregory Price Reviewed-by: Joshua Hahn Reviewed-by: David Hildenbrand (Red Hat) Cc: Alistair Popple Cc: Byungchul Park Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Rakie Kim Cc: Rasmus Villemoes Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Vlastimil Babka Cc: Waiman Long Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index bd38648c998d..204c92462f3c 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -157,10 +157,10 @@ static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) -static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline bool __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { - bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); + return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ @@ -181,10 +181,10 @@ static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1 #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) -static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline bool __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { - bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); + return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) -- cgit v1.2.3 From 386781df63cb4d847f21dc9452d251a6d63d89b2 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Jan 2026 12:22:14 -0500 Subject: mm: use nodes_and() return value to simplify client code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit establish_demotion_targets() and kernel_migrate_pages() call nodes_empty() immediately after calling nodes_and(). Now that nodes_and() returns false if the nodemask is empty, drop the latter. 
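The caller-side effect is the usual two-step "and, then test" collapsing into a single call; a sketch of the kernel_migrate_pages() hunk shown below:

	/* Before: compute the intersection, then test the result separately. */
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	/* After: the return value already says whether the result is empty. */
	if (!nodes_and(*new, *new, task_nodes))
		goto out_put;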
Link: https://lkml.kernel.org/r/20260114172217.861204-3-ynorov@nvidia.com Signed-off-by: Yury Norov Reviewed-by: Gregory Price Reviewed-by: Joshua Hahn Acked-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Cc: Alistair Popple Cc: Byungchul Park Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Rakie Kim Cc: Rasmus Villemoes Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Waiman Long Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 3 +-- mm/mempolicy.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 20aab9c19c5e..7ec442776574 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -475,8 +475,7 @@ static void establish_demotion_targets(void) */ list_for_each_entry_reverse(memtier, &memory_tiers, list) { tier_nodes = get_memtier_nodemask(memtier); - nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); - if (!nodes_empty(tier_nodes)) { + if (nodes_and(tier_nodes, node_states[N_CPU], tier_nodes)) { /* * abstract distance below the max value of this memtier * is considered toptier. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 76da50425712..dbd48502ac24 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1909,8 +1909,7 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, } task_nodes = cpuset_mems_allowed(current); - nodes_and(*new, *new, task_nodes); - if (nodes_empty(*new)) + if (!nodes_and(*new, *new, task_nodes)) goto out_put; err = security_task_movememory(task); -- cgit v1.2.3 From 291487b753b132b382884726f45bc3ffa6ac902e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Jan 2026 12:22:15 -0500 Subject: cgroup: use nodes_and() output where appropriate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that nodes_and() returns true if the result nodemask is not empty, drop useless nodes_intersects() in guarantee_online_mems() and nodes_empty() in update_nodemasks_hier(), which both are O(N). 
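In guarantee_online_mems() the change is slightly different in character: the intersection is now computed into *pmask as part of the loop condition, so the walk up the cpuset hierarchy and the final mask computation happen in one place; a rough sketch of the hunk below:

static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	/* Walk up until an ancestor contributes at least one node with memory. */
	while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	/* *pmask now holds the non-empty intersection from the last pass. */
}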
Link: https://lkml.kernel.org/r/20260114172217.861204-4-ynorov@nvidia.com Signed-off-by: Yury Norov Reviewed-by: Gregory Price Reviewed-by: Joshua Hahn Acked-by: Tejun Heo Cc: Alistair Popple Cc: Byungchul Park Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Rakie Kim Cc: Rasmus Villemoes Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Waiman Long Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/cgroup/cpuset.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c06e2e96f79d..99cf37e7d491 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -453,9 +453,8 @@ static void guarantee_active_cpus(struct task_struct *tsk, */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) + while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /** @@ -2859,13 +2858,13 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); - nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (is_in_v2_mode() && nodes_empty(*new_mems)) + if (is_in_v2_mode() && !has_mems) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ -- cgit v1.2.3 From 4262c53236977de3ceaa3bf2aefdf772c9b874dd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 15 Jan 2026 07:20:41 -0800 Subject: mm/damon/core: implement damon_kdamond_pid() Patch series "mm/damon: hide kdamond and kdamond_lock from API callers". The 'kdamond' and 'kdamond_lock' fields were initially exposed to DAMON API callers for flexible synchronization and use cases. As the DAMON API became somewhat complicated compared to the early days, keeping those exposed could only encourage the API callers to invent more creative but complicated and difficult-to-debug use cases. Fortunately, DAMON API callers didn't invent that many creative use cases. There are only two use cases of 'kdamond' and 'kdamond_lock': finding whether the kdamond is actively running, and getting the pid of the kdamond. For the first use case, a dedicated API function, namely 'damon_is_running()', is provided, and all DAMON API callers are using it. Hence the second use case is the only place where the fields are directly used by DAMON API callers. To prevent future invention of complicated and erroneous use cases of the fields, hide the fields from the API callers. For that, provide a new dedicated DAMON API function for the remaining use case, namely damon_kdamond_pid(), migrate DAMON API callers to use the new function, and mark the fields as private. This patch (of 5): 'kdamond' and 'kdamond_lock' are directly being used by DAMON API callers for getting the pid of the corresponding kdamond. To discourage invention of creative but complicated and erroneous new usages of the fields that require careful synchronization, implement a new API function that can simply be used without manual synchronization. 
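For callers, the intended pattern is simply the following (sketch, matching the lru_sort/reclaim conversions later in the series):

	int pid = damon_kdamond_pid(ctx);

	if (pid < 0)
		/* No running kdamond; pid holds a negative error code. */
		return pid;
	/* ... use pid ... */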
Link: https://lkml.kernel.org/r/20260115152047.68415-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260115152047.68415-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 26fb8e90dff6..5b7ea7082134 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -972,6 +972,7 @@ bool damon_initialized(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); bool damon_is_running(struct damon_ctx *ctx); +int damon_kdamond_pid(struct damon_ctx *ctx); int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); diff --git a/mm/damon/core.c b/mm/damon/core.c index 729a5f7fac94..81b998d32074 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1442,6 +1442,23 @@ bool damon_is_running(struct damon_ctx *ctx) return running; } +/** + * damon_kdamond_pid() - Return pid of a given DAMON context's worker thread. + * @ctx: The DAMON context of the question. + * + * Return: pid if @ctx is running, negative error code otherwise. + */ +int damon_kdamond_pid(struct damon_ctx *ctx) +{ + int pid = -EINVAL; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + pid = ctx->kdamond->pid; + mutex_unlock(&ctx->kdamond_lock); + return pid; +} + /* * damon_call_handle_inactive_ctx() - handle DAMON call request that added to * an inactive context. -- cgit v1.2.3 From f54b51ce31976b35c5aac239a3b59687196d6b9d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 15 Jan 2026 07:20:42 -0800 Subject: mm/damon/sysfs: use damon_kdamond_pid() DAMON sysfs interface directly uses damon_ctx->kdamond field with manual synchronization using damon_ctx->kdamond_lock, to get the pid of the kdamond. Use a new dedicated function for the purpose, namely damon_kdamond_pid(), since that doesn't require manual and error-prone synchronization. Avoid use of kdamond_lock outside of the core. Link: https://lkml.kernel.org/r/20260115152047.68415-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 95fd9375a7d8..4de25708b05a 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1819,10 +1819,9 @@ static ssize_t pid_show(struct kobject *kobj, if (!ctx) goto out; - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - pid = ctx->kdamond->pid; - mutex_unlock(&ctx->kdamond_lock); + pid = damon_kdamond_pid(ctx); + if (pid < 0) + pid = -1; out: mutex_unlock(&damon_sysfs_lock); return sysfs_emit(buf, "%d\n", pid); -- cgit v1.2.3 From 306550f0a5817d271361aa010fd245a4b43af725 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 15 Jan 2026 07:20:43 -0800 Subject: mm/damon/lru_sort: use damon_kdamond_pid() DAMON_LRU_SORT directly uses damon_ctx->kdamond field with manual synchronization using damon_ctx->kdamond_lock, to get the pid of the kdamond. Use a new dedicated function for the purpose, namely damon_kdamond_pid(), since that doesn't require manual and error-prone synchronization. 
Link: https://lkml.kernel.org/r/20260115152047.68415-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8296f984b428..bedb9134d286 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -405,7 +405,9 @@ static int damon_lru_sort_turn(bool on) err = damon_start(&ctx, 1, true); if (err) return err; - kdamond_pid = ctx->kdamond->pid; + kdamond_pid = damon_kdamond_pid(ctx); + if (kdamond_pid < 0) + return kdamond_pid; return damon_call(ctx, &call_control); } -- cgit v1.2.3 From 33402229d28d837ceb4c8bcebc96dc509d9203f9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 15 Jan 2026 07:20:44 -0800 Subject: mm/damon/reclaim: use damon_kdamond_pid() DAMON_RECLAIM directly uses damon_ctx->kdamond field with manual synchronization using damon_ctx->kdamond_lock, to get the pid of the kdamond. Use a new dedicated function for the purpose, namely damon_kdamond_pid(), since that doesn't require manual and error-prone synchronization. Link: https://lkml.kernel.org/r/20260115152047.68415-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 8463a5a5032f..55df43e241c5 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -307,7 +307,9 @@ static int damon_reclaim_turn(bool on) err = damon_start(&ctx, 1, true); if (err) return err; - kdamond_pid = ctx->kdamond->pid; + kdamond_pid = damon_kdamond_pid(ctx); + if (kdamond_pid < 0) + return kdamond_pid; return damon_call(ctx, &call_control); } -- cgit v1.2.3 From 6fe0e6d599a6bb4b65704285d40d4972423b7aaa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 15 Jan 2026 07:20:45 -0800 Subject: mm/damon: hide kdamond and kdamond_lock of damon_ctx There is no DAMON API caller that directly access 'kdamond' and 'kdamond_lock' fields of 'struct damon_ctx'. Keeping those exposed could only encourage creative but error-prone usages. Hide them from DAMON API callers by marking those as private fields. Link: https://lkml.kernel.org/r/20260115152047.68415-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5b7ea7082134..e6930d8574d3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -759,23 +759,20 @@ struct damon_attrs { * of the monitoring. * * @attrs: Monitoring attributes for accuracy/overhead control. - * @kdamond: Kernel thread who does the monitoring. - * @kdamond_lock: Mutex for the synchronizations with @kdamond. * - * For each monitoring context, one kernel thread for the monitoring is - * created. The pointer to the thread is stored in @kdamond. + * For each monitoring context, one kernel thread for the monitoring, namely + * kdamond, is created. The pid of kdamond can be retrieved using + * damon_kdamond_pid(). * - * Once started, the monitoring thread runs until explicitly required to be - * terminated or every monitoring target is invalid. The validity of the - * targets is checked via the &damon_operations.target_valid of @ops. The - * termination can also be explicitly requested by calling damon_stop(). - * The thread sets @kdamond to NULL when it terminates. 
Therefore, users can - * know whether the monitoring is ongoing or terminated by reading @kdamond. - * Reads and writes to @kdamond from outside of the monitoring thread must - * be protected by @kdamond_lock. + * Once started, kdamond runs until explicitly required to be terminated or + * every monitoring target is invalid. The validity of the targets is checked + * via the &damon_operations.target_valid of @ops. The termination can also be + * explicitly requested by calling damon_stop(). To know if a kdamond is + * running, damon_is_running() can be used. * - * Note that the monitoring thread protects only @kdamond via @kdamond_lock. - * Accesses to other fields must be protected by themselves. + * While the kdamond is running, all accesses to &struct damon_ctx from a + * thread other than the kdamond should be made using safe DAMON APIs, + * including damon_call() and damos_walk(). * * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. @@ -816,10 +813,12 @@ struct damon_ctx { struct damos_walk_control *walk_control; struct mutex walk_control_lock; -/* public: */ + /* Working thread of the given DAMON context */ struct task_struct *kdamond; + /* Protects @kdamond field access */ struct mutex kdamond_lock; +/* public: */ struct damon_operations ops; unsigned long addr_unit; unsigned long min_sz_region; -- cgit v1.2.3 From 7832e4d583ee7c6a7907731c568ca40b160d8a5e Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 18 Jan 2026 19:22:53 +0000 Subject: mm/khugepaged: remove unnecessary goto 'skip' label Patch series "mm/khugepaged: cleanups and scan limit fix", v3. This series contains several cleanups for mm/khugepaged.c to improve code readability and type safety, and one functional fix to ensure khugepaged_scan_mm_slot() correctly accounts for small VMAs towards scan limit. This patch (of 4): Replace goto skip with actual logic for better code readability. No functional change. Link: https://lkml.kernel.org/r/20260118192253.9263-4-shivankg@amd.com Link: https://lkml.kernel.org/r/20260118192253.9263-6-shivankg@amd.com Signed-off-by: Shivank Garg Reviewed-by: Liam R. Howlett Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Tested-by: Nico Pache Reviewed-by: Nico Pache Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Barry Song Cc: Lorenzo Stoakes Cc: Ryan Roberts Cc: Anshuman Khandual Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/khugepaged.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 16582bdcb6ff..984294a16861 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2442,14 +2442,15 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { -skip: progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); - if (khugepaged_scan.address > hend) - goto skip; + if (khugepaged_scan.address > hend) { + progress++; + continue; + } if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); -- cgit v1.2.3 From 3ab981c1fca08721a2cc100d4e097d4e0c9e149b Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 18 Jan 2026 19:22:57 +0000 Subject: mm/khugepaged: change collapse_pte_mapped_thp() to return void The only external caller of collapse_pte_mapped_thp() is uprobe, which ignores the return value. 
Change the external API to return void to simplify the interface. Introduce try_collapse_pte_mapped_thp() for internal use that preserves the return value. This prepares for future patch that will convert the return type to use enum scan_result. Link: https://lkml.kernel.org/r/20260118192253.9263-10-shivankg@amd.com Signed-off-by: Shivank Garg Suggested-by: David Hildenbrand (Red Hat) Acked-by: Lance Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Zi Yan Tested-by: Nico Pache Reviewed-by: Nico Pache Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 9 ++++----- mm/khugepaged.c | 40 +++++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index eb1946a70cff..d7a9053ff4fe 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -17,8 +17,8 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); -extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, - bool install_pmd); +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd); static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { @@ -42,10 +42,9 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { } -static inline int collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr, bool install_pmd) +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { - return 0; } static inline void khugepaged_min_free_kbytes_update(void) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 984294a16861..d513375b4f39 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1477,20 +1477,8 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, return SCAN_SUCCEED; } -/** - * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at - * address haddr. - * - * @mm: process address space where collapse happens - * @addr: THP collapse address - * @install_pmd: If a huge PMD should be installed - * - * This function checks whether all the PTEs in the PMD are pointing to the - * right THP. If so, retract the page table so the THP can refault in with - * as pmd-mapped. Possibly install a huge PMD mapping the THP. - */ -int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, - bool install_pmd) +static int try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) { int nr_mapped_ptes = 0, result = SCAN_FAIL; unsigned int nr_batch_ptes; @@ -1711,6 +1699,24 @@ drop_folio: return result; } +/** + * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at + * address haddr. + * + * @mm: process address space where collapse happens + * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed + * + * This function checks whether all the PTEs in the PMD are pointing to the + * right THP. If so, retract the page table so the THP can refault in with + * as pmd-mapped. Possibly install a huge PMD mapping the THP. 
+ */ +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) +{ + try_collapse_pte_mapped_thp(mm, addr, install_pmd); +} + /* Can we retract page tables for this file-backed VMA? */ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) { @@ -2227,7 +2233,7 @@ immap_locked: /* * Remove pte page tables, so we can re-fault the page as huge. - * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). + * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp(). */ retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) @@ -2479,7 +2485,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, mmap_read_lock(mm); if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; - *result = collapse_pte_mapped_thp(mm, + *result = try_collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); if (*result == SCAN_PMD_MAPPED) *result = SCAN_SUCCEED; @@ -2844,7 +2850,7 @@ handle_result: case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); mmap_read_lock(mm); - result = collapse_pte_mapped_thp(mm, addr, true); + result = try_collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ -- cgit v1.2.3 From 40bd4ff090685746459b05f59f00049ff58493e8 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 18 Jan 2026 19:22:59 +0000 Subject: mm/khugepaged: use enum scan_result for result variables and return types Convert result variables and return types from int to enum scan_result throughout khugepaged code. This improves type safety and code clarity by making the intent explicit. No functional change. Link: https://lkml.kernel.org/r/20260118192253.9263-12-shivankg@amd.com Signed-off-by: Shivank Garg Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Red Hat) Tested-by: Nico Pache Reviewed-by: Nico Pache Reviewed-by: Dev Jain Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Lance Yang Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/khugepaged.c | 99 +++++++++++++++++++++++++++------------------------------ 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d513375b4f39..c0a29a85230a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -537,17 +537,16 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } -static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long start_addr, - pte_t *pte, - struct collapse_control *cc, - struct list_head *compound_pagelist) +static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long start_addr, pte_t *pte, struct collapse_control *cc, + struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; unsigned long addr = start_addr; pte_t *_pte; - int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; + int none_or_zero = 0, shared = 0, referenced = 0; + enum scan_result result = SCAN_FAIL; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { @@ -780,13 +779,13 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ -static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, +static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned int i; - int result = SCAN_SUCCEED; + enum scan_result result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. @@ -898,10 +897,8 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) * Returns enum scan_result value. */ -static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - bool expect_anon, - struct vm_area_struct **vmap, - struct collapse_control *cc) +static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { struct vm_area_struct *vma; enum tva_type type = cc->is_khugepaged ? 
TVA_KHUGEPAGED : @@ -930,7 +927,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -static inline int check_pmd_state(pmd_t *pmd) +static inline enum scan_result check_pmd_state(pmd_t *pmd) { pmd_t pmde = pmdp_get_lockless(pmd); @@ -953,9 +950,8 @@ static inline int check_pmd_state(pmd_t *pmd) return SCAN_SUCCEED; } -static int find_pmd_or_thp_or_none(struct mm_struct *mm, - unsigned long address, - pmd_t **pmd) +static enum scan_result find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, pmd_t **pmd) { *pmd = mm_find_pmd(mm, address); if (!*pmd) @@ -964,12 +960,11 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, return check_pmd_state(*pmd); } -static int check_pmd_still_valid(struct mm_struct *mm, - unsigned long address, - pmd_t *pmd) +static enum scan_result check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, pmd_t *pmd) { pmd_t *new_pmd; - int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + enum scan_result result = find_pmd_or_thp_or_none(mm, address, &new_pmd); if (result != SCAN_SUCCEED) return result; @@ -985,15 +980,14 @@ static int check_pmd_still_valid(struct mm_struct *mm, * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ -static int __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start_addr, pmd_t *pmd, - int referenced) +static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd, + int referenced) { int swapped_in = 0; vm_fault_t ret = 0; unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); - int result; + enum scan_result result; pte_t *pte = NULL; spinlock_t *ptl; @@ -1062,8 +1056,8 @@ out: return result; } -static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, - struct collapse_control *cc) +static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, + struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? 
alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); @@ -1090,9 +1084,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, return SCAN_SUCCEED; } -static int collapse_huge_page(struct mm_struct *mm, unsigned long address, - int referenced, int unmapped, - struct collapse_control *cc) +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; @@ -1100,7 +1093,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pgtable_t pgtable; struct folio *folio; spinlock_t *pmd_ptl, *pte_ptl; - int result = SCAN_FAIL; + enum scan_result result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; @@ -1246,15 +1239,14 @@ out_nolock: return result; } -static int hpage_collapse_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start_addr, bool *mmap_locked, - struct collapse_control *cc) +static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int result = SCAN_FAIL, referenced = 0; - int none_or_zero = 0, shared = 0; + int none_or_zero = 0, shared = 0, referenced = 0; + enum scan_result result = SCAN_FAIL; struct page *page = NULL; struct folio *folio = NULL; unsigned long addr; @@ -1441,8 +1433,8 @@ static void collect_mm_slot(struct mm_slot *slot) } /* folio must be locked, and mmap_lock must be held */ -static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmdp, struct folio *folio, struct page *page) +static enum scan_result set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct folio *folio, struct page *page) { struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { @@ -1477,10 +1469,11 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, return SCAN_SUCCEED; } -static int try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, +static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { - int nr_mapped_ptes = 0, result = SCAN_FAIL; + enum scan_result result = SCAN_FAIL; + int nr_mapped_ptes = 0; unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; @@ -1862,9 +1855,8 @@ drop_pml: * + unlock old pages * + unlock and free huge page; */ -static int collapse_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) +static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *dst; @@ -1872,7 +1864,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); - int nr_none = 0, result = SCAN_SUCCEED; + enum scan_result result = SCAN_SUCCEED; + int nr_none = 0; bool is_shmem = shmem_file(file); VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); @@ -2293,16 +2286,15 @@ out: return result; } -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) +static enum scan_result 
hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; - int result = SCAN_SUCCEED; + enum scan_result result = SCAN_SUCCEED; present = 0; swap = 0; @@ -2400,7 +2392,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, return result; } -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) @@ -2561,7 +2553,7 @@ static void khugepaged_do_scan(struct collapse_control *cc) unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; - int result = SCAN_SUCCEED; + enum scan_result result = SCAN_SUCCEED; lru_add_drain_all(); @@ -2774,7 +2766,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; unsigned long hstart, hend, addr; - int thps = 0, last_fail = SCAN_FAIL; + enum scan_result last_fail = SCAN_FAIL; + int thps = 0; bool mmap_locked = true; BUG_ON(vma->vm_start > start); @@ -2795,7 +2788,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, hend = end & HPAGE_PMD_MASK; for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { - int result = SCAN_FAIL; + enum scan_result result = SCAN_FAIL; bool triggered_wb = false; retry: -- cgit v1.2.3 From 9c284c91b08e9a669e9e9657814be9ff49310fa2 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 18 Jan 2026 19:23:01 +0000 Subject: mm/khugepaged: make khugepaged_collapse_control static The global variable 'khugepaged_collapse_control' is not used outside of mm/khugepaged.c. Make it static to limit its scope. Link: https://lkml.kernel.org/r/20260118192253.9263-14-shivankg@amd.com Signed-off-by: Shivank Garg Reviewed-by: Wei Yang Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Anshuman Khandual Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Barry Song Cc: Lance Yang Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c0a29a85230a..3ba6dcea5993 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -827,7 +827,7 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } -struct collapse_control khugepaged_collapse_control = { +static struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; -- cgit v1.2.3 From 6e31add91a10e3804f020ec4e87cb9c3b2b6c3ec Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:09 +0100 Subject: vmw_balloon: adjust BALLOON_DEFLATE when deflating while migrating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: balloon infrastructure cleanups", v3. I started with wanting to remove the dependency of the balloon infrastructure on the page lock, but ended up performing various other cleanups, some of which I had on my todo list for years. 
This series heavily cleans up and simplifies our balloon infrastructure, including our balloon page migration functionality. With this series, we no longer make use of the page lock for PageOffline pages as part of the balloon infrastructure (preparing for memdescs where PageOffline pages won't have any such lock), and simplify migration handling such that refcounting can more easily be adjusted later (long-term focus is for PageOffline pages to not have a refcount either). Plenty of related cleanups. This patch (of 24): When we're effectively deflating the balloon while migrating a page because inflating the new page failed, we're not adjusting BALLOON_DEFLATE. Let's do that. This is a preparation for factoring out this handling to the core code, making it work in a similar way first. As this (deflating while migrating because of inflation error) is a corner case that I don't really expect to happen in practice and the stats are not that crucial, this likely doesn't classify as a fix. Link: https://lkml.kernel.org/r/20260119230133.3551867-1-david@kernel.org Link: https://lkml.kernel.org/r/20260119230133.3551867-2-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/misc/vmw_balloon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index cc1d18b3df5c..2cc34c4968fa 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1818,6 +1818,8 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, if (status == VMW_BALLOON_SUCCESS) { balloon_page_insert(&b->b_dev_info, newpage); __count_vm_event(BALLOON_MIGRATE); + } else { + __count_vm_event(BALLOON_DEFLATE); } /* -- cgit v1.2.3 From d2346b09c51574fd6c281e3b8092116df1e42f81 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:10 +0100 Subject: vmw_balloon: remove vmballoon_compaction_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that there is not a lot of logic left, let's just inline setting up the migration function and drop all these excessive comments that are not really required (or true) anymore. To avoid #ifdef in the caller we can instead use IS_ENABLED() and make the compiler happy by only providing the function declaration. Link: https://lkml.kernel.org/r/20260119230133.3551867-3-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S.
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/misc/vmw_balloon.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 2cc34c4968fa..07e60a4b846a 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1833,27 +1833,10 @@ out_unlock: up_read(&b->conf_sem); return ret; } - -/** - * vmballoon_compaction_init() - initialized compaction for the balloon. - * - * @b: pointer to the balloon. - * - * If during the initialization a failure occurred, this function does not - * perform cleanup. The caller must call vmballoon_compaction_deinit() in this - * case. - * - * Return: zero on success or error code on failure. - */ -static __init void vmballoon_compaction_init(struct vmballoon *b) -{ - b->b_dev_info.migratepage = vmballoon_migratepage; -} - #else /* CONFIG_BALLOON_COMPACTION */ -static inline void vmballoon_compaction_init(struct vmballoon *b) -{ -} +int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, + struct page *newpage, struct page *page, + enum migrate_mode mode); #endif /* CONFIG_BALLOON_COMPACTION */ static int __init vmballoon_init(void) @@ -1873,12 +1856,9 @@ static int __init vmballoon_init(void) if (error) return error; - /* - * Initialization of compaction must be done after the call to - * balloon_devinfo_init() . - */ balloon_devinfo_init(&balloon.b_dev_info); - vmballoon_compaction_init(&balloon); + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + balloon.b_dev_info.migratepage = vmballoon_migratepage; INIT_LIST_HEAD(&balloon.huge_pages); spin_lock_init(&balloon.comm_lock); -- cgit v1.2.3 From 5b3342cbf0f4493fa955675df79f9d10ab778662 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:11 +0100 Subject: powerpc/pseries/cmm: remove cmm_balloon_compaction_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that there is not a lot of logic left, let's just inline setting up the migration function. To avoid #ifdef in the caller we can instead use IS_ENABLED() and make the compiler happy by only providing the function declaration. Now that the function is gone, drop the "out_balloon_compaction" label. Note that before commit 68f2736a8583 ("mm: Convert all PageMovable users to movable_operations") we actually had to undo something, now not anymore. Link: https://lkml.kernel.org/r/20260119230133.3551867-4-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/cmm.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 4cbbe2ee58ab..9a6efbc80d2a 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -548,15 +548,9 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, return 0; } - -static void cmm_balloon_compaction_init(void) -{ - b_dev_info.migratepage = cmm_migratepage; -} #else /* CONFIG_BALLOON_COMPACTION */ -static void cmm_balloon_compaction_init(void) -{ -} +int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* CONFIG_BALLOON_COMPACTION */ /** @@ -573,11 +567,12 @@ static int cmm_init(void) return -EOPNOTSUPP; balloon_devinfo_init(&b_dev_info); - cmm_balloon_compaction_init(); + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + b_dev_info.migratepage = cmm_migratepage; rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) - goto out_balloon_compaction; + return rc; if ((rc = register_reboot_notifier(&cmm_reboot_nb))) goto out_oom_notifier; @@ -606,7 +601,6 @@ out_reboot_notifier: unregister_reboot_notifier(&cmm_reboot_nb); out_oom_notifier: unregister_oom_notifier(&cmm_oom_nb); -out_balloon_compaction: return rc; } -- cgit v1.2.3 From 6af05dfe9af7df4756494e460289fc9a9d2fc531 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:12 +0100 Subject: mm/balloon_compaction: improve comments for WARN_ON_ONCE(!b_dev_info) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's clarify a bit by extending the comments. Link: https://lkml.kernel.org/r/20260119230133.3551867-5-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michael S. Tsirkin Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/balloon_compaction.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 03c5dbabb156..85eea88cea08 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -222,7 +222,11 @@ static void balloon_page_putback(struct page *page) struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; - /* Isolated balloon pages cannot get deflated. */ + /* + * When we isolated the page, the page was still inflated in a balloon + * device. As isolated balloon pages cannot get deflated, we still have + * a balloon device here. 
+ */ if (WARN_ON_ONCE(!b_dev_info)) return; @@ -241,7 +245,11 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - /* Isolated balloon pages cannot get deflated. */ + /* + * When we isolated the page, the page was still inflated in a balloon + * device. As isolated balloon pages cannot get deflated, we still have + * a balloon device here. + */ if (WARN_ON_ONCE(!balloon)) return -EAGAIN; -- cgit v1.2.3 From 1258460bd31ed6e0d504adeaf9df7e6c1b348d14 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:13 +0100 Subject: mm/balloon_compaction: centralize basic page migration handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's update the balloon page references, the balloon page list, the BALLOON_MIGRATE counter and the isolated-pages counter in balloon_page_migrate(), after letting the balloon->migratepage() callback deal with the actual inflation+deflation. Note that we now perform the balloon list modifications outside of any implementation-specific locks: which is fine, there is nothing special about these page actions that the lock would be protecting. The old page is already no longer in the list (isolated) and the new page is not yet in the list. Let's use -ENOENT to communicate the special "inflation of new page failed after already deflating the old page" to balloon_page_migrate() so it can handle it accordingly. While at it, rename balloon->b_dev_info to make it match the other functions. Also, drop the comment above balloon_page_migrate(), which seems unnecessary. Link: https://lkml.kernel.org/r/20260119230133.3551867-6-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/cmm.c | 16 ------------ drivers/misc/vmw_balloon.c | 49 ++++++------------------------------ drivers/virtio/virtio_balloon.c | 12 --------- mm/balloon_compaction.c | 31 ++++++++++++++++++++--- 4 files changed, 35 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 9a6efbc80d2a..15f873f733a4 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -501,8 +501,6 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) { - unsigned long flags; - /* * loan/"inflate" the newpage first. 
* @@ -517,9 +515,6 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, return -EBUSY; } - /* balloon page list reference */ - get_page(newpage); - /* * When we migrate a page to a different zone, we have to fixup the * count of both involved zones as we adjusted the managed page count @@ -530,22 +525,11 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, adjust_managed_page_count(newpage, -1); } - spin_lock_irqsave(&b_dev_info->pages_lock, flags); - balloon_page_insert(b_dev_info, newpage); - __count_vm_event(BALLOON_MIGRATE); - b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); - /* * activate/"deflate" the old page. We ignore any errors just like the * other callers. */ plpar_page_set_active(page); - - balloon_page_finalize(page); - /* balloon page list reference */ - put_page(page); - return 0; } #else /* CONFIG_BALLOON_COMPACTION */ diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 07e60a4b846a..52b8c0f1eead 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1724,18 +1724,17 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b) * @page: a ballooned page that should be migrated. * @mode: migration mode, ignored. * - * This function is really open-coded, but that is according to the interface - * that balloon_compaction provides. - * * Return: zero on success, -EAGAIN when migration cannot be performed - * momentarily, and -EBUSY if migration failed and should be retried - * with that specific page. + * momentarily, -EBUSY if migration failed and should be retried + * with that specific page, and -ENOENT when deflating @page + * succeeded but inflating @newpage failed, effectively deflating + * the balloon. */ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) { - unsigned long status, flags; + unsigned long status; struct vmballoon *b; int ret = 0; @@ -1773,14 +1772,6 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, goto out_unlock; } - /* - * The page is isolated, so it is safe to delete it without holding - * @pages_lock . We keep holding @comm_lock since we will need it in a - * second. - */ - balloon_page_finalize(page); - put_page(page); - /* Inflate */ vmballoon_add_page(b, 0, newpage); status = vmballoon_lock_op(b, 1, VMW_BALLOON_4K_PAGE, @@ -1799,36 +1790,12 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, * change. */ atomic64_dec(&b->size); - } else { /* - * Success. Take a reference for the page, and we will add it to - * the list after acquiring the lock. + * Tell the core that we're deflating the old page and don't + * need the new page. */ - get_page(newpage); - } - - /* Update the balloon list under the @pages_lock */ - spin_lock_irqsave(&b->b_dev_info.pages_lock, flags); - - /* - * On inflation success, we already took a reference for the @newpage. - * If we succeed just insert it to the list and update the statistics - * under the lock. - */ - if (status == VMW_BALLOON_SUCCESS) { - balloon_page_insert(&b->b_dev_info, newpage); - __count_vm_event(BALLOON_MIGRATE); - } else { - __count_vm_event(BALLOON_DEFLATE); + ret = -ENOENT; } - - /* - * We deflated successfully, so regardless to the inflation success, we - * need to reduce the number of isolated_pages. 
- */ - b->b_dev_info.isolated_pages--; - spin_unlock_irqrestore(&b->b_dev_info.pages_lock, flags); - out_unlock: up_read(&b->conf_sem); return ret; diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 74fe59f5a78c..df2756c071da 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -827,7 +827,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, { struct virtio_balloon *vb = container_of(vb_dev_info, struct virtio_balloon, vb_dev_info); - unsigned long flags; /* * In order to avoid lock contention while migrating pages concurrently @@ -840,8 +839,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, if (!mutex_trylock(&vb->balloon_lock)) return -EAGAIN; - get_page(newpage); /* balloon reference */ - /* * When we migrate a page to a different zone and adjusted the * managed page count when inflating, we have to fixup the count of @@ -854,11 +851,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, } /* balloon's page migration 1st step -- inflate "newpage" */ - spin_lock_irqsave(&vb_dev_info->pages_lock, flags); - balloon_page_insert(vb_dev_info, newpage); - vb_dev_info->isolated_pages--; - __count_vm_event(BALLOON_MIGRATE); - spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; set_page_pfns(vb, vb->pfns, newpage); tell_host(vb, vb->inflate_vq); @@ -869,10 +861,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, tell_host(vb, vb->deflate_vq); mutex_unlock(&vb->balloon_lock); - - balloon_page_finalize(page); - put_page(page); /* balloon reference */ - return 0; } #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 85eea88cea08..764fa25dc4bd 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -236,11 +236,12 @@ static void balloon_page_putback(struct page *page) spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } -/* move_to_new_page() counterpart for a ballooned page */ static int balloon_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { - struct balloon_dev_info *balloon = balloon_page_device(page); + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + int rc; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); @@ -250,10 +251,32 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, * device. As isolated balloon pages cannot get deflated, we still have * a balloon device here. */ - if (WARN_ON_ONCE(!balloon)) + if (WARN_ON_ONCE(!b_dev_info)) return -EAGAIN; - return balloon->migratepage(balloon, newpage, page, mode); + rc = b_dev_info->migratepage(b_dev_info, newpage, page, mode); + if (rc < 0 && rc != -ENOENT) + return rc; + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + if (!rc) { + /* Insert the new page into the balloon list. */ + get_page(newpage); + balloon_page_insert(b_dev_info, newpage); + __count_vm_event(BALLOON_MIGRATE); + } else { + /* Old page was deflated but new page not inflated. */ + __count_vm_event(BALLOON_DEFLATE); + } + + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + + /* Free the now-deflated page we isolated in balloon_page_isolate(). 
*/ + balloon_page_finalize(page); + put_page(page); + + return 0; } const struct movable_operations balloon_mops = { -- cgit v1.2.3 From a00de9ba30aa71fe68ab45a9d2df595a7c39dd74 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:14 +0100 Subject: mm/balloon_compaction: centralize adjust_managed_page_count() handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's centralize it, by allowing for the driver to enable this handling through a new flag (bool for now) in the balloon device info. Note that we now adjust the counter when adding/removing a page into the balloon list: when removing a page to deflate it, it will now happen before the driver communicated with hypervisor, not afterwards. This shouldn't make a difference in practice. Link: https://lkml.kernel.org/r/20260119230133.3551867-7-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Acked-by: Liam R. Howlett Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/cmm.c | 13 +------------ drivers/virtio/virtio_balloon.c | 19 ++----------------- include/linux/balloon_compaction.h | 2 ++ mm/balloon_compaction.c | 17 +++++++++++++++++ 4 files changed, 22 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 15f873f733a4..7fd8b3d7e763 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -165,7 +165,6 @@ static long cmm_alloc_pages(long nr) balloon_page_enqueue(&b_dev_info, page); atomic_long_inc(&loaned_pages); - adjust_managed_page_count(page, -1); nr--; } @@ -190,7 +189,6 @@ static long cmm_free_pages(long nr) if (!page) break; plpar_page_set_active(page); - adjust_managed_page_count(page, 1); __free_page(page); atomic_long_dec(&loaned_pages); nr--; @@ -515,16 +513,6 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, return -EBUSY; } - /* - * When we migrate a page to a different zone, we have to fixup the - * count of both involved zones as we adjusted the managed page count - * when inflating. - */ - if (page_zone(page) != page_zone(newpage)) { - adjust_managed_page_count(page, 1); - adjust_managed_page_count(newpage, -1); - } - /* * activate/"deflate" the old page. We ignore any errors just like the * other callers. 
@@ -551,6 +539,7 @@ static int cmm_init(void) return -EOPNOTSUPP; balloon_devinfo_init(&b_dev_info); + b_dev_info.adjust_managed_page_count = true; if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) b_dev_info.migratepage = cmm_migratepage; diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index df2756c071da..15c1cf5fd249 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -274,9 +274,6 @@ static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num) set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; - if (!virtio_has_feature(vb->vdev, - VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) - adjust_managed_page_count(page, -1); vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; } @@ -295,9 +292,6 @@ static void release_pages_balloon(struct virtio_balloon *vb, struct page *page, *next; list_for_each_entry_safe(page, next, pages, lru) { - if (!virtio_has_feature(vb->vdev, - VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) - adjust_managed_page_count(page, 1); list_del(&page->lru); put_page(page); /* balloon reference */ } @@ -839,17 +833,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, if (!mutex_trylock(&vb->balloon_lock)) return -EAGAIN; - /* - * When we migrate a page to a different zone and adjusted the - * managed page count when inflating, we have to fixup the count of - * both involved zones. - */ - if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) && - page_zone(page) != page_zone(newpage)) { - adjust_managed_page_count(page, 1); - adjust_managed_page_count(newpage, -1); - } - /* balloon's page migration 1st step -- inflate "newpage" */ vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; set_page_pfns(vb, vb->pfns, newpage); @@ -958,6 +941,8 @@ static int virtballoon_probe(struct virtio_device *vdev) if (err) goto out_free_vb; + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + vb->vb_dev_info.adjust_managed_page_count = true; #ifdef CONFIG_BALLOON_COMPACTION vb->vb_dev_info.migratepage = virtballoon_migratepage; #endif diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 7cfe48769239..3109d3c43d30 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -56,6 +56,7 @@ struct balloon_dev_info { struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); + bool adjust_managed_page_count; }; extern struct page *balloon_page_alloc(void); @@ -73,6 +74,7 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; + balloon->adjust_managed_page_count = false; } #ifdef CONFIG_BALLOON_COMPACTION diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 764fa25dc4bd..4fe2a0cff69e 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -23,6 +23,8 @@ static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, BUG_ON(!trylock_page(page)); balloon_page_insert(b_dev_info, page); unlock_page(page); + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, -1); __count_vm_event(BALLOON_INFLATE); inc_node_page_state(page, NR_BALLOON_PAGES); } @@ -95,6 +97,8 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, continue; list_del(&page->lru); + if (b_dev_info->adjust_managed_page_count) + 
adjust_managed_page_count(page, 1); balloon_page_finalize(page); __count_vm_event(BALLOON_DEFLATE); list_add(&page->lru, pages); @@ -264,9 +268,22 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, get_page(newpage); balloon_page_insert(b_dev_info, newpage); __count_vm_event(BALLOON_MIGRATE); + + if (b_dev_info->adjust_managed_page_count && + page_zone(page) != page_zone(newpage)) { + /* + * When we migrate a page to a different zone we + * have to fixup the count of both involved zones. + */ + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } } else { /* Old page was deflated but new page not inflated. */ __count_vm_event(BALLOON_DEFLATE); + + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, 1); } b_dev_info->isolated_pages--; -- cgit v1.2.3 From c33b47c334f933d846cadf7c2cff24433e5b3bb0 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:15 +0100 Subject: vmw_balloon: stop using the balloon_dev_info lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's not piggy-back on the existing lock and use a separate lock for the huge page list. Now that we use a separate lock, there is no need to disable interrupts, so use the non-irqsave variants. We only required the irqsave variants because of the balloon device lock. This is a preparation for changing the locking used to protect balloon_dev_info. While at it, talk about "page migration" instead of "page compaction". We'll change that in core code soon as well. Link: https://lkml.kernel.org/r/20260119230133.3551867-8-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/misc/vmw_balloon.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 52b8c0f1eead..53e9335b6718 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -354,10 +354,15 @@ struct vmballoon { /** * @huge_pages - list of the inflated 2MB pages. * - * Protected by @b_dev_info.pages_lock . + * Protected by @huge_pages_lock. */ struct list_head huge_pages; + /** + * @huge_pages_lock: lock for the list of inflated 2MB pages. + */ + spinlock_t huge_pages_lock; + /** * @vmci_doorbell. * @@ -987,7 +992,6 @@ static void vmballoon_enqueue_page_list(struct vmballoon *b, unsigned int *n_pages, enum vmballoon_page_size_type page_size) { - unsigned long flags; struct page *page; if (page_size == VMW_BALLOON_4K_PAGE) { @@ -995,9 +999,9 @@ static void vmballoon_enqueue_page_list(struct vmballoon *b, } else { /* * Keep the huge pages in a local list which is not available - * for the balloon compaction mechanism. + * for the balloon page migration. 
*/ - spin_lock_irqsave(&b->b_dev_info.pages_lock, flags); + spin_lock(&b->huge_pages_lock); list_for_each_entry(page, pages, lru) { vmballoon_mark_page_offline(page, VMW_BALLOON_2M_PAGE); @@ -1006,7 +1010,7 @@ static void vmballoon_enqueue_page_list(struct vmballoon *b, list_splice_init(pages, &b->huge_pages); __count_vm_events(BALLOON_INFLATE, *n_pages * vmballoon_page_in_frames(VMW_BALLOON_2M_PAGE)); - spin_unlock_irqrestore(&b->b_dev_info.pages_lock, flags); + spin_unlock(&b->huge_pages_lock); } *n_pages = 0; @@ -1033,7 +1037,6 @@ static void vmballoon_dequeue_page_list(struct vmballoon *b, { struct page *page, *tmp; unsigned int i = 0; - unsigned long flags; /* In the case of 4k pages, use the compaction infrastructure */ if (page_size == VMW_BALLOON_4K_PAGE) { @@ -1043,7 +1046,7 @@ static void vmballoon_dequeue_page_list(struct vmballoon *b, } /* 2MB pages */ - spin_lock_irqsave(&b->b_dev_info.pages_lock, flags); + spin_lock(&b->huge_pages_lock); list_for_each_entry_safe(page, tmp, &b->huge_pages, lru) { vmballoon_mark_page_online(page, VMW_BALLOON_2M_PAGE); @@ -1054,7 +1057,7 @@ static void vmballoon_dequeue_page_list(struct vmballoon *b, __count_vm_events(BALLOON_DEFLATE, i * vmballoon_page_in_frames(VMW_BALLOON_2M_PAGE)); - spin_unlock_irqrestore(&b->b_dev_info.pages_lock, flags); + spin_unlock(&b->huge_pages_lock); *n_pages = i; } @@ -1828,6 +1831,7 @@ static int __init vmballoon_init(void) balloon.b_dev_info.migratepage = vmballoon_migratepage; INIT_LIST_HEAD(&balloon.huge_pages); + spin_lock_init(&balloon.huge_pages_lock); spin_lock_init(&balloon.comm_lock); init_rwsem(&balloon.conf_sem); balloon.vmci_doorbell = VMCI_INVALID_HANDLE; -- cgit v1.2.3 From 8202313e3dfa9bdeb73427b564cfe2bfd02e4807 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:16 +0100 Subject: mm/balloon_compaction: use a device-independent balloon (list) lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to remove the dependency on the page lock for balloon pages, we need a lock that is independent of the page. It's crucial that we can handle the scenario where balloon deflation (clearing page->private) can race with page isolation (using page->private to obtain the balloon_dev_info where the lock currently resides). The current lock in balloon_dev_info is therefore not suitable. Fortunately, we never really have more than a single balloon device per VM, so we can just keep it simple and use a static lock to protect all balloon devices. Based on this change we will remove the dependency on the page lock next. Link: https://lkml.kernel.org/r/20260119230133.3551867-9-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 6 ++---- mm/balloon_compaction.c | 34 ++++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 3109d3c43d30..9a8568fcd477 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -21,10 +21,10 @@ * i. Setting the PG_movable_ops flag and page->private with the following * lock order * +-page_lock(page); - * +--spin_lock_irq(&b_dev_info->pages_lock); + * +--spin_lock_irq(&balloon_pages_lock); * * ii. isolation or dequeueing procedure must remove the page from balloon - * device page list under b_dev_info->pages_lock. + * device page list under balloon_pages_lock * * The functions provided by this interface are placed to help on coping with * the aforementioned balloon page corner case, as well as to ensure the simple @@ -52,7 +52,6 @@ */ struct balloon_dev_info { unsigned long isolated_pages; /* # of isolated pages for migration */ - spinlock_t pages_lock; /* Protection to pages list */ struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); @@ -71,7 +70,6 @@ extern size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) { balloon->isolated_pages = 0; - spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; balloon->adjust_managed_page_count = false; diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 4fe2a0cff69e..a0fd779bbd01 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -11,6 +11,12 @@ #include #include +/* + * Lock protecting the balloon_dev_info of all devices. We don't really + * expect more than one device. 
+ */ +static DEFINE_SPINLOCK(balloon_pages_lock); + static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, struct page *page) { @@ -47,13 +53,13 @@ size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, unsigned long flags; size_t n_pages = 0; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_for_each_entry_safe(page, tmp, pages, lru) { list_del(&page->lru); balloon_page_enqueue_one(b_dev_info, page); n_pages++; } - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return n_pages; } EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); @@ -83,7 +89,7 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, unsigned long flags; size_t n_pages = 0; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { if (n_pages == n_req_pages) break; @@ -106,7 +112,7 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, dec_node_page_state(page, NR_BALLOON_PAGES); n_pages++; } - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return n_pages; } @@ -149,9 +155,9 @@ void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, { unsigned long flags; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); balloon_page_enqueue_one(b_dev_info, page); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); } EXPORT_SYMBOL_GPL(balloon_page_enqueue); @@ -191,11 +197,11 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) * BUG() here, otherwise the balloon driver may get stuck in * an infinite loop while attempting to release all its pages. */ - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); if (unlikely(list_empty(&b_dev_info->pages) && !b_dev_info->isolated_pages)) BUG(); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return NULL; } return list_first_entry(&pages, struct page, lru); @@ -213,10 +219,10 @@ static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) if (!b_dev_info) return false; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_del(&page->lru); b_dev_info->isolated_pages++; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return true; } @@ -234,10 +240,10 @@ static void balloon_page_putback(struct page *page) if (WARN_ON_ONCE(!b_dev_info)) return; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); } static int balloon_page_migrate(struct page *newpage, struct page *page, @@ -262,7 +268,7 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, if (rc < 0 && rc != -ENOENT) return rc; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); if (!rc) { /* Insert the new page into the balloon list. 
*/ get_page(newpage); @@ -287,7 +293,7 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, } b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); /* Free the now-deflated page we isolated in balloon_page_isolate(). */ balloon_page_finalize(page); -- cgit v1.2.3 From a3fafdd3896719923f7055b6d7f10f6ee1950d8b Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:17 +0100 Subject: mm/balloon_compaction: remove dependency on page lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's stop using the page lock in balloon code and instead use only the balloon_device_lock. As soon as we set the PG_movable_ops flag, we might now get isolation callbacks for that page as we are no longer holding the page lock. In there, we'll simply synchronize using the balloon_device_lock. So in balloon_page_isolate() lookup the balloon_dev_info through page->private under balloon_device_lock. It's crucial that we update page->private under the balloon_device_lock, so the isolation callback can properly deal with concurrent deflation. Consequently, make sure that balloon_page_finalize() is called under balloon_device_lock as we remove a page from the list and clear page->private. balloon_page_insert() is already called with the balloon_device_lock held. Note that the core will still lock the pages, for example in isolate_movable_ops_page(). The lock is there still relevant for handling the PageMovableOpsIsolated flag, but that can be later changed to use an atomic test-and-set instead, or moved into the movable_ops backends. Link: https://lkml.kernel.org/r/20260119230133.3551867-10-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 25 +++++++++++++------------ mm/balloon_compaction.c | 38 ++++++++++++-------------------------- 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 9a8568fcd477..ad594af6ed10 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -12,25 +12,27 @@ * is derived from the page type (PageOffline()) combined with the * PG_movable_ops flag (PageMovableOps()). * + * Once the page type and the PG_movable_ops are set, migration code + * can initiate page isolation by invoking the + * movable_operations()->isolate_page() callback + * + * As long as page->private is set, the page is either on the balloon list + * or isolated for migration. If page->private is not set, the page is + * either still getting inflated, or was deflated to be freed by the balloon + * driver soon. Isolation is impossible in both cases. + * * As the page isolation scanning step a compaction thread does is a lockless * procedure (from a page standpoint), it might bring some racy situations while * performing balloon page compaction. 
In order to sort out these racy scenarios * and safely perform balloon's page compaction and migration we must, always, * ensure following these simple rules: * - * i. Setting the PG_movable_ops flag and page->private with the following - * lock order - * +-page_lock(page); - * +--spin_lock_irq(&balloon_pages_lock); + * i. Inflation/deflation must set/clear page->private under the + * balloon_pages_lock * * ii. isolation or dequeueing procedure must remove the page from balloon * device page list under balloon_pages_lock * - * The functions provided by this interface are placed to help on coping with - * the aforementioned balloon page corner case, as well as to ensure the simple - * set of exposed rules are satisfied while we are dealing with balloon pages - * compaction / migration. - * * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini */ #ifndef _LINUX_BALLOON_COMPACTION_H @@ -93,8 +95,7 @@ static inline struct balloon_dev_info *balloon_page_device(struct page *page) * @balloon : pointer to balloon device * @page : page to be assigned as a 'balloon page' * - * Caller must ensure the page is locked and the spin_lock protecting balloon - * pages list is held before inserting a page into the balloon device. + * Caller must ensure the balloon_pages_lock is held. */ static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) @@ -119,7 +120,7 @@ static inline gfp_t balloon_mapping_gfp_mask(void) * balloon list for release to the page allocator * @page: page to be released to the page allocator * - * Caller must ensure that the page is locked. + * Caller must ensure the balloon_pages_lock is held. */ static inline void balloon_page_finalize(struct page *page) { diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index a0fd779bbd01..75763c73dbd5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -20,15 +20,7 @@ static DEFINE_SPINLOCK(balloon_pages_lock); static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, struct page *page) { - /* - * Block others from accessing the 'page' when we get around to - * establishing additional references. We should be the only one - * holding a reference to the 'page' at this point. If we are not, then - * memory corruption is possible and we should stop execution. - */ - BUG_ON(!trylock_page(page)); balloon_page_insert(b_dev_info, page); - unlock_page(page); if (b_dev_info->adjust_managed_page_count) adjust_managed_page_count(page, -1); __count_vm_event(BALLOON_INFLATE); @@ -93,22 +85,12 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { if (n_pages == n_req_pages) break; - - /* - * Block others from accessing the 'page' while we get around to - * establishing additional references and preparing the 'page' - * to be released by the balloon driver. 
- */ - if (!trylock_page(page)) - continue; - list_del(&page->lru); if (b_dev_info->adjust_managed_page_count) adjust_managed_page_count(page, 1); balloon_page_finalize(page); __count_vm_event(BALLOON_DEFLATE); list_add(&page->lru, pages); - unlock_page(page); dec_node_page_state(page, NR_BALLOON_PAGES); n_pages++; } @@ -213,13 +195,19 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) { - struct balloon_dev_info *b_dev_info = balloon_page_device(page); + struct balloon_dev_info *b_dev_info; unsigned long flags; - if (!b_dev_info) - return false; - spin_lock_irqsave(&balloon_pages_lock, flags); + b_dev_info = balloon_page_device(page); + if (!b_dev_info) { + /* + * The page already got deflated and removed from the + * balloon list. + */ + spin_unlock_irqrestore(&balloon_pages_lock, flags); + return false; + } list_del(&page->lru); b_dev_info->isolated_pages++; spin_unlock_irqrestore(&balloon_pages_lock, flags); @@ -253,9 +241,6 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, unsigned long flags; int rc; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - /* * When we isolated the page, the page was still inflated in a balloon * device. As isolated balloon pages cannot get deflated, we still have @@ -293,10 +278,11 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, } b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&balloon_pages_lock, flags); /* Free the now-deflated page we isolated in balloon_page_isolate(). */ balloon_page_finalize(page); + spin_unlock_irqrestore(&balloon_pages_lock, flags); + put_page(page); return 0; -- cgit v1.2.3 From ddc50a97bef1e34c096bf3f0dc9590d7f570ed7b Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:18 +0100 Subject: mm/balloon_compaction: make balloon_mops static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to expose this anymore, so let's just make it static. Link: https://lkml.kernel.org/r/20260119230133.3551867-11-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 1 - mm/balloon_compaction.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index ad594af6ed10..7db66c2c86cd 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -78,7 +78,6 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) } #ifdef CONFIG_BALLOON_COMPACTION -extern const struct movable_operations balloon_mops; /* * balloon_page_device - get the b_dev_info descriptor for the balloon device * that enqueues the given page. 
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 75763c73dbd5..cf4d93176392 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -288,7 +288,7 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, return 0; } -const struct movable_operations balloon_mops = { +static const struct movable_operations balloon_mops = { .migrate_page = balloon_page_migrate, .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, -- cgit v1.2.3 From aa974cbf949e94c79b46a0053d40229bc634f9be Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:19 +0100 Subject: mm/balloon_compaction: drop fs.h include from balloon_compaction.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ever since commit 68f2736a8583 ("mm: Convert all PageMovable users to movable_operations") we no longer store an inode in balloon_dev_info, so we can stop including "fs.h". Link: https://lkml.kernel.org/r/20260119230133.3551867-12-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 7db66c2c86cd..1452ea063524 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -42,7 +42,6 @@ #include #include #include -#include #include /* -- cgit v1.2.3 From f7e15373143aba99a6ec51dc4db7187bff6c9a0a Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:20 +0100 Subject: drivers/virtio/virtio_balloon: stop using balloon_page_push/pop() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's stop using these functions so we can remove them. They look like belonging to the balloon API for managing the device balloon list when really they are just simple helpers only used by virtio-balloon. Let's just inline them and switch to a proper list_for_each_entry_safe(). Link: https://lkml.kernel.org/r/20260119230133.3551867-13-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/virtio/virtio_balloon.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 15c1cf5fd249..6ae00de78b61 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -242,8 +242,8 @@ static void set_page_pfns(struct virtio_balloon *vb, static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num) { unsigned int num_allocated_pages; + struct page *page, *next; unsigned int num_pfns; - struct page *page; LIST_HEAD(pages); /* We can only do one array worth at a time. */ @@ -262,14 +262,15 @@ static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num) break; } - balloon_page_push(&pages, page); + list_add(&page->lru, &pages); } mutex_lock(&vb->balloon_lock); vb->num_pfns = 0; - while ((page = balloon_page_pop(&pages))) { + list_for_each_entry_safe(page, next, &pages, lru) { + list_del(&page->lru); balloon_page_enqueue(&vb->vb_dev_info, page); set_page_pfns(vb, vb->pfns + vb->num_pfns, page); @@ -474,15 +475,19 @@ static inline s64 towards_target(struct virtio_balloon *vb) static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb, unsigned long num_to_return) { - struct page *page; - unsigned long num_returned; + unsigned long num_returned = 0; + struct page *page, *next; + + if (unlikely(!num_to_return)) + return 0; spin_lock_irq(&vb->free_page_list_lock); - for (num_returned = 0; num_returned < num_to_return; num_returned++) { - page = balloon_page_pop(&vb->free_page_list); - if (!page) - break; + + list_for_each_entry_safe(page, next, &vb->free_page_list, lru) { + list_del(&page->lru); __free_pages(page, VIRTIO_BALLOON_HINT_BLOCK_ORDER); + if (++num_returned == num_to_return) + break; } vb->num_free_page_blocks -= num_returned; spin_unlock_irq(&vb->free_page_list_lock); @@ -717,7 +722,7 @@ static int get_free_page_and_send(struct virtio_balloon *vb) } virtqueue_kick(vq); spin_lock_irq(&vb->free_page_list_lock); - balloon_page_push(&vb->free_page_list, page); + list_add(&page->lru, &vb->free_page_list); vb->num_free_page_blocks++; spin_unlock_irq(&vb->free_page_list_lock); } else { -- cgit v1.2.3 From 0fa3e9a48bafde8aa5a5b994b05396e9b86ce156 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:21 +0100 Subject: mm/balloon_compaction: remove balloon_page_push/pop() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's remove these helpers as they are unused now. Link: https://lkml.kernel.org/r/20260119230133.3551867-14-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 30 ------------------------------ mm/balloon_compaction.c | 5 ++--- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 1452ea063524..e5451cf1f658 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -126,34 +126,4 @@ static inline void balloon_page_finalize(struct page *page) set_page_private(page, 0); /* PageOffline is sticky until the page is freed to the buddy. */ } - -/* - * balloon_page_push - insert a page into a page list. - * @head : pointer to list - * @page : page to be added - * - * Caller must ensure the page is private and protect the list. - */ -static inline void balloon_page_push(struct list_head *pages, struct page *page) -{ - list_add(&page->lru, pages); -} - -/* - * balloon_page_pop - remove a page from a page list. - * @head : pointer to list - * @page : page to be added - * - * Caller must ensure the page is private and protect the list. - */ -static inline struct page *balloon_page_pop(struct list_head *pages) -{ - struct page *page = list_first_entry_or_null(pages, struct page, lru); - - if (!page) - return NULL; - - list_del(&page->lru); - return page; -} #endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index cf4d93176392..5e1507a13a52 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -128,9 +128,8 @@ EXPORT_SYMBOL_GPL(balloon_page_alloc); * Drivers must call this function to properly enqueue a new allocated balloon * page before definitively removing the page from the guest system. * - * Drivers must not call balloon_page_enqueue on pages that have been pushed to - * a list with balloon_page_push before removing them with balloon_page_pop. To - * enqueue a list of pages, use balloon_page_list_enqueue instead. + * Drivers must not enqueue pages while page->lru is still in + * use, and must not use page->lru until a page was unqueued again. */ void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, struct page *page) -- cgit v1.2.3 From 9d792ef33e40c8511b00a38e5e2e63f20bd2d815 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:22 +0100 Subject: mm/balloon_compaction: fold balloon_mapping_gfp_mask() into balloon_page_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's just remove balloon_mapping_gfp_mask(). Link: https://lkml.kernel.org/r/20260119230133.3551867-15-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 7 ------- mm/balloon_compaction.c | 12 ++++++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index e5451cf1f658..d1d473939897 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -106,13 +106,6 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, list_add(&page->lru, &balloon->pages); } -static inline gfp_t balloon_mapping_gfp_mask(void) -{ - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - return GFP_HIGHUSER_MOVABLE; - return GFP_HIGHUSER; -} - /* * balloon_page_finalize - prepare a balloon page that was removed from the * balloon list for release to the page allocator diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 5e1507a13a52..1843e168db3c 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -112,10 +112,14 @@ EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); */ struct page *balloon_page_alloc(void) { - struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN); - return page; + gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + gfp_flags |= GFP_HIGHUSER_MOVABLE; + else + gfp_flags |= GFP_HIGHUSER; + + return alloc_page(gfp_flags); } EXPORT_SYMBOL_GPL(balloon_page_alloc); -- cgit v1.2.3 From 03d6a2f68419b808d51ba39c84aedd6e9a6a92d8 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:23 +0100 Subject: mm/balloon_compaction: move internal helpers to balloon_compaction.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's move the helpers that are not required by drivers anymore. While at it, drop the doc of balloon_page_device() as it is trivial. [david@kernel.org: move balloon_page_device() under CONFIG_BALLOON_COMPACTION] Link: https://lkml.kernel.org/r/27f0adf1-54c1-4d99-8b7f-fd45574e7f41@kernel.org Link: https://lkml.kernel.org/r/20260119230133.3551867-16-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 44 -------------------------------------- mm/balloon_compaction.c | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 44 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index d1d473939897..eec8994056a4 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -75,48 +75,4 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) balloon->migratepage = NULL; balloon->adjust_managed_page_count = false; } - -#ifdef CONFIG_BALLOON_COMPACTION -/* - * balloon_page_device - get the b_dev_info descriptor for the balloon device - * that enqueues the given page. - */ -static inline struct balloon_dev_info *balloon_page_device(struct page *page) -{ - return (struct balloon_dev_info *)page_private(page); -} -#endif /* CONFIG_BALLOON_COMPACTION */ - -/* - * balloon_page_insert - insert a page into the balloon's page list and make - * the page->private assignment accordingly. - * @balloon : pointer to balloon device - * @page : page to be assigned as a 'balloon page' - * - * Caller must ensure the balloon_pages_lock is held. - */ -static inline void balloon_page_insert(struct balloon_dev_info *balloon, - struct page *page) -{ - __SetPageOffline(page); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { - SetPageMovableOps(page); - set_page_private(page, (unsigned long)balloon); - } - list_add(&page->lru, &balloon->pages); -} - -/* - * balloon_page_finalize - prepare a balloon page that was removed from the - * balloon list for release to the page allocator - * @page: page to be released to the page allocator - * - * Caller must ensure the balloon_pages_lock is held. - */ -static inline void balloon_page_finalize(struct page *page) -{ - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - set_page_private(page, 0); - /* PageOffline is sticky until the page is freed to the buddy. */ -} #endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 1843e168db3c..30fa7ee8e1f3 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -17,6 +17,39 @@ */ static DEFINE_SPINLOCK(balloon_pages_lock); +/* + * balloon_page_insert - insert a page into the balloon's page list and make + * the page->private assignment accordingly. + * @balloon : pointer to balloon device + * @page : page to be assigned as a 'balloon page' + * + * Caller must ensure the balloon_pages_lock is held. + */ +static void balloon_page_insert(struct balloon_dev_info *balloon, + struct page *page) +{ + __SetPageOffline(page); + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { + SetPageMovableOps(page); + set_page_private(page, (unsigned long)balloon); + } + list_add(&page->lru, &balloon->pages); +} + +/* + * balloon_page_finalize - prepare a balloon page that was removed from the + * balloon list for release to the page allocator + * @page: page to be released to the page allocator + * + * Caller must ensure the balloon_pages_lock is held. 
+ */ +static void balloon_page_finalize(struct page *page) +{ + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + set_page_private(page, 0); + /* PageOffline is sticky until the page is freed to the buddy. */ +} + static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, struct page *page) { @@ -194,6 +227,10 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION +static struct balloon_dev_info *balloon_page_device(struct page *page) +{ + return (struct balloon_dev_info *)page_private(page); +} static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) -- cgit v1.2.3 From 631eb2282630dc0cccd8284c4ea37e29d17d1f48 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:24 +0100 Subject: mm/balloon_compaction: assert that the balloon_pages_lock is held MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's add some sanity checks for holding the balloon_pages_lock when we're effectively inflating/deflating a page. Link: https://lkml.kernel.org/r/20260119230133.3551867-17-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/balloon_compaction.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 30fa7ee8e1f3..f77b305b0459 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -28,6 +28,7 @@ static DEFINE_SPINLOCK(balloon_pages_lock); static void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { + lockdep_assert_held(&balloon_pages_lock); __SetPageOffline(page); if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { SetPageMovableOps(page); @@ -45,6 +46,7 @@ static void balloon_page_insert(struct balloon_dev_info *balloon, */ static void balloon_page_finalize(struct page *page) { + lockdep_assert_held(&balloon_pages_lock); if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) set_page_private(page, 0); /* PageOffline is sticky until the page is freed to the buddy. */ -- cgit v1.2.3 From eee00d04142172c07466ab1192d1dccc6d5a2f87 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:25 +0100 Subject: mm/balloon_compaction: mark remaining functions for having proper kerneldoc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Looks like all we are missing for proper kerneldoc is another "*". Link: https://lkml.kernel.org/r/20260119230133.3551867-18-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/balloon_compaction.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index f77b305b0459..7e37a7af9ef0 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -17,7 +17,7 @@ */ static DEFINE_SPINLOCK(balloon_pages_lock); -/* +/** * balloon_page_insert - insert a page into the balloon's page list and make * the page->private assignment accordingly. * @balloon : pointer to balloon device @@ -37,7 +37,7 @@ static void balloon_page_insert(struct balloon_dev_info *balloon, list_add(&page->lru, &balloon->pages); } -/* +/** * balloon_page_finalize - prepare a balloon page that was removed from the * balloon list for release to the page allocator * @page: page to be released to the page allocator @@ -135,7 +135,7 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, } EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); -/* +/** * balloon_page_alloc - allocates a new page for insertion into the balloon * page list. * @@ -158,7 +158,7 @@ struct page *balloon_page_alloc(void) } EXPORT_SYMBOL_GPL(balloon_page_alloc); -/* +/** * balloon_page_enqueue - inserts a new page into the balloon page list. * * @b_dev_info: balloon device descriptor where we will insert a new page @@ -181,7 +181,7 @@ void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, } EXPORT_SYMBOL_GPL(balloon_page_enqueue); -/* +/** * balloon_page_dequeue - removes a page from balloon's page list and returns * its address to allow the driver to release the page. * @b_dev_info: balloon device descriptor where we will grab a page from. -- cgit v1.2.3 From 92ec9260d53b245d3266f74ecc66d8ea47aaec3d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:26 +0100 Subject: mm/balloon_compaction: remove "extern" from functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding "extern" to functions is frowned-upon. Let's just get rid of it for all functions here. Link: https://lkml.kernel.org/r/20260119230133.3551867-19-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. 
Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index eec8994056a4..7757e0e314fd 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -59,14 +59,14 @@ struct balloon_dev_info { bool adjust_managed_page_count; }; -extern struct page *balloon_page_alloc(void); -extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, - struct page *page); -extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); -extern size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, - struct list_head *pages); -extern size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, - struct list_head *pages, size_t n_req_pages); +struct page *balloon_page_alloc(void); +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page); +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); +size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, + struct list_head *pages); +size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, + struct list_head *pages, size_t n_req_pages); static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) { -- cgit v1.2.3 From a3db9e136ce1996d528dd4fc8d1d2bae7f8bef09 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:27 +0100 Subject: mm/vmscan: drop inclusion of balloon_compaction.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit b1123ea6d3b3 ("mm: balloon: use general non-lru movable page feature"), the include was required because of isolated_balloon_page(). It's no longer required, so let's remove it. Link: https://lkml.kernel.org/r/20260119230133.3551867-20-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/vmscan.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4aa47ab000c2..b33039000d6e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,7 +63,6 @@ #include #include -#include #include #include "internal.h" -- cgit v1.2.3 From 25b48b4cdf912f70998336b861a4bf767ee3d332 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:28 +0100 Subject: mm: rename balloon_compaction.(c|h) to balloon.(c|h) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even without CONFIG_BALLOON_COMPACTION this infrastructure implements basic list and page management for a memory balloon. 
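[Editorial sketch, not part of the series] The renamed header keeps the driver-facing API unchanged, so a consumer of this "basic list and page management" infrastructure looks roughly like the following. The my_*() names and the hypervisor notifications are hypothetical placeholders; real drivers (virtio-balloon, pseries CMM, vmw_balloon) add their own locking, batching and accounting on top.

/*
 * Minimal sketch of a balloon driver using <linux/balloon.h> after the
 * rename. Assumes the caller tracks how many pages are inflated, so that
 * balloon_page_dequeue() is only called when pages are actually enqueued.
 */
#include <linux/balloon.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/mm.h>

static struct balloon_dev_info my_b_dev_info;

/* Hypothetical driver-specific hypervisor notifications. */
static void my_tell_hypervisor_inflate(struct page *page) { }
static void my_tell_hypervisor_deflate(struct page *page) { }

static int my_balloon_inflate_one(void)
{
	struct page *page = balloon_page_alloc();

	if (!page)
		return -ENOMEM;
	my_tell_hypervisor_inflate(page);
	/* Page is now on my_b_dev_info.pages and marked PageOffline. */
	balloon_page_enqueue(&my_b_dev_info, page);
	return 0;
}

static int my_balloon_deflate_one(void)
{
	struct page *page = balloon_page_dequeue(&my_b_dev_info);

	/* May transiently fail while pages are isolated for migration. */
	if (!page)
		return -EAGAIN;
	my_tell_hypervisor_deflate(page);
	__free_page(page);
	return 0;
}

static int __init my_balloon_init(void)
{
	balloon_devinfo_init(&my_b_dev_info);
	return 0;
}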
Link: https://lkml.kernel.org/r/20260119230133.3551867-21-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/core-api/mm-api.rst | 2 +- MAINTAINERS | 4 +- arch/powerpc/platforms/pseries/cmm.c | 2 +- drivers/misc/vmw_balloon.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- include/linux/balloon.h | 77 ++++++++ include/linux/balloon_compaction.h | 78 -------- mm/Makefile | 2 +- mm/balloon.c | 344 ++++++++++++++++++++++++++++++++++ mm/balloon_compaction.c | 345 ----------------------------------- 10 files changed, 428 insertions(+), 430 deletions(-) create mode 100644 include/linux/balloon.h delete mode 100644 include/linux/balloon_compaction.h create mode 100644 mm/balloon.c delete mode 100644 mm/balloon_compaction.c diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 68193a4cfcf5..aabdd3cba58e 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -130,5 +130,5 @@ More Memory Management Functions .. kernel-doc:: mm/vmscan.c .. kernel-doc:: mm/memory_hotplug.c .. kernel-doc:: mm/mmu_notifier.c -.. kernel-doc:: mm/balloon_compaction.c +.. kernel-doc:: mm/balloon.c .. kernel-doc:: mm/huge_memory.c diff --git a/MAINTAINERS b/MAINTAINERS index ebc2f1bc0ade..a4535ec654dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27546,9 +27546,9 @@ M: David Hildenbrand L: virtualization@lists.linux.dev S: Maintained F: drivers/virtio/virtio_balloon.c -F: include/linux/balloon_compaction.h +F: include/linux/balloon.h F: include/uapi/linux/virtio_balloon.h -F: mm/balloon_compaction.c +F: mm/balloon.c VIRTIO BLOCK AND SCSI DRIVERS M: "Michael S. Tsirkin" diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 7fd8b3d7e763..7a3c4922685a 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 53e9335b6718..7fd3f709108c 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 6ae00de78b61..de8041c3285a 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/balloon.h b/include/linux/balloon.h new file mode 100644 index 000000000000..82585542300d --- /dev/null +++ b/include/linux/balloon.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common interface for implementing a memory balloon, including support + * for migration of pages inflated in a memory balloon. + * + * Balloon page migration makes use of the general "movable_ops page migration" + * feature. + * + * page->private is used to reference the responsible balloon device. 
+ * That these pages have movable_ops, and which movable_ops apply, + * is derived from the page type (PageOffline()) combined with the + * PG_movable_ops flag (PageMovableOps()). + * + * Once the page type and the PG_movable_ops are set, migration code + * can initiate page isolation by invoking the + * movable_operations()->isolate_page() callback + * + * As long as page->private is set, the page is either on the balloon list + * or isolated for migration. If page->private is not set, the page is + * either still getting inflated, or was deflated to be freed by the balloon + * driver soon. Isolation is impossible in both cases. + * + * As the page isolation scanning step a compaction thread does is a lockless + * procedure (from a page standpoint), it might bring some racy situations while + * performing balloon page compaction. In order to sort out these racy scenarios + * and safely perform balloon's page compaction and migration we must, always, + * ensure following these simple rules: + * + * i. Inflation/deflation must set/clear page->private under the + * balloon_pages_lock + * + * ii. isolation or dequeueing procedure must remove the page from balloon + * device page list under balloon_pages_lock + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini + */ +#ifndef _LINUX_BALLOON_H +#define _LINUX_BALLOON_H +#include +#include +#include +#include +#include +#include + +/* + * Balloon device information descriptor. + * This struct is used to allow the common balloon compaction interface + * procedures to find the proper balloon device holding memory pages they'll + * have to cope for page compaction / migration, as well as it serves the + * balloon driver as a page book-keeper for its registered balloon devices. + */ +struct balloon_dev_info { + unsigned long isolated_pages; /* # of isolated pages for migration */ + struct list_head pages; /* Pages enqueued & handled to Host */ + int (*migratepage)(struct balloon_dev_info *, struct page *newpage, + struct page *page, enum migrate_mode mode); + bool adjust_managed_page_count; +}; + +struct page *balloon_page_alloc(void); +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page); +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); +size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, + struct list_head *pages); +size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, + struct list_head *pages, size_t n_req_pages); + +static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) +{ + balloon->isolated_pages = 0; + INIT_LIST_HEAD(&balloon->pages); + balloon->migratepage = NULL; + balloon->adjust_managed_page_count = false; +} +#endif /* _LINUX_BALLOON_H */ diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h deleted file mode 100644 index 7757e0e314fd..000000000000 --- a/include/linux/balloon_compaction.h +++ /dev/null @@ -1,78 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * include/linux/balloon_compaction.h - * - * Common interface definitions for making balloon pages movable by compaction. - * - * Balloon page migration makes use of the general "movable_ops page migration" - * feature. - * - * page->private is used to reference the responsible balloon device. - * That these pages have movable_ops, and which movable_ops apply, - * is derived from the page type (PageOffline()) combined with the - * PG_movable_ops flag (PageMovableOps()). 
- * - * Once the page type and the PG_movable_ops are set, migration code - * can initiate page isolation by invoking the - * movable_operations()->isolate_page() callback - * - * As long as page->private is set, the page is either on the balloon list - * or isolated for migration. If page->private is not set, the page is - * either still getting inflated, or was deflated to be freed by the balloon - * driver soon. Isolation is impossible in both cases. - * - * As the page isolation scanning step a compaction thread does is a lockless - * procedure (from a page standpoint), it might bring some racy situations while - * performing balloon page compaction. In order to sort out these racy scenarios - * and safely perform balloon's page compaction and migration we must, always, - * ensure following these simple rules: - * - * i. Inflation/deflation must set/clear page->private under the - * balloon_pages_lock - * - * ii. isolation or dequeueing procedure must remove the page from balloon - * device page list under balloon_pages_lock - * - * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini - */ -#ifndef _LINUX_BALLOON_COMPACTION_H -#define _LINUX_BALLOON_COMPACTION_H -#include -#include -#include -#include -#include -#include - -/* - * Balloon device information descriptor. - * This struct is used to allow the common balloon compaction interface - * procedures to find the proper balloon device holding memory pages they'll - * have to cope for page compaction / migration, as well as it serves the - * balloon driver as a page book-keeper for its registered balloon devices. - */ -struct balloon_dev_info { - unsigned long isolated_pages; /* # of isolated pages for migration */ - struct list_head pages; /* Pages enqueued & handled to Host */ - int (*migratepage)(struct balloon_dev_info *, struct page *newpage, - struct page *page, enum migrate_mode mode); - bool adjust_managed_page_count; -}; - -struct page *balloon_page_alloc(void); -void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, - struct page *page); -struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); -size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, - struct list_head *pages); -size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, - struct list_head *pages, size_t n_req_pages); - -static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) -{ - balloon->isolated_pages = 0; - INIT_LIST_HEAD(&balloon->pages); - balloon->migratepage = NULL; - balloon->adjust_managed_page_count = false; -} -#endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/mm/Makefile b/mm/Makefile index 9175f8cc6565..1e31e0a528dc 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -122,7 +122,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_MEMORY_BALLOON) += balloon.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o diff --git a/mm/balloon.c b/mm/balloon.c new file mode 100644 index 000000000000..0f068b97e5d8 --- /dev/null +++ b/mm/balloon.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common interface for implementing a memory balloon, including support + * for migration of pages inflated in a memory balloon. + * + * Copyright (C) 2012, Red Hat, Inc. 
Rafael Aquini + */ +#include +#include +#include +#include + +/* + * Lock protecting the balloon_dev_info of all devices. We don't really + * expect more than one device. + */ +static DEFINE_SPINLOCK(balloon_pages_lock); + +/** + * balloon_page_insert - insert a page into the balloon's page list and make + * the page->private assignment accordingly. + * @balloon : pointer to balloon device + * @page : page to be assigned as a 'balloon page' + * + * Caller must ensure the balloon_pages_lock is held. + */ +static void balloon_page_insert(struct balloon_dev_info *balloon, + struct page *page) +{ + lockdep_assert_held(&balloon_pages_lock); + __SetPageOffline(page); + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { + SetPageMovableOps(page); + set_page_private(page, (unsigned long)balloon); + } + list_add(&page->lru, &balloon->pages); +} + +/** + * balloon_page_finalize - prepare a balloon page that was removed from the + * balloon list for release to the page allocator + * @page: page to be released to the page allocator + * + * Caller must ensure the balloon_pages_lock is held. + */ +static void balloon_page_finalize(struct page *page) +{ + lockdep_assert_held(&balloon_pages_lock); + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + set_page_private(page, 0); + /* PageOffline is sticky until the page is freed to the buddy. */ +} + +static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, + struct page *page) +{ + balloon_page_insert(b_dev_info, page); + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, -1); + __count_vm_event(BALLOON_INFLATE); + inc_node_page_state(page, NR_BALLOON_PAGES); +} + +/** + * balloon_page_list_enqueue() - inserts a list of pages into the balloon page + * list. + * @b_dev_info: balloon device descriptor where we will insert a new page to + * @pages: pages to enqueue - allocated using balloon_page_alloc. + * + * Driver must call this function to properly enqueue balloon pages before + * definitively removing them from the guest system. + * + * Return: number of pages that were enqueued. + */ +size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, + struct list_head *pages) +{ + struct page *page, *tmp; + unsigned long flags; + size_t n_pages = 0; + + spin_lock_irqsave(&balloon_pages_lock, flags); + list_for_each_entry_safe(page, tmp, pages, lru) { + list_del(&page->lru); + balloon_page_enqueue_one(b_dev_info, page); + n_pages++; + } + spin_unlock_irqrestore(&balloon_pages_lock, flags); + return n_pages; +} +EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); + +/** + * balloon_page_list_dequeue() - removes pages from balloon's page list and + * returns a list of the pages. + * @b_dev_info: balloon device descriptor where we will grab a page from. + * @pages: pointer to the list of pages that would be returned to the caller. + * @n_req_pages: number of requested pages. + * + * Driver must call this function to properly de-allocate a previous enlisted + * balloon pages before definitively releasing it back to the guest system. + * This function tries to remove @n_req_pages from the ballooned pages and + * return them to the caller in the @pages list. + * + * Note that this function may fail to dequeue some pages even if the balloon + * isn't empty - since the page list can be temporarily empty due to compaction + * of isolated pages. + * + * Return: number of pages that were added to the @pages list. 
+ */ +size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, + struct list_head *pages, size_t n_req_pages) +{ + struct page *page, *tmp; + unsigned long flags; + size_t n_pages = 0; + + spin_lock_irqsave(&balloon_pages_lock, flags); + list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { + if (n_pages == n_req_pages) + break; + list_del(&page->lru); + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, 1); + balloon_page_finalize(page); + __count_vm_event(BALLOON_DEFLATE); + list_add(&page->lru, pages); + dec_node_page_state(page, NR_BALLOON_PAGES); + n_pages++; + } + spin_unlock_irqrestore(&balloon_pages_lock, flags); + + return n_pages; +} +EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); + +/** + * balloon_page_alloc - allocates a new page for insertion into the balloon + * page list. + * + * Driver must call this function to properly allocate a new balloon page. + * Driver must call balloon_page_enqueue before definitively removing the page + * from the guest system. + * + * Return: struct page for the allocated page or NULL on allocation failure. + */ +struct page *balloon_page_alloc(void) +{ + gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + gfp_flags |= GFP_HIGHUSER_MOVABLE; + else + gfp_flags |= GFP_HIGHUSER; + + return alloc_page(gfp_flags); +} +EXPORT_SYMBOL_GPL(balloon_page_alloc); + +/** + * balloon_page_enqueue - inserts a new page into the balloon page list. + * + * @b_dev_info: balloon device descriptor where we will insert a new page + * @page: new page to enqueue - allocated using balloon_page_alloc. + * + * Drivers must call this function to properly enqueue a new allocated balloon + * page before definitively removing the page from the guest system. + * + * Drivers must not enqueue pages while page->lru is still in + * use, and must not use page->lru until a page was unqueued again. + */ +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page) +{ + unsigned long flags; + + spin_lock_irqsave(&balloon_pages_lock, flags); + balloon_page_enqueue_one(b_dev_info, page); + spin_unlock_irqrestore(&balloon_pages_lock, flags); +} +EXPORT_SYMBOL_GPL(balloon_page_enqueue); + +/** + * balloon_page_dequeue - removes a page from balloon's page list and returns + * its address to allow the driver to release the page. + * @b_dev_info: balloon device descriptor where we will grab a page from. + * + * Driver must call this function to properly dequeue a previously enqueued page + * before definitively releasing it back to the guest system. + * + * Caller must perform its own accounting to ensure that this + * function is called only if some pages are actually enqueued. + * + * Note that this function may fail to dequeue some pages even if there are + * some enqueued pages - since the page list can be temporarily empty due to + * the compaction of isolated pages. + * + * TODO: remove the caller accounting requirements, and allow caller to wait + * until all pages can be dequeued. + * + * Return: struct page for the dequeued page, or NULL if no page was dequeued. 
+ */ +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) +{ + unsigned long flags; + LIST_HEAD(pages); + int n_pages; + + n_pages = balloon_page_list_dequeue(b_dev_info, &pages, 1); + + if (n_pages != 1) { + /* + * If we are unable to dequeue a balloon page because the page + * list is empty and there are no isolated pages, then something + * went out of track and some balloon pages are lost. + * BUG() here, otherwise the balloon driver may get stuck in + * an infinite loop while attempting to release all its pages. + */ + spin_lock_irqsave(&balloon_pages_lock, flags); + if (unlikely(list_empty(&b_dev_info->pages) && + !b_dev_info->isolated_pages)) + BUG(); + spin_unlock_irqrestore(&balloon_pages_lock, flags); + return NULL; + } + return list_first_entry(&pages, struct page, lru); +} +EXPORT_SYMBOL_GPL(balloon_page_dequeue); + +#ifdef CONFIG_BALLOON_COMPACTION +static struct balloon_dev_info *balloon_page_device(struct page *page) +{ + return (struct balloon_dev_info *)page_private(page); +} + +static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) + +{ + struct balloon_dev_info *b_dev_info; + unsigned long flags; + + spin_lock_irqsave(&balloon_pages_lock, flags); + b_dev_info = balloon_page_device(page); + if (!b_dev_info) { + /* + * The page already got deflated and removed from the + * balloon list. + */ + spin_unlock_irqrestore(&balloon_pages_lock, flags); + return false; + } + list_del(&page->lru); + b_dev_info->isolated_pages++; + spin_unlock_irqrestore(&balloon_pages_lock, flags); + + return true; +} + +static void balloon_page_putback(struct page *page) +{ + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + + /* + * When we isolated the page, the page was still inflated in a balloon + * device. As isolated balloon pages cannot get deflated, we still have + * a balloon device here. + */ + if (WARN_ON_ONCE(!b_dev_info)) + return; + + spin_lock_irqsave(&balloon_pages_lock, flags); + list_add(&page->lru, &b_dev_info->pages); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&balloon_pages_lock, flags); +} + +static int balloon_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + int rc; + + /* + * When we isolated the page, the page was still inflated in a balloon + * device. As isolated balloon pages cannot get deflated, we still have + * a balloon device here. + */ + if (WARN_ON_ONCE(!b_dev_info)) + return -EAGAIN; + + rc = b_dev_info->migratepage(b_dev_info, newpage, page, mode); + if (rc < 0 && rc != -ENOENT) + return rc; + + spin_lock_irqsave(&balloon_pages_lock, flags); + if (!rc) { + /* Insert the new page into the balloon list. */ + get_page(newpage); + balloon_page_insert(b_dev_info, newpage); + __count_vm_event(BALLOON_MIGRATE); + + if (b_dev_info->adjust_managed_page_count && + page_zone(page) != page_zone(newpage)) { + /* + * When we migrate a page to a different zone we + * have to fixup the count of both involved zones. + */ + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } + } else { + /* Old page was deflated but new page not inflated. */ + __count_vm_event(BALLOON_DEFLATE); + + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, 1); + } + + b_dev_info->isolated_pages--; + + /* Free the now-deflated page we isolated in balloon_page_isolate(). 
*/ + balloon_page_finalize(page); + spin_unlock_irqrestore(&balloon_pages_lock, flags); + + put_page(page); + + return 0; +} + +static const struct movable_operations balloon_mops = { + .migrate_page = balloon_page_migrate, + .isolate_page = balloon_page_isolate, + .putback_page = balloon_page_putback, +}; + +static int __init balloon_init(void) +{ + return set_movable_ops(&balloon_mops, PGTY_offline); +} +core_initcall(balloon_init); + +#endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c deleted file mode 100644 index 7e37a7af9ef0..000000000000 --- a/mm/balloon_compaction.c +++ /dev/null @@ -1,345 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * mm/balloon_compaction.c - * - * Common interface for making balloon pages movable by compaction. - * - * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini - */ -#include -#include -#include -#include - -/* - * Lock protecting the balloon_dev_info of all devices. We don't really - * expect more than one device. - */ -static DEFINE_SPINLOCK(balloon_pages_lock); - -/** - * balloon_page_insert - insert a page into the balloon's page list and make - * the page->private assignment accordingly. - * @balloon : pointer to balloon device - * @page : page to be assigned as a 'balloon page' - * - * Caller must ensure the balloon_pages_lock is held. - */ -static void balloon_page_insert(struct balloon_dev_info *balloon, - struct page *page) -{ - lockdep_assert_held(&balloon_pages_lock); - __SetPageOffline(page); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { - SetPageMovableOps(page); - set_page_private(page, (unsigned long)balloon); - } - list_add(&page->lru, &balloon->pages); -} - -/** - * balloon_page_finalize - prepare a balloon page that was removed from the - * balloon list for release to the page allocator - * @page: page to be released to the page allocator - * - * Caller must ensure the balloon_pages_lock is held. - */ -static void balloon_page_finalize(struct page *page) -{ - lockdep_assert_held(&balloon_pages_lock); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - set_page_private(page, 0); - /* PageOffline is sticky until the page is freed to the buddy. */ -} - -static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, - struct page *page) -{ - balloon_page_insert(b_dev_info, page); - if (b_dev_info->adjust_managed_page_count) - adjust_managed_page_count(page, -1); - __count_vm_event(BALLOON_INFLATE); - inc_node_page_state(page, NR_BALLOON_PAGES); -} - -/** - * balloon_page_list_enqueue() - inserts a list of pages into the balloon page - * list. - * @b_dev_info: balloon device descriptor where we will insert a new page to - * @pages: pages to enqueue - allocated using balloon_page_alloc. - * - * Driver must call this function to properly enqueue balloon pages before - * definitively removing them from the guest system. - * - * Return: number of pages that were enqueued. - */ -size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, - struct list_head *pages) -{ - struct page *page, *tmp; - unsigned long flags; - size_t n_pages = 0; - - spin_lock_irqsave(&balloon_pages_lock, flags); - list_for_each_entry_safe(page, tmp, pages, lru) { - list_del(&page->lru); - balloon_page_enqueue_one(b_dev_info, page); - n_pages++; - } - spin_unlock_irqrestore(&balloon_pages_lock, flags); - return n_pages; -} -EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); - -/** - * balloon_page_list_dequeue() - removes pages from balloon's page list and - * returns a list of the pages. 
- * @b_dev_info: balloon device descriptor where we will grab a page from. - * @pages: pointer to the list of pages that would be returned to the caller. - * @n_req_pages: number of requested pages. - * - * Driver must call this function to properly de-allocate a previous enlisted - * balloon pages before definitively releasing it back to the guest system. - * This function tries to remove @n_req_pages from the ballooned pages and - * return them to the caller in the @pages list. - * - * Note that this function may fail to dequeue some pages even if the balloon - * isn't empty - since the page list can be temporarily empty due to compaction - * of isolated pages. - * - * Return: number of pages that were added to the @pages list. - */ -size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, - struct list_head *pages, size_t n_req_pages) -{ - struct page *page, *tmp; - unsigned long flags; - size_t n_pages = 0; - - spin_lock_irqsave(&balloon_pages_lock, flags); - list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { - if (n_pages == n_req_pages) - break; - list_del(&page->lru); - if (b_dev_info->adjust_managed_page_count) - adjust_managed_page_count(page, 1); - balloon_page_finalize(page); - __count_vm_event(BALLOON_DEFLATE); - list_add(&page->lru, pages); - dec_node_page_state(page, NR_BALLOON_PAGES); - n_pages++; - } - spin_unlock_irqrestore(&balloon_pages_lock, flags); - - return n_pages; -} -EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); - -/** - * balloon_page_alloc - allocates a new page for insertion into the balloon - * page list. - * - * Driver must call this function to properly allocate a new balloon page. - * Driver must call balloon_page_enqueue before definitively removing the page - * from the guest system. - * - * Return: struct page for the allocated page or NULL on allocation failure. - */ -struct page *balloon_page_alloc(void) -{ - gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - gfp_flags |= GFP_HIGHUSER_MOVABLE; - else - gfp_flags |= GFP_HIGHUSER; - - return alloc_page(gfp_flags); -} -EXPORT_SYMBOL_GPL(balloon_page_alloc); - -/** - * balloon_page_enqueue - inserts a new page into the balloon page list. - * - * @b_dev_info: balloon device descriptor where we will insert a new page - * @page: new page to enqueue - allocated using balloon_page_alloc. - * - * Drivers must call this function to properly enqueue a new allocated balloon - * page before definitively removing the page from the guest system. - * - * Drivers must not enqueue pages while page->lru is still in - * use, and must not use page->lru until a page was unqueued again. - */ -void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, - struct page *page) -{ - unsigned long flags; - - spin_lock_irqsave(&balloon_pages_lock, flags); - balloon_page_enqueue_one(b_dev_info, page); - spin_unlock_irqrestore(&balloon_pages_lock, flags); -} -EXPORT_SYMBOL_GPL(balloon_page_enqueue); - -/** - * balloon_page_dequeue - removes a page from balloon's page list and returns - * its address to allow the driver to release the page. - * @b_dev_info: balloon device descriptor where we will grab a page from. - * - * Driver must call this function to properly dequeue a previously enqueued page - * before definitively releasing it back to the guest system. - * - * Caller must perform its own accounting to ensure that this - * function is called only if some pages are actually enqueued. 
- * - * Note that this function may fail to dequeue some pages even if there are - * some enqueued pages - since the page list can be temporarily empty due to - * the compaction of isolated pages. - * - * TODO: remove the caller accounting requirements, and allow caller to wait - * until all pages can be dequeued. - * - * Return: struct page for the dequeued page, or NULL if no page was dequeued. - */ -struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) -{ - unsigned long flags; - LIST_HEAD(pages); - int n_pages; - - n_pages = balloon_page_list_dequeue(b_dev_info, &pages, 1); - - if (n_pages != 1) { - /* - * If we are unable to dequeue a balloon page because the page - * list is empty and there are no isolated pages, then something - * went out of track and some balloon pages are lost. - * BUG() here, otherwise the balloon driver may get stuck in - * an infinite loop while attempting to release all its pages. - */ - spin_lock_irqsave(&balloon_pages_lock, flags); - if (unlikely(list_empty(&b_dev_info->pages) && - !b_dev_info->isolated_pages)) - BUG(); - spin_unlock_irqrestore(&balloon_pages_lock, flags); - return NULL; - } - return list_first_entry(&pages, struct page, lru); -} -EXPORT_SYMBOL_GPL(balloon_page_dequeue); - -#ifdef CONFIG_BALLOON_COMPACTION -static struct balloon_dev_info *balloon_page_device(struct page *page) -{ - return (struct balloon_dev_info *)page_private(page); -} - -static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) - -{ - struct balloon_dev_info *b_dev_info; - unsigned long flags; - - spin_lock_irqsave(&balloon_pages_lock, flags); - b_dev_info = balloon_page_device(page); - if (!b_dev_info) { - /* - * The page already got deflated and removed from the - * balloon list. - */ - spin_unlock_irqrestore(&balloon_pages_lock, flags); - return false; - } - list_del(&page->lru); - b_dev_info->isolated_pages++; - spin_unlock_irqrestore(&balloon_pages_lock, flags); - - return true; -} - -static void balloon_page_putback(struct page *page) -{ - struct balloon_dev_info *b_dev_info = balloon_page_device(page); - unsigned long flags; - - /* - * When we isolated the page, the page was still inflated in a balloon - * device. As isolated balloon pages cannot get deflated, we still have - * a balloon device here. - */ - if (WARN_ON_ONCE(!b_dev_info)) - return; - - spin_lock_irqsave(&balloon_pages_lock, flags); - list_add(&page->lru, &b_dev_info->pages); - b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&balloon_pages_lock, flags); -} - -static int balloon_page_migrate(struct page *newpage, struct page *page, - enum migrate_mode mode) -{ - struct balloon_dev_info *b_dev_info = balloon_page_device(page); - unsigned long flags; - int rc; - - /* - * When we isolated the page, the page was still inflated in a balloon - * device. As isolated balloon pages cannot get deflated, we still have - * a balloon device here. - */ - if (WARN_ON_ONCE(!b_dev_info)) - return -EAGAIN; - - rc = b_dev_info->migratepage(b_dev_info, newpage, page, mode); - if (rc < 0 && rc != -ENOENT) - return rc; - - spin_lock_irqsave(&balloon_pages_lock, flags); - if (!rc) { - /* Insert the new page into the balloon list. */ - get_page(newpage); - balloon_page_insert(b_dev_info, newpage); - __count_vm_event(BALLOON_MIGRATE); - - if (b_dev_info->adjust_managed_page_count && - page_zone(page) != page_zone(newpage)) { - /* - * When we migrate a page to a different zone we - * have to fixup the count of both involved zones. 
- */ - adjust_managed_page_count(page, 1); - adjust_managed_page_count(newpage, -1); - } - } else { - /* Old page was deflated but new page not inflated. */ - __count_vm_event(BALLOON_DEFLATE); - - if (b_dev_info->adjust_managed_page_count) - adjust_managed_page_count(page, 1); - } - - b_dev_info->isolated_pages--; - - /* Free the now-deflated page we isolated in balloon_page_isolate(). */ - balloon_page_finalize(page); - spin_unlock_irqrestore(&balloon_pages_lock, flags); - - put_page(page); - - return 0; -} - -static const struct movable_operations balloon_mops = { - .migrate_page = balloon_page_migrate, - .isolate_page = balloon_page_isolate, - .putback_page = balloon_page_putback, -}; - -static int __init balloon_init(void) -{ - return set_movable_ops(&balloon_mops, PGTY_offline); -} -core_initcall(balloon_init); - -#endif /* CONFIG_BALLOON_COMPACTION */ -- cgit v1.2.3 From 7cf3318a25877c0908e450919f7e1517908e24f1 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:29 +0100 Subject: mm/kconfig: make BALLOON_COMPACTION depend on MIGRATION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migration support for balloon memory depends on MIGRATION not COMPACTION. Compaction is simply another user of page migration. The last dependency on compaction.c was effectively removed with commit 3d388584d599 ("mm: convert "movable" flag in page->mapping to a page flag"). Ever since, everything for handling movable_ops page migration resides in core migration code. So let's change the dependency and adjust the description + help text. We'll rename BALLOON_COMPACTION separately next. Link: https://lkml.kernel.org/r/20260119230133.3551867-22-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/Kconfig | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 5f4d6e5b5715..c5374f3cf1c8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -599,17 +599,14 @@ config MEMORY_BALLOON # # support for memory balloon compaction config BALLOON_COMPACTION - bool "Allow for balloon memory compaction/migration" + bool "Allow for balloon memory migration" default y - depends on COMPACTION && MEMORY_BALLOON - help - Memory fragmentation introduced by ballooning might reduce - significantly the number of 2MB contiguous memory blocks that can be - used within a guest, thus imposing performance penalties associated - with the reduced number of transparent huge pages that could be used - by the guest workload. Allowing the compaction & migration for memory - pages enlisted as being part of memory balloon devices avoids the - scenario aforementioned and helps improving memory defragmentation. + depends on MIGRATION && MEMORY_BALLOON + help + Allow for migration of pages inflated in a memory balloon such that + they can be allocated from memory areas only available for movable + allocations (e.g., ZONE_MOVABLE, CMA) and such that they can be + migrated for memory defragmentation purposes by memory compaction. 
# # support for memory compaction -- cgit v1.2.3 From cd8e95d80bc29b3c72288bd31e845b11755ef6a5 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:30 +0100 Subject: mm: rename CONFIG_BALLOON_COMPACTION to CONFIG_BALLOON_MIGRATION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While compaction depends on migration, the other direction is not the case. So let's make it clearer that this is all about migration of balloon pages. Adjust all comments/docs in the core to talk about "migration" instead of "compaction". While at it add some "/* CONFIG_BALLOON_MIGRATION */". Link: https://lkml.kernel.org/r/20260119230133.3551867-23-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/memory-hotplug.rst | 8 ++++---- arch/powerpc/platforms/pseries/cmm.c | 8 ++++---- drivers/misc/vmw_balloon.c | 8 ++++---- drivers/virtio/virtio_balloon.c | 6 +++--- include/linux/balloon.h | 12 ++++++------ include/linux/vm_event_item.h | 4 ++-- mm/Kconfig | 4 ++-- mm/balloon.c | 10 +++++----- mm/memory_hotplug.c | 4 ++-- mm/migrate.c | 2 +- mm/vmstat.c | 4 ++-- 11 files changed, 35 insertions(+), 35 deletions(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 6581558fd0d7..0207f8725142 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -603,11 +603,11 @@ ZONE_MOVABLE, especially when fine-tuning zone ratios: memory for metadata and page tables in the direct map; having a lot of offline memory blocks is not a typical case, though. -- Memory ballooning without balloon compaction is incompatible with - ZONE_MOVABLE. Only some implementations, such as virtio-balloon and - pseries CMM, fully support balloon compaction. +- Memory ballooning without support for balloon memory migration is incompatible + with ZONE_MOVABLE. Only some implementations, such as virtio-balloon and + pseries CMM, fully support balloon memory migration. - Further, the CONFIG_BALLOON_COMPACTION kernel configuration option might be + Further, the CONFIG_BALLOON_MIGRATION kernel configuration option might be disabled. In that case, balloon inflation will only perform unmovable allocations and silently create a zone imbalance, usually triggered by inflation requests from the hypervisor. 
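[Editorial sketch, not part of the patch] The driver hunks that follow (cmm.c and vmw_balloon.c in particular) rely on the same IS_ENABLED() pattern to wire up the migration callback only when the renamed option is set. A hedged sketch of that pattern, with hypothetical my_*() names standing in for the driver specifics:

#include <linux/balloon.h>
#include <linux/init.h>
#include <linux/kconfig.h>

static struct balloon_dev_info my_b_dev_info;

#ifdef CONFIG_BALLOON_MIGRATION
static int my_migratepage(struct balloon_dev_info *b_dev_info,
			  struct page *newpage, struct page *page,
			  enum migrate_mode mode)
{
	/* Driver-specific: point the hypervisor at the new backing page. */
	return 0;
}
#else /* CONFIG_BALLOON_MIGRATION */
/*
 * Declaration only: IS_ENABLED() below evaluates to 0, so the reference
 * is dead-code-eliminated and no definition is needed.
 */
int my_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage,
		   struct page *page, enum migrate_mode mode);
#endif /* CONFIG_BALLOON_MIGRATION */

static int __init my_balloon_driver_init(void)
{
	balloon_devinfo_init(&my_b_dev_info);
	if (IS_ENABLED(CONFIG_BALLOON_MIGRATION))
		my_b_dev_info.migratepage = my_migratepage;
	return 0;
}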
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 7a3c4922685a..8d83df12430f 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -494,7 +494,7 @@ static struct notifier_block cmm_mem_nb = { .priority = CMM_MEM_HOTPLUG_PRI }; -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION static int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -520,10 +520,10 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, plpar_page_set_active(page); return 0; } -#else /* CONFIG_BALLOON_COMPACTION */ +#else /* CONFIG_BALLOON_MIGRATION */ int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode); -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ /** * cmm_init - Module initialization @@ -540,7 +540,7 @@ static int cmm_init(void) balloon_devinfo_init(&b_dev_info); b_dev_info.adjust_managed_page_count = true; - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) b_dev_info.migratepage = cmm_migratepage; rc = register_oom_notifier(&cmm_oom_nb); diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 7fd3f709108c..216a16395968 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1719,7 +1719,7 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b) #endif /* CONFIG_DEBUG_FS */ -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION /** * vmballoon_migratepage() - migrates a balloon page. * @b_dev_info: balloon device information descriptor. @@ -1803,11 +1803,11 @@ out_unlock: up_read(&b->conf_sem); return ret; } -#else /* CONFIG_BALLOON_COMPACTION */ +#else /* CONFIG_BALLOON_MIGRATION */ int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode); -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ static int __init vmballoon_init(void) { @@ -1827,7 +1827,7 @@ static int __init vmballoon_init(void) return error; balloon_devinfo_init(&balloon.b_dev_info); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) balloon.b_dev_info.migratepage = vmballoon_migratepage; INIT_LIST_HEAD(&balloon.huge_pages); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index de8041c3285a..4e549abe59ff 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -802,7 +802,7 @@ static void report_free_page_func(struct work_struct *work) } } -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION /* * virtballoon_migratepage - perform the balloon page migration on behalf of * a compaction thread. 
(called under page lock) @@ -851,7 +851,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, mutex_unlock(&vb->balloon_lock); return 0; } -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ static unsigned long shrink_free_pages(struct virtio_balloon *vb, unsigned long pages_to_free) @@ -948,7 +948,7 @@ static int virtballoon_probe(struct virtio_device *vdev) if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) vb->vb_dev_info.adjust_managed_page_count = true; -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION vb->vb_dev_info.migratepage = virtballoon_migratepage; #endif if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { diff --git a/include/linux/balloon.h b/include/linux/balloon.h index 82585542300d..ca5b15150f42 100644 --- a/include/linux/balloon.h +++ b/include/linux/balloon.h @@ -22,9 +22,9 @@ * * As the page isolation scanning step a compaction thread does is a lockless * procedure (from a page standpoint), it might bring some racy situations while - * performing balloon page compaction. In order to sort out these racy scenarios - * and safely perform balloon's page compaction and migration we must, always, - * ensure following these simple rules: + * performing balloon page migration. In order to sort out these racy scenarios + * and safely perform balloon's page migration we must, always, ensure following + * these simple rules: * * i. Inflation/deflation must set/clear page->private under the * balloon_pages_lock @@ -45,10 +45,10 @@ /* * Balloon device information descriptor. - * This struct is used to allow the common balloon compaction interface + * This struct is used to allow the common balloon page migration interface * procedures to find the proper balloon device holding memory pages they'll - * have to cope for page compaction / migration, as well as it serves the - * balloon driver as a page book-keeper for its registered balloon devices. + * have to cope for page migration, as well as it serves the balloon driver as + * a page book-keeper for its registered balloon devices. 
*/ struct balloon_dev_info { unsigned long isolated_pages; /* # of isolated pages for migration */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 92f80b4d69a6..fca34d3473b6 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -125,9 +125,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_MEMORY_BALLOON BALLOON_INFLATE, BALLOON_DEFLATE, -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION BALLOON_MIGRATE, -#endif +#endif /* CONFIG_BALLOON_MIGRATION */ #endif #ifdef CONFIG_DEBUG_TLBFLUSH NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ diff --git a/mm/Kconfig b/mm/Kconfig index c5374f3cf1c8..cd6896c1ba7d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -597,8 +597,8 @@ config MEMORY_BALLOON bool # -# support for memory balloon compaction -config BALLOON_COMPACTION +# support for memory balloon page migration +config BALLOON_MIGRATION bool "Allow for balloon memory migration" default y depends on MIGRATION && MEMORY_BALLOON diff --git a/mm/balloon.c b/mm/balloon.c index 0f068b97e5d8..96a8f1e20bc6 100644 --- a/mm/balloon.c +++ b/mm/balloon.c @@ -29,7 +29,7 @@ static void balloon_page_insert(struct balloon_dev_info *balloon, { lockdep_assert_held(&balloon_pages_lock); __SetPageOffline(page); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) { SetPageMovableOps(page); set_page_private(page, (unsigned long)balloon); } @@ -46,7 +46,7 @@ static void balloon_page_insert(struct balloon_dev_info *balloon, static void balloon_page_finalize(struct page *page) { lockdep_assert_held(&balloon_pages_lock); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) set_page_private(page, 0); /* PageOffline is sticky until the page is freed to the buddy. */ } @@ -148,7 +148,7 @@ struct page *balloon_page_alloc(void) { gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) gfp_flags |= GFP_HIGHUSER_MOVABLE; else gfp_flags |= GFP_HIGHUSER; @@ -227,7 +227,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) } EXPORT_SYMBOL_GPL(balloon_page_dequeue); -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION static struct balloon_dev_info *balloon_page_device(struct page *page) { return (struct balloon_dev_info *)page_private(page); @@ -341,4 +341,4 @@ static int __init balloon_init(void) } core_initcall(balloon_init); -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 389989a28abe..bc805029da51 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -946,8 +946,8 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * We rely on "present pages" instead of "managed pages", as the latter is * highly unreliable and dynamic in virtualized environments, and does not * consider boot time allocations. For example, memory ballooning adjusts the - * managed pages when inflating/deflating the balloon, and balloon compaction - * can even migrate inflated pages between zones. + * managed pages when inflating/deflating the balloon, and balloon page + * migration can even migrate inflated pages between zones. 
* * Using "present pages" is better but some things to keep in mind are: * diff --git a/mm/migrate.c b/mm/migrate.c index 4750a2ba15fe..1bf2cf8c44dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -88,7 +88,7 @@ static const struct movable_operations *page_movable_ops(struct page *page) * back to the buddy. */ if (PageOffline(page)) - /* Only balloon compaction sets PageOffline pages movable. */ + /* Only balloon page migration sets PageOffline pages movable. */ return offline_movable_ops; if (PageZsmalloc(page)) return zsmalloc_movable_ops; diff --git a/mm/vmstat.c b/mm/vmstat.c index 6ae8891c9693..e96a344ab597 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1422,9 +1422,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_MEMORY_BALLOON [I(BALLOON_INFLATE)] = "balloon_inflate", [I(BALLOON_DEFLATE)] = "balloon_deflate", -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION [I(BALLOON_MIGRATE)] = "balloon_migrate", -#endif +#endif /* CONFIG_BALLOON_MIGRATION */ #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush", -- cgit v1.2.3 From 1421758055ca6028d3b758914863f38d434bf36b Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:31 +0100 Subject: mm: rename CONFIG_MEMORY_BALLOON -> CONFIG_BALLOON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's make it consistent with the naming of the files but also with the naming of CONFIG_BALLOON_MIGRATION. While at it, add a "/* CONFIG_BALLOON */". Link: https://lkml.kernel.org/r/20260119230133.3551867-24-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Acked-by: Michael S. Tsirkin Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/Kconfig | 2 +- drivers/misc/Kconfig | 2 +- drivers/virtio/Kconfig | 2 +- include/linux/vm_event_item.h | 4 ++-- mm/Kconfig | 4 ++-- mm/Makefile | 2 +- mm/vmstat.c | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 3e042218d6cd..f7052b131a4c 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -120,7 +120,7 @@ config PPC_SMLPAR config CMM tristate "Collaborative memory management" depends on PPC_SMLPAR - select MEMORY_BALLOON + select BALLOON default y help Select this option, if you want to enable the kernel interface diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index d7d41b054b98..5cc79d1517af 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -410,7 +410,7 @@ config DS1682 config VMWARE_BALLOON tristate "VMware Balloon Driver" depends on VMWARE_VMCI && X86 && HYPERVISOR_GUEST - select MEMORY_BALLOON + select BALLOON help This is VMware physical memory management driver which acts like a "balloon" that can be inflated to reclaim physical pages diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 6db5235a7693..ce5bc0d9ea28 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -112,7 +112,7 @@ config VIRTIO_PMEM config VIRTIO_BALLOON tristate "Virtio balloon 
driver" depends on VIRTIO - select MEMORY_BALLOON + select BALLOON select PAGE_REPORTING help This driver supports increasing and decreasing the amount diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index fca34d3473b6..22a139f82d75 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -122,13 +122,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SWPOUT, THP_SWPOUT_FALLBACK, #endif -#ifdef CONFIG_MEMORY_BALLOON +#ifdef CONFIG_BALLOON BALLOON_INFLATE, BALLOON_DEFLATE, #ifdef CONFIG_BALLOON_MIGRATION BALLOON_MIGRATE, #endif /* CONFIG_BALLOON_MIGRATION */ -#endif +#endif /* CONFIG_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ diff --git a/mm/Kconfig b/mm/Kconfig index cd6896c1ba7d..d1d76ce7373e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -593,7 +593,7 @@ config SPLIT_PMD_PTLOCKS # # support for memory balloon -config MEMORY_BALLOON +config BALLOON bool # @@ -601,7 +601,7 @@ config MEMORY_BALLOON config BALLOON_MIGRATION bool "Allow for balloon memory migration" default y - depends on MIGRATION && MEMORY_BALLOON + depends on MIGRATION && BALLOON help Allow for migration of pages inflated in a memory balloon such that they can be allocated from memory areas only available for movable diff --git a/mm/Makefile b/mm/Makefile index 1e31e0a528dc..0d85b10dbdde 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -122,7 +122,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_MEMORY_BALLOON) += balloon.o +obj-$(CONFIG_BALLOON) += balloon.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o diff --git a/mm/vmstat.c b/mm/vmstat.c index e96a344ab597..0f64c898f79f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1419,13 +1419,13 @@ const char * const vmstat_text[] = { [I(THP_SWPOUT)] = "thp_swpout", [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback", #endif -#ifdef CONFIG_MEMORY_BALLOON +#ifdef CONFIG_BALLOON [I(BALLOON_INFLATE)] = "balloon_inflate", [I(BALLOON_DEFLATE)] = "balloon_deflate", #ifdef CONFIG_BALLOON_MIGRATION [I(BALLOON_MIGRATE)] = "balloon_migrate", #endif /* CONFIG_BALLOON_MIGRATION */ -#endif /* CONFIG_MEMORY_BALLOON */ +#endif /* CONFIG_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush", [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received", -- cgit v1.2.3 From c0f609f799212e8ae0086b83a24ac616e0cd5696 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 20 Jan 2026 00:01:32 +0100 Subject: MAINTAINERS: move memory balloon infrastructure to "MEMORY MANAGEMENT - BALLOON" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nowadays, there is nothing virtio-balloon specific anymore about these files, the basic infrastructure is used by multiple memory balloon drivers. For now we'll route it through Andrew's tree, maybe in some future it makes sense to route this through a separate tree. 
Link: https://lkml.kernel.org/r/20260119230133.3551867-25-david@kernel.org Signed-off-by: David Hildenbrand (Red Hat) Acked-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jerrin Shaji George Cc: Jonathan Corbet Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michael S. Tsirkin Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- MAINTAINERS | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a4535ec654dc..b4088f7290be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16458,6 +16458,17 @@ T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new F: mm/ F: tools/mm/ +MEMORY MANAGEMENT - BALLOON +M: Andrew Morton +M: David Hildenbrand +L: linux-mm@kvack.org +L: virtualization@lists.linux.dev +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: include/linux/balloon.h +F: mm/balloon.c + MEMORY MANAGEMENT - CORE M: Andrew Morton M: David Hildenbrand @@ -27546,9 +27557,7 @@ M: David Hildenbrand L: virtualization@lists.linux.dev S: Maintained F: drivers/virtio/virtio_balloon.c -F: include/linux/balloon.h F: include/uapi/linux/virtio_balloon.h -F: mm/balloon.c VIRTIO BLOCK AND SCSI DRIVERS M: "Michael S. Tsirkin" -- cgit v1.2.3 From 6efc548d8a08ae918020225e16d040ce3903bff7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 15 Jan 2026 17:08:07 +0900 Subject: zram: rename init_lock to dev_lock init_lock has completely outgrown its initial purpose and is no longer used only to "prevent concurrent execution of device init" as the stale comment suggests. The scope of this lock is much bigger now. These days this lock (rw_semaphore) controls how a task owns the corresponding zram device: either in shared mode or in exclusive mode. All zram device attribute writes should own the device in exclusive mode, which synchronizes these tasks and prevents, for example, concurrent execution of recompression and writeback. All zram device attribute reads should own the device in shared mode. Rename the lock to dev_lock to better reflect its current purpose. 
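To make the ownership rule above concrete, here is a minimal, hypothetical sketch of the same reader/writer discipline on a per-device rw_semaphore. The struct and attribute names are invented for the example, and it open-codes down_read()/down_write() rather than the guard() cleanup helpers the actual driver uses (visible in the diff below):

    #include <linux/rwsem.h>
    #include <linux/sysfs.h>

    /* Hypothetical device; stands in for struct zram in this example only. */
    struct demo_dev {
            struct rw_semaphore dev_lock;   /* shared vs. exclusive device ownership */
            unsigned long limit_pages;
    };

    /* Attribute read: own the device in shared mode. */
    static ssize_t demo_limit_show(struct demo_dev *dev, char *buf)
    {
            ssize_t ret;

            down_read(&dev->dev_lock);
            ret = sysfs_emit(buf, "%lu\n", dev->limit_pages);
            up_read(&dev->dev_lock);
            return ret;
    }

    /*
     * Attribute write: own the device exclusively, which serializes it
     * against every other attribute write (e.g. recompression vs. writeback).
     */
    static ssize_t demo_limit_store(struct demo_dev *dev, unsigned long val)
    {
            down_write(&dev->dev_lock);
            dev->limit_pages = val;
            up_write(&dev->dev_lock);
            return 0;
    }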
Link: https://lkml.kernel.org/r/20260115080807.3957860-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Andrew Morton Cc: Brian Geffon Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 60 +++++++++++++++++++++---------------------- drivers/block/zram/zram_drv.h | 4 +-- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f92845ef9192..61d3e2c74901 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -365,7 +365,7 @@ static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, u32 val; struct zram *zram = dev_to_zram(dev); - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); val = init_done(zram); return sysfs_emit(buf, "%u\n", val); @@ -391,7 +391,7 @@ static ssize_t mem_limit_store(struct device *dev, if (buf == tmp) /* no chars parsed, invalid input */ return -EINVAL; - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; return len; @@ -409,7 +409,7 @@ static ssize_t mem_used_max_store(struct device *dev, if (err || val != 0) return -EINVAL; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); if (init_done(zram)) { atomic_long_set(&zram->stats.max_used_pages, zs_get_total_pages(zram->mem_pool)); @@ -477,7 +477,7 @@ static ssize_t idle_store(struct device *dev, struct device_attribute *attr, return -EINVAL; } - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); if (!init_done(zram)) return -EINVAL; @@ -539,7 +539,7 @@ static ssize_t bd_stat_show(struct device *dev, struct device_attribute *attr, struct zram *zram = dev_to_zram(dev); ssize_t ret; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); ret = sysfs_emit(buf, "%8llu %8llu %8llu\n", FOUR_K((u64)atomic64_read(&zram->stats.bd_count)), @@ -559,7 +559,7 @@ static ssize_t writeback_compressed_store(struct device *dev, if (kstrtobool(buf, &val)) return -EINVAL; - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); if (init_done(zram)) { return -EBUSY; } @@ -576,7 +576,7 @@ static ssize_t writeback_compressed_show(struct device *dev, bool val; struct zram *zram = dev_to_zram(dev); - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); val = zram->wb_compressed; return sysfs_emit(buf, "%d\n", val); @@ -592,7 +592,7 @@ static ssize_t writeback_limit_enable_store(struct device *dev, if (kstrtoull(buf, 10, &val)) return -EINVAL; - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); zram->wb_limit_enable = val; return len; @@ -605,7 +605,7 @@ static ssize_t writeback_limit_enable_show(struct device *dev, bool val; struct zram *zram = dev_to_zram(dev); - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); val = zram->wb_limit_enable; return sysfs_emit(buf, "%d\n", val); @@ -631,7 +631,7 @@ static ssize_t writeback_limit_store(struct device *dev, */ val = rounddown(val, PAGE_SIZE / 4096); - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); zram->bd_wb_limit = val; return len; @@ -643,7 +643,7 @@ static ssize_t writeback_limit_show(struct device *dev, u64 val; struct zram *zram = dev_to_zram(dev); - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); val = zram->bd_wb_limit; return sysfs_emit(buf, "%llu\n", val); @@ -662,7 +662,7 @@ 
static ssize_t writeback_batch_size_store(struct device *dev, if (!val) return -EINVAL; - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); zram->wb_batch_size = val; return len; @@ -675,7 +675,7 @@ static ssize_t writeback_batch_size_show(struct device *dev, u32 val; struct zram *zram = dev_to_zram(dev); - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); val = zram->wb_batch_size; return sysfs_emit(buf, "%u\n", val); @@ -703,7 +703,7 @@ static ssize_t backing_dev_show(struct device *dev, char *p; ssize_t ret; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); file = zram->backing_dev; if (!file) { memcpy(buf, "none\n", 5); @@ -737,7 +737,7 @@ static ssize_t backing_dev_store(struct device *dev, if (!file_name) return -ENOMEM; - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); if (init_done(zram)) { pr_info("Can't setup backing device for initialized device\n"); err = -EBUSY; @@ -901,7 +901,7 @@ release_wb_ctl: static void zram_account_writeback_rollback(struct zram *zram) { - lockdep_assert_held_write(&zram->init_lock); + lockdep_assert_held_write(&zram->dev_lock); if (zram->wb_limit_enable) zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12); @@ -909,7 +909,7 @@ static void zram_account_writeback_rollback(struct zram *zram) static void zram_account_writeback_submit(struct zram *zram) { - lockdep_assert_held_write(&zram->init_lock); + lockdep_assert_held_write(&zram->dev_lock); if (zram->wb_limit_enable && zram->bd_wb_limit > 0) zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); @@ -1263,7 +1263,7 @@ static ssize_t writeback_store(struct device *dev, ssize_t ret = len; int err, mode = 0; - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); if (!init_done(zram)) return -EINVAL; @@ -1565,7 +1565,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf, if (!kbuf) return -ENOMEM; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); if (!init_done(zram)) { kvfree(kbuf); return -EINVAL; @@ -1666,7 +1666,7 @@ static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf) return -EINVAL; } - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); if (init_done(zram)) { kfree(compressor); pr_info("Can't change algorithm for initialized device\n"); @@ -1794,7 +1794,7 @@ static ssize_t comp_algorithm_show(struct device *dev, struct zram *zram = dev_to_zram(dev); ssize_t sz; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf, 0); return sz; } @@ -1820,7 +1820,7 @@ static ssize_t recomp_algorithm_show(struct device *dev, ssize_t sz = 0; u32 prio; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { if (!zram->comp_algs[prio]) continue; @@ -1878,7 +1878,7 @@ static ssize_t compact_store(struct device *dev, struct device_attribute *attr, { struct zram *zram = dev_to_zram(dev); - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); if (!init_done(zram)) return -EINVAL; @@ -1893,7 +1893,7 @@ static ssize_t io_stat_show(struct device *dev, struct device_attribute *attr, struct zram *zram = dev_to_zram(dev); ssize_t ret; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); ret = sysfs_emit(buf, "%8llu %8llu 0 %8llu\n", (u64)atomic64_read(&zram->stats.failed_reads), @@ -1914,7 +1914,7 
@@ static ssize_t mm_stat_show(struct device *dev, struct device_attribute *attr, memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats)); - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); if (init_done(zram)) { mem_used = zs_get_total_pages(zram->mem_pool); zs_pool_stats(zram->mem_pool, &pool_stats); @@ -1945,7 +1945,7 @@ static ssize_t debug_stat_show(struct device *dev, struct zram *zram = dev_to_zram(dev); ssize_t ret; - guard(rwsem_read)(&zram->init_lock); + guard(rwsem_read)(&zram->dev_lock); ret = sysfs_emit(buf, "version: %d\n0 %8llu\n", version, @@ -2611,7 +2611,7 @@ static ssize_t recompress_store(struct device *dev, if (threshold >= huge_class_size) return -EINVAL; - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); if (!init_done(zram)) return -EINVAL; @@ -2863,7 +2863,7 @@ static void zram_destroy_comps(struct zram *zram) static void zram_reset_device(struct zram *zram) { - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); zram->limit_pages = 0; @@ -2893,7 +2893,7 @@ static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, if (!disksize) return -EINVAL; - guard(rwsem_write)(&zram->init_lock); + guard(rwsem_write)(&zram->dev_lock); if (init_done(zram)) { pr_info("Cannot change disksize for initialized device\n"); return -EBUSY; @@ -3088,7 +3088,7 @@ static int zram_add(void) goto out_free_dev; device_id = ret; - init_rwsem(&zram->init_lock); + init_rwsem(&zram->dev_lock); #ifdef CONFIG_ZRAM_WRITEBACK zram->wb_batch_size = 32; zram->wb_compressed = false; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 469a3dab44ad..515a72d9c06f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -111,8 +111,8 @@ struct zram { struct zcomp *comps[ZRAM_MAX_COMPS]; struct zcomp_params params[ZRAM_MAX_COMPS]; struct gendisk *disk; - /* Prevent concurrent execution of device init */ - struct rw_semaphore init_lock; + /* Locks the device either in exclusive or in shared mode */ + struct rw_semaphore dev_lock; /* * the number of pages zram can consume for storing compressed data */ -- cgit v1.2.3 From d468d8f86d80383e52ab6cf59e916b9f5578d46a Mon Sep 17 00:00:00 2001 From: Manish Kumar Date: Fri, 16 Jan 2026 01:01:00 +0530 Subject: mm: drop filename from page_alloc.c header comment The file name in the header comment is redundant and not useful, as the location is already known from the path. Remove it to align with kernel coding style. No functional change. Link: https://lkml.kernel.org/r/20260115193100.116109-1-manish1588@gmail.com Signed-off-by: Manish Kumar Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0bb57c4e851..e779b18168de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/mm/page_alloc.c * * Manages the free list, the system allocates free pages here. * Note that kmalloc() lives in slab.c -- cgit v1.2.3 From 77bcee8d4015a1191e1e3f5c5c51589086493ab0 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Thu, 15 Jan 2026 03:15:36 +0000 Subject: alloc_tag: fix rw permission issue when handling boot parameter Boot parameters prefixed with "sysctl." are processed during the final stage of system initialization via kernel_init()-> do_sysctl_args(). 
When CONFIG_MEM_ALLOC_PROFILING_DEBUG is enabled, the sysctl.vm.mem_profiling entry is not writable and will cause a warning. Before run_init_process(), system initialization executes in kernel thread context. Use current->mm to distinguish sysctl writes during do_sysctl_args() from user-space triggered ones. And when the proc_handler is from do_sysctl_args(), always return success because the same value was already set by setup_early_mem_profiling() and this eliminates a permission denied warning. Link: https://lkml.kernel.org/r/20260115031536.164254-1-ranxiaokai627@163.com Signed-off-by: Ran Xiaokai Suggested-by: Suren Baghdasaryan Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 846a5b5b44a4..00ae4673a271 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -776,8 +776,22 @@ EXPORT_SYMBOL(page_alloc_tagging_ops); static int proc_mem_profiling_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - if (!mem_profiling_support && write) - return -EINVAL; + if (write) { + /* + * Call from do_sysctl_args() which is a no-op since the same + * value was already set by setup_early_mem_profiling. + * Return success to avoid warnings from do_sysctl_args(). + */ + if (!current->mm) + return 0; + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + /* User can't toggle profiling while debugging */ + return -EACCES; +#endif + if (!mem_profiling_support) + return -EINVAL; + } return proc_do_static_key(table, write, buffer, lenp, ppos); } @@ -787,11 +801,7 @@ static const struct ctl_table memory_allocation_profiling_sysctls[] = { { .procname = "mem_profiling", .data = &mem_alloc_profiling_key, -#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG - .mode = 0444, -#else .mode = 0644, -#endif .proc_handler = proc_mem_profiling_handler, }, }; -- cgit v1.2.3 From 5898aa8f9a0b42fe1f65c7364010ab15ec5c38bf Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 14 Jan 2026 09:36:42 -0500 Subject: mm: fix OOM killer inaccuracy on large many-core systems Use the precise, albeit slower, precise RSS counter sums for the OOM killer task selection and console dumps. The approximated value is too imprecise on large many-core systems. The following rss tracking issues were noted by Sweet Tea Dorminy [1], which lead to picking wrong tasks as OOM kill target: Recently, several internal services had an RSS usage regression as part of a kernel upgrade. Previously, they were on a pre-6.2 kernel and were able to read RSS statistics in a backup watchdog process to monitor and decide if they'd overrun their memory budget. Now, however, a representative service with five threads, expected to use about a hundred MB of memory, on a 250-cpu machine had memory usage tens of megabytes different from the expected amount -- this constituted a significant percentage of inaccuracy, causing the watchdog to act. This was a result of commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") [1]. Previously, the memory error was bounded by 64*nr_threads pages, a very livable megabyte. Now, however, as a result of scheduler decisions moving the threads around the CPUs, the memory error could be as large as a gigabyte. This is a really tremendous inaccuracy for any few-threaded program on a large machine and impedes monitoring significantly. 
These stat counters are also used to make OOM killing decisions, so this additional inaccuracy could make a big difference in OOM situations -- either resulting in the wrong process being killed, or in less memory being returned from an OOM-kill than expected. Here is a (possibly incomplete) list of the prior approaches that were used or proposed, along with their downside: 1) Per-thread rss tracking: large error on many-thread processes. 2) Per-CPU counters: up to 12% slower for short-lived processes and 9% increased system time in make test workloads [1]. Moreover, the inaccuracy increases with O(n^2) with the number of CPUs. 3) Per-NUMA-node counters: requires atomics on fast-path (overhead), error is high with systems that have lots of NUMA nodes (32 times the number of NUMA nodes). commit 82241a83cd15 ("mm: fix the inaccurate memory statistics issue for users") introduced get_mm_counter_sum() for precise proc memory status queries for some proc files. The simple fix proposed here is to do the precise per-cpu counters sum every time a counter value needs to be read. This applies to the OOM killer task selection, oom task console dumps (printk). This change increases the latency introduced when the OOM killer executes in favor of doing a more precise OOM target task selection. Effectively, the OOM killer iterates on all tasks, for all relevant page types, for which the precise sum iterates on all possible CPUs. As a reference, here is the execution time of the OOM killer before/after the change: AMD EPYC 9654 96-Core (2 sockets) Within a KVM, configured with 256 logical cpus. | before | after | ----------------------------------|----------|----------| nr_processes=40 | 0.3 ms | 0.5 ms | nr_processes=10000 | 3.0 ms | 80.0 ms | Link: https://lkml.kernel.org/r/20260114143642.47333-1-mathieu.desnoyers@efficios.com Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") Link: https://lore.kernel.org/lkml/20250331223516.7810-2-sweettea-kernel@dorminy.me/ # [1] Signed-off-by: Mathieu Desnoyers Suggested-by: Michal Hocko Acked-by: Michal Hocko Reviewed-by: Baolin Wang Acked-by: Vlastimil Babka Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Martin Liu Cc: David Rientjes Cc: Shakeel Butt Cc: SeongJae Park Cc: Michal Hocko Cc: Johannes Weiner Cc: Sweet Tea Dorminy Cc: Lorenzo Stoakes Cc: "Liam R . 
Howlett" Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Christian Brauner Cc: Wei Yang Cc: David Hildenbrand Cc: Miaohe Lin Cc: Al Viro Cc: Yu Zhao Cc: Roman Gushchin Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Aboorva Devarajan Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ mm/oom_kill.c | 22 +++++++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index aacabf8a0b58..aa90719234f1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2906,6 +2906,13 @@ static inline unsigned long get_mm_rss(struct mm_struct *mm) get_mm_counter(mm, MM_SHMEMPAGES); } +static inline unsigned long get_mm_rss_sum(struct mm_struct *mm) +{ + return get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_ANONPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); +} + static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 94066316e3ec..5c6c95c169ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -228,7 +228,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + points = get_mm_rss_sum(p->mm) + get_mm_counter_sum(p->mm, MM_SWAPENTS) + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); @@ -402,10 +402,10 @@ static int dump_task(struct task_struct *p, void *arg) pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), - task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES), - get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), - get_mm_counter(task->mm, MM_SWAPENTS), + task->tgid, task->mm->total_vm, get_mm_rss_sum(task->mm), + get_mm_counter_sum(task->mm, MM_ANONPAGES), get_mm_counter_sum(task->mm, MM_FILEPAGES), + get_mm_counter_sum(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), + get_mm_counter_sum(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -604,9 +604,9 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, - K(get_mm_counter(mm, MM_ANONPAGES)), - K(get_mm_counter(mm, MM_FILEPAGES)), - K(get_mm_counter(mm, MM_SHMEMPAGES))); + K(get_mm_counter_sum(mm, MM_ANONPAGES)), + K(get_mm_counter_sum(mm, MM_FILEPAGES)), + K(get_mm_counter_sum(mm, MM_SHMEMPAGES))); out_finish: trace_finish_task_reaping(tsk->pid); out_unlock: @@ -960,9 +960,9 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) mark_oom_victim(victim); pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%d\n", message, task_pid_nr(victim), victim->comm, K(mm->total_vm), - K(get_mm_counter(mm, MM_ANONPAGES)), - K(get_mm_counter(mm, MM_FILEPAGES)), - K(get_mm_counter(mm, MM_SHMEMPAGES)), + K(get_mm_counter_sum(mm, MM_ANONPAGES)), + K(get_mm_counter_sum(mm, MM_FILEPAGES)), + K(get_mm_counter_sum(mm, MM_SHMEMPAGES)), from_kuid(&init_user_ns, task_uid(victim)), mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); task_unlock(victim); -- cgit v1.2.3 From 
dc9fe9b7056a44ad65715def880e7d91d32c047f Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 20 Jan 2026 10:43:48 +0800 Subject: mm/vmscan: mitigate spurious kswapd_failures reset from direct reclaim Patch series "mm/vmscan: add tracepoint and reason for kswapd_failures reset", v4. Currently, kswapd_failures is reset in multiple places (kswapd, direct reclaim, PCP freeing, memory-tiers), but there's no way to trace when and why it was reset, making it difficult to debug memory reclaim issues. This patch: 1. Introduce kswapd_clear_hopeless() as a wrapper function to centralize kswapd_failures reset logic. 2. Introduce kswapd_test_hopeless() to encapsulate hopeless node checks, replacing all open-coded kswapd_failures comparisons. 3. Add kswapd_clear_hopeless_reason enum to distinguish reset sources: - KSWAPD_CLEAR_HOPELESS_KSWAPD: reset from kswapd context - KSWAPD_CLEAR_HOPELESS_DIRECT: reset from direct reclaim - KSWAPD_CLEAR_HOPELESS_PCP: reset from PCP page freeing - KSWAPD_CLEAR_HOPELESS_OTHER: reset from other paths 4. Add tracepoints for better observability: - mm_vmscan_kswapd_clear_hopeless: traces each reset with reason - mm_vmscan_kswapd_reclaim_fail: traces each kswapd reclaim failure Test results: $ trace-cmd record -e vmscan:mm_vmscan_kswapd_clear_hopeless -e vmscan:mm_vmscan_kswapd_reclaim_fail $ # generate memory pressure $ trace-cmd report cpus=4 kswapd0-71 [000] 27.216563: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1 kswapd0-71 [000] 27.217169: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2 kswapd0-71 [000] 27.217764: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3 kswapd0-71 [000] 27.218353: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4 kswapd0-71 [000] 27.218993: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5 kswapd0-71 [000] 27.219744: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6 kswapd0-71 [000] 27.220488: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7 kswapd0-71 [000] 27.221206: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8 kswapd0-71 [000] 27.221806: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9 kswapd0-71 [000] 27.222634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10 kswapd0-71 [000] 27.223286: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11 kswapd0-71 [000] 27.223894: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12 kswapd0-71 [000] 27.224712: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13 kswapd0-71 [000] 27.225424: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14 kswapd0-71 [000] 27.226082: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15 kswapd0-71 [000] 27.226810: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16 kswapd1-72 [002] 27.386869: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=1 kswapd1-72 [002] 27.387435: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=2 kswapd1-72 [002] 27.388016: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=3 kswapd1-72 [002] 27.388586: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=4 kswapd1-72 [002] 27.389155: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=5 kswapd1-72 [002] 27.389723: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=6 kswapd1-72 [002] 27.390292: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=7 kswapd1-72 [002] 27.392364: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=8 kswapd1-72 [002] 27.392934: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=9 kswapd1-72 [002] 27.393504: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=10 kswapd1-72 [002] 27.394073: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=11 kswapd1-72 [002] 27.394899: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=12 kswapd1-72 [002] 27.395472: 
mm_vmscan_kswapd_reclaim_fail: nid=1 failures=13 kswapd1-72 [002] 27.396055: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=14 kswapd1-72 [002] 27.396628: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=15 kswapd1-72 [002] 27.397199: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=16 kworker/u18:0-40 [002] 27.410151: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=DIRECT kswapd0-71 [000] 27.439454: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1 kswapd0-71 [000] 27.440048: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2 kswapd0-71 [000] 27.440634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3 kswapd0-71 [000] 27.441211: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4 kswapd0-71 [000] 27.441787: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5 kswapd0-71 [000] 27.442363: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6 kswapd0-71 [000] 27.443030: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7 kswapd0-71 [000] 27.443725: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8 kswapd0-71 [000] 27.444315: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9 kswapd0-71 [000] 27.444898: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10 kswapd0-71 [000] 27.445476: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11 kswapd0-71 [000] 27.446053: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12 kswapd0-71 [000] 27.446646: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13 kswapd0-71 [000] 27.447230: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14 kswapd0-71 [000] 27.447812: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15 kswapd0-71 [000] 27.448391: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16 ann-423 [003] 28.028285: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=PCP This patch (of 2): When kswapd fails to reclaim memory, kswapd_failures is incremented. Once it reaches MAX_RECLAIM_RETRIES, kswapd stops running to avoid futile reclaim attempts. However, any successful direct reclaim unconditionally resets kswapd_failures to 0, which can cause problems. We observed an issue in production on a multi-NUMA system where a process allocated large amounts of anonymous pages on a single NUMA node, causing its watermark to drop below high and evicting most file pages: $ numastat -m Per-node system memory usage (in MBs): Node 0 Node 1 Total --------------- --------------- --------------- MemTotal 128222.19 127983.91 256206.11 MemFree 1414.48 1432.80 2847.29 MemUsed 126807.71 126551.11 252358.82 SwapCached 0.00 0.00 0.00 Active 29017.91 25554.57 54572.48 Inactive 92749.06 95377.00 188126.06 Active(anon) 28998.96 23356.47 52355.43 Inactive(anon) 92685.27 87466.11 180151.39 Active(file) 18.95 2198.10 2217.05 Inactive(file) 63.79 7910.89 7974.68 With swap disabled, only file pages can be reclaimed. When kswapd is woken (e.g., via wake_all_kswapds()), it runs continuously but cannot raise free memory above the high watermark since reclaimable file pages are insufficient. Normally, kswapd would eventually stop after kswapd_failures reaches MAX_RECLAIM_RETRIES. However, containers on this machine have memory.high set in their cgroup. Business processes continuously trigger the high limit, causing frequent direct reclaim that keeps resetting kswapd_failures to 0. This prevents kswapd from ever stopping. The key insight is that direct reclaim triggered by cgroup memory.high performs aggressive scanning to throttle the allocating process. With sufficiently aggressive scanning, even hot pages will eventually be reclaimed, making direct reclaim "successful" at freeing some memory. 
However, this success does not mean the node has reached a balanced state - the freed memory may still be insufficient to bring free pages above the high watermark. Unconditionally resetting kswapd_failures in this case keeps kswapd alive indefinitely. The result is that kswapd runs endlessly. Unlike direct reclaim which only reclaims from the allocating cgroup, kswapd scans the entire node's memory. This causes hot file pages from all workloads on the node to be evicted, not just those from the cgroup triggering memory.high. These pages constantly refault, generating sustained heavy IO READ pressure across the entire system. Fix this by only resetting kswapd_failures when the node is actually balanced. This allows both kswapd and direct reclaim to clear kswapd_failures upon successful reclaim, but only when the reclaim actually resolves the memory pressure (i.e., the node becomes balanced). Link: https://lkml.kernel.org/r/20260120024402.387576-1-jiayuan.chen@linux.dev Link: https://lkml.kernel.org/r/20260120024402.387576-2-jiayuan.chen@linux.dev Signed-off-by: Jiayuan Chen Signed-off-by: Jiayuan Chen Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ mm/vmscan.c | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index eb3815fc94ad..8881198e85c6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1536,6 +1536,8 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); +void kswapd_try_clear_hopeless(struct pglist_data *pgdat, + unsigned int order, int highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index b33039000d6e..5d9b1bce6f01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5065,7 +5065,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx); } /****************************************************************************** @@ -6132,7 +6132,7 @@ again: * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } @@ -7391,6 +7391,24 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, wake_up_interruptible(&pgdat->kswapd_wait); } +static void kswapd_clear_hopeless(pg_data_t *pgdat) +{ + atomic_set(&pgdat->kswapd_failures, 0); +} + +/* + * Reset kswapd_failures only when the node is balanced. Without this + * check, successful direct reclaim (e.g., from cgroup memory.high + * throttling) can keep resetting kswapd_failures even when the node + * cannot be balanced, causing kswapd to run endlessly. 
+ */ +void kswapd_try_clear_hopeless(struct pglist_data *pgdat, + unsigned int order, int highest_zoneidx) +{ + if (pgdat_balanced(pgdat, order, highest_zoneidx)) + kswapd_clear_hopeless(pgdat); +} + #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of -- cgit v1.2.3 From a45088376d8a847a5e3b1982fcfceb41644e3b1d Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 20 Jan 2026 10:43:49 +0800 Subject: mm/vmscan: add tracepoint and reason for kswapd_failures reset Currently, kswapd_failures is reset in multiple places (kswapd, direct reclaim, PCP freeing, memory-tiers), but there's no way to trace when and why it was reset, making it difficult to debug memory reclaim issues. This patch: 1. Introduce kswapd_clear_hopeless() as a wrapper function to centralize kswapd_failures reset logic. 2. Introduce kswapd_test_hopeless() to encapsulate hopeless node checks, replacing all open-coded kswapd_failures comparisons. 3. Add kswapd_clear_hopeless_reason enum to distinguish reset sources: - KSWAPD_CLEAR_HOPELESS_KSWAPD: reset from kswapd context - KSWAPD_CLEAR_HOPELESS_DIRECT: reset from direct reclaim - KSWAPD_CLEAR_HOPELESS_PCP: reset from PCP page freeing - KSWAPD_CLEAR_HOPELESS_OTHER: reset from other paths 4. Add tracepoints for better observability: - mm_vmscan_kswapd_clear_hopeless: traces each reset with reason - mm_vmscan_kswapd_reclaim_fail: traces each kswapd reclaim failure Test results: $ trace-cmd record -e vmscan:mm_vmscan_kswapd_clear_hopeless -e vmscan:mm_vmscan_kswapd_reclaim_fail $ # generate memory pressure $ trace-cmd report cpus=4 kswapd0-71 [000] 27.216563: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1 kswapd0-71 [000] 27.217169: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2 kswapd0-71 [000] 27.217764: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3 kswapd0-71 [000] 27.218353: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4 kswapd0-71 [000] 27.218993: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5 kswapd0-71 [000] 27.219744: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6 kswapd0-71 [000] 27.220488: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7 kswapd0-71 [000] 27.221206: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8 kswapd0-71 [000] 27.221806: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9 kswapd0-71 [000] 27.222634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10 kswapd0-71 [000] 27.223286: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11 kswapd0-71 [000] 27.223894: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12 kswapd0-71 [000] 27.224712: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13 kswapd0-71 [000] 27.225424: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14 kswapd0-71 [000] 27.226082: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15 kswapd0-71 [000] 27.226810: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16 kswapd1-72 [002] 27.386869: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=1 kswapd1-72 [002] 27.387435: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=2 kswapd1-72 [002] 27.388016: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=3 kswapd1-72 [002] 27.388586: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=4 kswapd1-72 [002] 27.389155: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=5 kswapd1-72 [002] 27.389723: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=6 kswapd1-72 [002] 27.390292: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=7 kswapd1-72 [002] 27.392364: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=8 kswapd1-72 [002] 27.392934: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=9 kswapd1-72 [002] 
27.393504: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=10 kswapd1-72 [002] 27.394073: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=11 kswapd1-72 [002] 27.394899: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=12 kswapd1-72 [002] 27.395472: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=13 kswapd1-72 [002] 27.396055: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=14 kswapd1-72 [002] 27.396628: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=15 kswapd1-72 [002] 27.397199: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=16 kworker/u18:0-40 [002] 27.410151: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=DIRECT kswapd0-71 [000] 27.439454: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1 kswapd0-71 [000] 27.440048: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2 kswapd0-71 [000] 27.440634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3 kswapd0-71 [000] 27.441211: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4 kswapd0-71 [000] 27.441787: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5 kswapd0-71 [000] 27.442363: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6 kswapd0-71 [000] 27.443030: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7 kswapd0-71 [000] 27.443725: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8 kswapd0-71 [000] 27.444315: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9 kswapd0-71 [000] 27.444898: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10 kswapd0-71 [000] 27.445476: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11 kswapd0-71 [000] 27.446053: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12 kswapd0-71 [000] 27.446646: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13 kswapd0-71 [000] 27.447230: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14 kswapd0-71 [000] 27.447812: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15 kswapd0-71 [000] 27.448391: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16 ann-423 [003] 28.028285: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=PCP Link: https://lkml.kernel.org/r/20260120024402.387576-3-jiayuan.chen@linux.dev Signed-off-by: Jiayuan Chen Signed-off-by: Jiayuan Chen Acked-by: Shakeel Butt Suggested-by: Johannes Weiner Reviewed-by: Steven Rostedt (Google) [tracing] Cc: Axel Rasmussen Cc: Brendan Jackman Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 19 ++++++++++++---- include/trace/events/vmscan.h | 51 +++++++++++++++++++++++++++++++++++++++++++ mm/memory-tiers.c | 2 +- mm/page_alloc.c | 4 ++-- mm/show_mem.c | 3 +-- mm/vmscan.c | 29 ++++++++++++++++-------- mm/vmstat.c | 2 +- 7 files changed, 91 insertions(+), 19 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8881198e85c6..3e51190a55e4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1534,16 +1534,27 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) #include void build_all_zonelists(pg_data_t *pgdat); -void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, - enum zone_type highest_zoneidx); -void kswapd_try_clear_hopeless(struct pglist_data *pgdat, - unsigned int order, int highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); + +enum 
kswapd_clear_hopeless_reason { + KSWAPD_CLEAR_HOPELESS_OTHER = 0, + KSWAPD_CLEAR_HOPELESS_KSWAPD, + KSWAPD_CLEAR_HOPELESS_DIRECT, + KSWAPD_CLEAR_HOPELESS_PCP, +}; + +void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, + enum zone_type highest_zoneidx); +void kswapd_try_clear_hopeless(struct pglist_data *pgdat, + unsigned int order, int highest_zoneidx); +void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason); +bool kswapd_test_hopeless(pg_data_t *pgdat); + /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 490958fa10de..ea58e4656abf 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -40,6 +40,16 @@ {_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \ ) : "VMSCAN_THROTTLE_NONE" +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP); + +#define kswapd_clear_hopeless_reason_ops \ + {KSWAPD_CLEAR_HOPELESS_KSWAPD, "KSWAPD"}, \ + {KSWAPD_CLEAR_HOPELESS_DIRECT, "DIRECT"}, \ + {KSWAPD_CLEAR_HOPELESS_PCP, "PCP"}, \ + {KSWAPD_CLEAR_HOPELESS_OTHER, "OTHER"} #define trace_reclaim_flags(file) ( \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ @@ -535,6 +545,47 @@ TRACE_EVENT(mm_vmscan_throttled, __entry->usec_delayed, show_throttle_flags(__entry->reason)) ); + +TRACE_EVENT(mm_vmscan_kswapd_reclaim_fail, + + TP_PROTO(int nid, int failures), + + TP_ARGS(nid, failures), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, failures) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->failures = failures; + ), + + TP_printk("nid=%d failures=%d", + __entry->nid, __entry->failures) +); + +TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless, + + TP_PROTO(int nid, int reason), + + TP_ARGS(nid, reason), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, reason) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->reason = reason; + ), + + TP_printk("nid=%d reason=%s", + __entry->nid, + __print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops)) +); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 7ec442776574..0ae8bec86346 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -955,7 +955,7 @@ static ssize_t demotion_enabled_store(struct kobject *kobj, struct pglist_data *pgdat; for_each_online_pgdat(pgdat) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER); } return count; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e779b18168de..2c70ba9d5cc6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2945,9 +2945,9 @@ static bool free_frozen_page_commit(struct zone *zone, * 'hopeless node' to stay in that state for a while. Let * kswapd work again by resetting kswapd_failures. 
*/ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES && + if (kswapd_test_hopeless(pgdat) && next_memory_node(pgdat->node_id) < MAX_NUMNODES) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP); } return ret; } diff --git a/mm/show_mem.c b/mm/show_mem.c index 3a4b5207635d..24078ac3e6bc 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -278,8 +278,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - str_yes_no(atomic_read(&pgdat->kswapd_failures) >= - MAX_RECLAIM_RETRIES), + str_yes_no(kswapd_test_hopeless(pgdat)), K(node_page_state(pgdat, NR_BALLOON_PAGES))); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 5d9b1bce6f01..1d281174164e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -506,7 +506,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; /* @@ -6437,7 +6437,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) int i; bool wmark_ok; - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { @@ -6846,7 +6846,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { @@ -7111,8 +7111,11 @@ restart: * watermark_high at this point. We need to avoid increasing the * failure count to prevent the kswapd thread from stopping. */ - if (!sc.nr_reclaimed && !boosted) - atomic_inc(&pgdat->kswapd_failures); + if (!sc.nr_reclaimed && !boosted) { + int fail_cnt = atomic_inc_return(&pgdat->kswapd_failures); + /* kswapd context, low overhead to trace every failure */ + trace_mm_vmscan_kswapd_reclaim_fail(pgdat->node_id, fail_cnt); + } out: clear_reclaim_active(pgdat, highest_zoneidx); @@ -7371,7 +7374,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; /* Hopeless node, leave it to direct reclaim if possible */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || + if (kswapd_test_hopeless(pgdat) || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* @@ -7391,9 +7394,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, wake_up_interruptible(&pgdat->kswapd_wait); } -static void kswapd_clear_hopeless(pg_data_t *pgdat) +void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason) { - atomic_set(&pgdat->kswapd_failures, 0); + /* Only trace actual resets, not redundant zero-to-zero */ + if (atomic_xchg(&pgdat->kswapd_failures, 0)) + trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason); } /* @@ -7406,7 +7411,13 @@ void kswapd_try_clear_hopeless(struct pglist_data *pgdat, unsigned int order, int highest_zoneidx) { if (pgdat_balanced(pgdat, order, highest_zoneidx)) - kswapd_clear_hopeless(pgdat); + kswapd_clear_hopeless(pgdat, current_is_kswapd() ? 
+ KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT); +} + +bool kswapd_test_hopeless(pg_data_t *pgdat) +{ + return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES; } #ifdef CONFIG_HIBERNATION diff --git a/mm/vmstat.c b/mm/vmstat.c index 0f64c898f79f..23e176e1d09d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1840,7 +1840,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n start_pfn: %lu" "\n reserved_highatomic: %lu" "\n free_highatomic: %lu", - atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, + kswapd_test_hopeless(pgdat), zone->zone_start_pfn, zone->nr_reserved_highatomic, zone->nr_free_highatomic); -- cgit v1.2.3 From 94350fe6cad77b46c3dcb8c96543bef7647efbc0 Mon Sep 17 00:00:00 2001 From: William Tambe Date: Thu, 11 Dec 2025 12:38:19 -0800 Subject: mm/highmem: fix __kmap_to_page() build error This changes fixes following build error which is a miss from ef6e06b2ef87 ("highmem: fix kmap_to_page() for kmap_local_page() addresses"). mm/highmem.c:184:66: error: 'pteval' undeclared (first use in this function); did you mean 'pte_val'? 184 | idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); In __kmap_to_page(), pteval is used but does not exist in the function. (akpm: affects xtensa only) Link: https://lkml.kernel.org/r/SJ0PR07MB86317E00EC0C59DA60935FDCD18DA@SJ0PR07MB8631.namprd07.prod.outlook.com Fixes: ef6e06b2ef87 ("highmem: fix kmap_to_page() for kmap_local_page() addresses") Signed-off-by: William Tambe Reviewed-by: Max Filippov Cc: Chris Zankel Cc: Max Filippov Cc: Signed-off-by: Andrew Morton --- mm/highmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/highmem.c b/mm/highmem.c index b5c8e4c2d5d4..a33e41183951 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -180,12 +180,13 @@ struct page *__kmap_to_page(void *vaddr) for (i = 0; i < kctrl->idx; i++) { unsigned long base_addr; int idx; + pte_t pteval = kctrl->pteval[i]; idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); if (base_addr == base) - return pte_page(kctrl->pteval[i]); + return pte_page(pteval); } } -- cgit v1.2.3 From a1c655f554441561bf4b256dc75d977a1433753c Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Fri, 16 Jan 2026 14:27:15 -0500 Subject: mm/hugetlb: remove unnecessary if condition if (map_chg) is always true, since it is nested in another if statement which checks for map_chg == MAP_CHG_NEEDED, which is equal to 1. if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { ... if (map_chg) { ... } } Remove the check, un-indent, and collapse the function call for readability. No functional change intended. 
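Purely as an illustration of the reasoning above (using hypothetical names, not the hugetlb code itself), the redundant nested check collapses like this:

    /* Stand-in for the cgroup uncharge done in the real function. */
    static void undo_reservation_demo(void)
    {
    }

    enum map_chg_state_demo { MAP_CHG_NONE_DEMO = 0, MAP_CHG_NEEDED_DEMO = 1 };

    static void demo(enum map_chg_state_demo map_chg, int retval)
    {
            if (map_chg == MAP_CHG_NEEDED_DEMO && retval == 0) {
                    /*
                     * map_chg is already known to equal MAP_CHG_NEEDED_DEMO
                     * (== 1) here, so a nested "if (map_chg)" is always taken
                     * and can be dropped, letting its body be un-indented.
                     */
                    undo_reservation_demo();
            }
    }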
Link: https://lkml.kernel.org/r/20260116192717.1600049-1-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: David Hildenbrand (Red Hat) Reviewed-by: SeongJae Park Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8c197307db0c..a84869d33bed 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2989,13 +2989,10 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); - if (map_chg) { - spin_lock_irq(&hugetlb_lock); - hugetlb_cgroup_uncharge_folio_rsvd( - hstate_index(h), pages_per_huge_page(h), - folio); - spin_unlock_irq(&hugetlb_lock); - } + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), folio); + spin_unlock_irq(&hugetlb_lock); } } -- cgit v1.2.3 From 824b8c96c421e677cf0fe6f69939ff0082665a34 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Fri, 16 Jan 2026 14:27:16 -0500 Subject: mm/hugetlb: enforce brace style Documentation/process/coding-style.rst explicitly notes that if only one branch of a conditional statement is a single statement, braces should be used in both branches. Enforce this in mm/hugetlb.c. While add it, fix the indentation for vma_end_reservation. No functional change intended. Link: https://lkml.kernel.org/r/20260116192717.1600049-2-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: David Hildenbrand (Red Hat) Reviewed-by: SeongJae Park Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a84869d33bed..9c7efad6fa48 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -578,8 +578,9 @@ hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); list_add(&nrg->link, rg); coalesce_file_region(map, nrg); - } else + } else { *regions_needed += 1; + } return to - from; } @@ -1247,8 +1248,9 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) if (vma_lock && vma_lock->vma != vma) vma->vm_private_data = NULL; - } else + } else { vma->vm_private_data = NULL; + } } /* @@ -2076,8 +2078,9 @@ retry: h->max_huge_pages++; goto out; } - } else + } else { rc = 0; + } update_and_free_hugetlb_folio(h, folio, false); return rc; @@ -2672,11 +2675,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, * be consumed on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); - } else + } else { /* * No reservation present, do nothing */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, address); + } } } @@ -4672,10 +4676,12 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) if (vma_lock->vma != vma) { vma->vm_private_data = NULL; hugetlb_vma_lock_alloc(vma); - } else + } else { pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); - } else + } + } else { hugetlb_vma_lock_alloc(vma); + } } } -- cgit v1.2.3 From 0bcbd7cf6596826cfb0ca653f47fb9e9410b3f2e Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Tue, 13 Jan 2026 12:46:28 +0100 Subject: mm: replace use of system_unbound_wq with system_dfl_wq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Replace wq users and add WQ_PERCPU to alloc_workqueue() users", v2. 
This series continues the effort to refactor the Workqueue API. No behavior changes are introduced by this series. === Recent changes to the WQ API === The following, address the recent changes in the Workqueue API: - commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") - commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") The old workqueues will be removed in a future release cycle and unbound will become the implicit default. === Introduced Changes by this series === 1) [P 1-2] Replace use of system_wq and system_unbound_wq Workqueue users converted to the better named new workqueues: system_wq -> system_percpu_wq system_unbound_wq -> system_dfl_wq This way the old obsolete workqueues (system_wq, system_unbound_wq) can be removed in the future. 2) [P 3] add WQ_PERCPU to remaining alloc_workqueue() users With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. WQ_UNBOUND will be removed in future. For more information: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ This patch (of 3): This patch continues the effort to refactor workqueue APIs, which has begun with the changes introducing new workqueues and a new alloc_workqueue flag: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") The point of the refactoring is to eventually alter the default behavior of workqueues to become unbound by default so that their workload placement is optimized by the scheduler. Before that to happen, workqueue users must be converted to the better named new workqueues with no intended behaviour changes: system_wq -> system_percpu_wq system_unbound_wq -> system_dfl_wq This way the old obsolete workqueues (system_wq, system_unbound_wq) can be removed in the future. Link: https://lkml.kernel.org/r/20260113114630.152942-1-marco.crivellari@suse.com Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ Link: https://lkml.kernel.org/r/20260113114630.152942-2-marco.crivellari@suse.com Signed-off-by: Marco Crivellari Suggested-by: Tejun Heo Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Frederic Weisbecker Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Dmitry Vyukov Cc: Johannes Weiner Cc: Lai jiangshan Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Marco Elver Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/backing-dev.c | 2 +- mm/kfence/core.c | 6 +++--- mm/memcontrol.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a0e26d1b717f..0e315f770755 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -939,7 +939,7 @@ void wb_memcg_offline(struct mem_cgroup *memcg) memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); - queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); + queue_work(system_dfl_wq, &cleanup_offline_cgwbs_work); } /** diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 4f79ec720752..1b779cee6ca2 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -900,7 +900,7 @@ static void toggle_allocation_gate(struct work_struct *work) /* Disable static key and reset timer. 
*/ static_branch_disable(&kfence_allocation_key); #endif - queue_delayed_work(system_unbound_wq, &kfence_timer, + queue_delayed_work(system_dfl_wq, &kfence_timer, msecs_to_jiffies(kfence_sample_interval)); } @@ -950,7 +950,7 @@ static void kfence_init_enable(void) #endif WRITE_ONCE(kfence_enabled, true); - queue_delayed_work(system_unbound_wq, &kfence_timer, 0); + queue_delayed_work(system_dfl_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, @@ -1046,7 +1046,7 @@ static int kfence_enable_late(void) return kfence_init_late(); WRITE_ONCE(kfence_enabled, true); - queue_delayed_work(system_unbound_wq, &kfence_timer, 0); + queue_delayed_work(system_dfl_wq, &kfence_timer, 0); pr_info("re-enabled\n"); return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7d6cf47e6d4c..21d17975c4ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -644,7 +644,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w) * in latency-sensitive paths is as cheap as possible. */ __mem_cgroup_flush_stats(root_mem_cgroup, true); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); + queue_delayed_work(system_dfl_wq, &stats_flush_dwork, FLUSH_TIME); } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) @@ -3841,7 +3841,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) goto offline_kmem; if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + queue_delayed_work(system_dfl_wq, &stats_flush_dwork, FLUSH_TIME); lru_gen_online_memcg(memcg); -- cgit v1.2.3 From 73b2162126ff0b811929f700cec9475622c9cb11 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Tue, 13 Jan 2026 12:46:29 +0100 Subject: mm: replace use of system_wq with system_percpu_wq This patch continues the effort to refactor workqueue APIs, which has begun with the changes introducing new workqueues and a new alloc_workqueue flag: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") The point of the refactoring is to eventually alter the default behavior of workqueues to become unbound by default so that their workload placement is optimized by the scheduler. Before that to happen, workqueue users must be converted to the better named new workqueues with no intended behaviour changes: system_wq -> system_percpu_wq system_unbound_wq -> system_dfl_wq This way the old obsolete workqueues (system_wq, system_unbound_wq) can be removed in the future. Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ Link: https://lkml.kernel.org/r/20260113114630.152942-3-marco.crivellari@suse.com Signed-off-by: Marco Crivellari Suggested-by: Tejun Heo Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Frederic Weisbecker Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Dmitry Vyukov Cc: Johannes Weiner Cc: Lai jiangshan Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Marco Elver Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0e315f770755..69361742893a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -971,7 +971,7 @@ static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming - * system_wq. Put them in a separate wq and limit concurrency. + * system_percpu_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); -- cgit v1.2.3 From ed0a826ce3025832c8d8b79924fd638f75b62bb7 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Tue, 13 Jan 2026 12:46:30 +0100 Subject: mm: add WQ_PERCPU to alloc_workqueue users This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") The refactoring is going to alter the default behavior of alloc_workqueue() to be unbound by default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn't explicitly specify WQ_UNBOUND must now use WQ_PERCPU. For more details see the Link tag below. In order to keep alloc_workqueue() behavior identical, explicitly request WQ_PERCPU. [akpm@linux-foundation.org: fix mm/slub.c] [akpm@linux-foundation.org: fix kmem_cache_init_late() properly, per Sebastian] Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ Link: https://lkml.kernel.org/r/20260113114630.152942-4-marco.crivellari@suse.com Signed-off-by: Marco Crivellari Suggested-by: Tejun Heo Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Frederic Weisbecker Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Dmitry Vyukov Cc: Johannes Weiner Cc: Lai jiangshan Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Marco Elver Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/backing-dev.c | 2 +- mm/slub.c | 3 ++- mm/vmstat.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 69361742893a..e319bd5e8b75 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -974,7 +974,7 @@ static int __init cgwb_init(void) * system_percpu_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. 
*/ - cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); + cgwb_release_wq = alloc_workqueue("cgwb_release", WQ_PERCPU, 1); if (!cgwb_release_wq) return -ENOMEM; diff --git a/mm/slub.c b/mm/slub.c index 861592ac5425..409d16695c8b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -8542,7 +8542,8 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { - flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); + flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); WARN_ON(!flushwq); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 23e176e1d09d..99270713e0c1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2259,7 +2259,8 @@ void __init init_mm_internals(void) { int ret __maybe_unused; - mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0); + mm_percpu_wq = alloc_workqueue("mm_percpu_wq", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); #ifdef CONFIG_SMP ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", -- cgit v1.2.3 From 3a64d5b82eccc0dc629d43cde791a2c19bd67dfc Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 18 Dec 2025 10:05:40 +0000 Subject: sparc/mm: export symbols for lazy_mmu_mode KUnit tests The lazy_mmu_mode KUnit tests call lazy_mmu_mode_{enable,disable}. These tests may be built as a module, and because of inlining this means that arch_{enter,flush,leave}_lazy_mmu_mode need to be exported. [akpm@linux-foundation.org: remove mm/tests/lazy_mmu_mode_kunit.c comment, per Kevin] Link: https://lkml.kernel.org/r/20251218100541.2667405-1-kevin.brodsky@arm.com Fixes: ee628d9cc8d5 ("mm: add basic tests for lazy_mmu") Signed-off-by: Kevin Brodsky Acked-by: Andreas Larsson Signed-off-by: Andrew Morton --- arch/sparc/mm/tlb.c | 6 ++++++ mm/tests/lazy_mmu_mode_kunit.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 3a852071d260..6d9dd5eb1328 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -11,6 +11,8 @@ #include #include +#include + #include #include #include @@ -54,6 +56,8 @@ void arch_enter_lazy_mmu_mode(void) { preempt_disable(); } +/* For lazy_mmu_mode KUnit tests */ +EXPORT_SYMBOL_IF_KUNIT(arch_enter_lazy_mmu_mode); void arch_flush_lazy_mmu_mode(void) { @@ -62,12 +66,14 @@ void arch_flush_lazy_mmu_mode(void) if (tb->tlb_nr) flush_tlb_pending(); } +EXPORT_SYMBOL_IF_KUNIT(arch_flush_lazy_mmu_mode); void arch_leave_lazy_mmu_mode(void) { arch_flush_lazy_mmu_mode(); preempt_enable(); } +EXPORT_SYMBOL_IF_KUNIT(arch_leave_lazy_mmu_mode); static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr, bool exec, unsigned int hugepage_shift) diff --git a/mm/tests/lazy_mmu_mode_kunit.c b/mm/tests/lazy_mmu_mode_kunit.c index 1c23456b467e..b689241c6bef 100644 --- a/mm/tests/lazy_mmu_mode_kunit.c +++ b/mm/tests/lazy_mmu_mode_kunit.c @@ -2,7 +2,6 @@ #include #include -/* For some symbols referenced by arch_{enter,leave}_lazy_mmu_mode on powerpc */ MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); static void expect_not_active(struct kunit *test) -- cgit v1.2.3 From 4ac76c51709dff01b285a2d8afea80ca7ae66d28 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:16 +0000 Subject: selftests/mm: default KDIR to build directory Patch series "Various mm kselftests improvements/fixes", v3. Various improvements/fixes for the mm kselftests: - Patch 1-3 extend support for more build configurations: out-of-tree $KDIR, cross-compilation, etc. - Patch 4-7 fix issues related to faulting in pages, introducing a new helper for that purpose. 
- Patch 8 fixes the value returned by pagemap_ioctl (PASS was always returned, which explains why the issue fixed in patch 6 went unnoticed). - Patch 9 improves the exit code of pfnmap. Net results: - 1 test no longer fails (patch 7) - 3 tests are no longer skipped (patch 4) - More accurate return values for whole suites (patch 8, 9) - Extra tests are more likely to be built (patch 1-3) This patch (of 9): KDIR currently defaults to the running kernel's modules directory when building the page_frag module. The underlying assumption is that most users build the kselftests in order to run them against the system they're built on. This assumption seems questionable, and there is no guarantee that the module can actually be built against the running kernel. Switch the default value of KDIR to the kernel's build directory, i.e. $(O) if O= or KBUILD_OUTPUT= is used, and the source directory otherwise. This seems like the least surprising option: the test module is built against the kernel that has been previously built. Note: we can't use $(top_srcdir) in mm/Makefile because it is only defined once lib.mk is included. Link: https://lkml.kernel.org/r/20260122170224.4056513-1-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20260122170224.4056513-2-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: David Hildenbrand Cc: Dev Jain Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Ryan Roberts Cc: Shuah Khan Cc: Jason Gunthorpe Cc: John Hubbard Cc: Paolo Abeni Cc: SeongJae Park Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/page_frag/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eaf9312097f7..bb93101e339e 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -44,7 +44,7 @@ LDLIBS = -lrt -lpthread -lm # warnings. CFLAGS += -U_FORTIFY_SOURCE -KDIR ?= /lib/modules/$(shell uname -r)/build +KDIR ?= $(if $(O),$(O),$(realpath ../../../..)) ifneq (,$(wildcard $(KDIR)/Module.symvers)) ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) TEST_GEN_MODS_DIR := page_frag diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile index 8c8bb39ffa28..96e5f646e69b 100644 --- a/tools/testing/selftests/mm/page_frag/Makefile +++ b/tools/testing/selftests/mm/page_frag/Makefile @@ -1,5 +1,5 @@ PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= /lib/modules/$(shell uname -r)/build +KDIR ?= $(if $(O),$(O),$(realpath ../../../../..)) ifeq ($(V),1) Q = -- cgit v1.2.3 From 1821be740d2e9329805cafa368e476064fde0789 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:17 +0000 Subject: selftests/mm: remove flaky header check Commit 96ed62ea0298 ("mm: page_frag: fix a compile error when kernel is not compiled") introduced a check to avoid attempting to build the page_frag module if is missing. Unfortunately this check only works if KDIR points to /lib/modules/... or an in-tree kernel build. It always fails if KDIR points to an out-of-tree build (i.e. when the kernel was built with O=... make) because only generated headers are present under $KDIR/include/ in that case. A recent commit switched KDIR to default to the kernel's build directory, so that check is no longer justified. 
Link: https://lkml.kernel.org/r/20260122170224.4056513-3-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Mark Brown Cc: Paolo Abeni Cc: Yunsheng Lin Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index bb93101e339e..4e5c8a330a0c 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -46,12 +46,8 @@ CFLAGS += -U_FORTIFY_SOURCE KDIR ?= $(if $(O),$(O),$(realpath ../../../..)) ifneq (,$(wildcard $(KDIR)/Module.symvers)) -ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) TEST_GEN_MODS_DIR := page_frag else -PAGE_FRAG_WARNING = "missing page_frag_cache.h, please use a newer kernel" -endif -else PAGE_FRAG_WARNING = "missing Module.symvers, please have the kernel built first" endif -- cgit v1.2.3 From 7f532d19c8be76ad2fcd7ab6b0c9eb618f70966b Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:18 +0000 Subject: selftests/mm: pass down full CC and CFLAGS to check_config.sh check_config.sh checks that liburing is available by running the compiler provided as its first argument. This makes two assumptions: 1. CC consists of only one word 2. No extra flag is required Unfortunately, there are many situations where these assumptions don't hold. For instance: - When using Clang, CC consists of multiple words - When cross-compiling, extra flags may be required to allow the compiler to find headers Remove these assumptions by passing down CC and CFLAGS as-is from the Makefile, so that the same command line is used as when actually building the tests. 
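For reference, the probe that check_config.sh builds with "$CC $CFLAGS" is roughly of this shape (a sketch only: the exact #include lines echoed by the script are not visible in this digest, so <liburing.h> is an assumption; the func() stub matches the one the script writes):

/*
 * Throwaway probe compiled by check_config.sh to decide whether liburing
 * is usable with the same compiler and flags as the tests themselves.
 * If this translation unit compiles, LOCAL_CONFIG_HAVE_LIBURING is defined.
 */
#include <liburing.h>

int func(void) { return 0; }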
Link: https://lkml.kernel.org/r/20260122170224.4056513-4-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Red Hat) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Dev Jain Cc: Lorenzo Stoakes Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/check_config.sh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 4e5c8a330a0c..de4afc34e3b1 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -230,7 +230,7 @@ $(OUTPUT)/migration: LDLIBS += -lnuma $(OUTPUT)/rmap: LDLIBS += -lnuma local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) + CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh EXTRA_CLEAN += local_config.mk local_config.h diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh index 3954f4746161..b84c82bbf875 100755 --- a/tools/testing/selftests/mm/check_config.sh +++ b/tools/testing/selftests/mm/check_config.sh @@ -16,8 +16,7 @@ echo "#include " > $tmpfile_c echo "#include " >> $tmpfile_c echo "int func(void) { return 0; }" >> $tmpfile_c -CC=${1:?"Usage: $0 # example compiler: gcc"} -$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 +$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o if [ -f $tmpfile_o ]; then echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE -- cgit v1.2.3 From bce1dabd310e87fefe0645fec9ba98b84d37e418 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:19 +0000 Subject: selftests/mm: fix usage of FORCE_READ() in cow tests Commit 5bbc2b785e63 ("selftests/mm: fix FORCE_READ to read input value correctly") modified FORCE_READ() to take a value instead of a pointer. It also changed most of the call sites accordingly, but missed many of them in cow.c. In those cases, we ended up with the pointer itself being read, not the memory it points to. No failure occurred as a result, so it looks like the tests work just fine without faulting in. However, the huge_zeropage tests explicitly check that pages are populated, so those became skipped. Convert all the remaining FORCE_READ() to fault in the mapped page, as was originally intended. This allows the huge_zeropage tests to run again (3 tests in total). Link: https://lkml.kernel.org/r/20260122170224.4056513-5-kevin.brodsky@arm.com Fixes: 5bbc2b785e63 ("selftests/mm: fix FORCE_READ to read input value correctly") Signed-off-by: Kevin Brodsky Acked-by: SeongJae Park Reviewed-by: wang lian Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: Shuah Khan Cc: Usama Anjum Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index accfd198dbda..83b3563be26b 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -1612,8 +1612,8 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) * the first sub-page and test if we get another sub-page populated * automatically. 
*/ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || !pagemap_is_populated(pagemap_fd, smem + pagesize)) { ksft_test_result_skip("Did not get THPs populated\n"); @@ -1663,8 +1663,8 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: @@ -1719,8 +1719,8 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: @@ -1773,8 +1773,8 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, hugetlbsize); munmap: -- cgit v1.2.3 From 20d3fac43608a1d7ef71991935abc4456baa1da7 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:20 +0000 Subject: selftests/mm: check that FORCE_READ() succeeded Many cow tests rely on FORCE_READ() to populate pages. Introduce a helper to make sure that the pages are actually populated, and fail otherwise. Link: https://lkml.kernel.org/r/20260122170224.4056513-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Suggested-by: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 43 ++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 83b3563be26b..d9c69c04b67d 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -75,6 +75,18 @@ static bool range_is_swapped(void *addr, size_t size) return true; } +static bool populate_page_checked(char *addr) +{ + bool ret; + + FORCE_READ(*addr); + ret = pagemap_is_populated(pagemap_fd, addr); + if (!ret) + ksft_print_msg("Failed to populate page\n"); + + return ret; +} + struct comm_pipes { int child_ready[2]; int parent_ready[2]; @@ -1549,8 +1561,10 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc) } /* Read from the page to populate the shared zeropage. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1612,8 +1626,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) * the first sub-page and test if we get another sub-page populated * automatically. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } + if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || !pagemap_is_populated(pagemap_fd, smem + pagesize)) { ksft_test_result_skip("Did not get THPs populated\n"); @@ -1663,8 +1680,10 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc) } /* Fault the page in. 
*/ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1719,8 +1738,10 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1773,8 +1794,10 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, } /* Fault the page in. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, hugetlbsize); munmap: -- cgit v1.2.3 From dd2b4e04c09808ff921e3460a608537d1a94595d Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:21 +0000 Subject: selftests/mm: introduce helper to read every page FORCE_READ(*addr) ensures that the compiler will emit a load from addr. Several tests need to trigger such a load for a range of pages, ensuring that every page is faulted in, if it wasn't already. Introduce a new helper force_read_pages() that does exactly that and replace existing loops with a call to it. The step size (regular/huge page size) is preserved for all loops, except in split_huge_page_test. Reading every byte is unnecessary; we now read every huge page, matching the following call to check_huge_file(). Link: https://lkml.kernel.org/r/20260122170224.4056513-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Dev Jain Reviewed-by: Muhammad Usama Anjum Acked-by: David Hildenbrand (Red Hat) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb-madvise.c | 9 +-------- tools/testing/selftests/mm/pfnmap.c | 9 +++------ tools/testing/selftests/mm/split_huge_page_test.c | 6 +----- tools/testing/selftests/mm/vm_util.h | 7 +++++++ 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 05d9d2805ae4..5b12041fa310 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -47,14 +47,7 @@ void write_fault_pages(void *addr, unsigned long nr_pages) void read_fault_pages(void *addr, unsigned long nr_pages) { - unsigned long i; - - for (i = 0; i < nr_pages; i++) { - unsigned long *addr2 = - ((unsigned long *)(addr + (i * huge_page_size))); - /* Prevent the compiler from optimizing out the entire loop: */ - FORCE_READ(*addr2); - } + force_read_pages(addr, nr_pages, huge_page_size); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index f546dfb10cae..45b5f1cf6019 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -35,18 +35,15 @@ static void signal_handler(int sig) static int test_read_access(char *addr, size_t size, size_t pagesize) { - size_t offs; int ret; if (signal(SIGSEGV, signal_handler) == SIG_ERR) return -EINVAL; ret = sigsetjmp(sigjmp_buf_env, 1); - if (!ret) { - for (offs = 0; offs < size; offs += pagesize) - /* Force a read that the compiler cannot optimize out. 
*/ - *((volatile char *)(addr + offs)); - } + if (!ret) + force_read_pages(addr, size/pagesize, pagesize); + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) return -EINVAL; diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 40799f3f0213..e0167111bdd1 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -652,11 +652,7 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, } madvise(*addr, fd_size, MADV_HUGEPAGE); - for (size_t i = 0; i < fd_size; i++) { - char *addr2 = *addr + i; - - FORCE_READ(*addr2); - } + force_read_pages(*addr, fd_size / pmd_pagesize, pmd_pagesize); if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 6ad32b1830f1..522f7f9050f5 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -54,6 +54,13 @@ static inline unsigned int pshift(void) return __page_shift; } +static inline void force_read_pages(char *addr, unsigned int nr_pages, + size_t pagesize) +{ + for (unsigned int i = 0; i < nr_pages; i++) + FORCE_READ(addr[i * pagesize]); +} + bool detect_huge_zeropage(void); /* -- cgit v1.2.3 From 7e938f00b00319510ae097e20b7487dfa578d53f Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:22 +0000 Subject: selftests/mm: fix faulting-in code in pagemap_ioctl test One of the pagemap_ioctl tests attempts to fault in pages by memcpy()'ing them to an unused buffer. This probably worked originally, but since commit 46036188ea1f ("selftests/mm: build with -O2") the compiler is free to optimise away that unused buffer and the memcpy() with it. As a result there might not be any resident page in the mapping and the test may fail. We don't need to copy all that memory anyway. Just fault in every page. While at it also make sure to compute the number of pages once using simple integer arithmetic instead of ceilf() and implicit conversions. Link: https://lkml.kernel.org/r/20260122170224.4056513-8-kevin.brodsky@arm.com Fixes: 46036188ea1f ("selftests/mm: build with -O2") Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Dev Jain Reviewed-by: Muhammad Usama Anjum Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 2cb5441f29c7..1896c7d4f72e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1052,11 +1052,10 @@ static void test_simple(void) int sanity_tests(void) { unsigned long long mem_size, vec_size; - long ret, fd, i, buf_size; + long ret, fd, i, buf_size, nr_pages; struct page_region *vec; char *mem, *fmem; struct stat sbuf; - char *tmp_buf; /* 1. 
wrong operation */ mem_size = 10 * page_size; @@ -1167,14 +1166,14 @@ int sanity_tests(void) if (fmem == MAP_FAILED) ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); - tmp_buf = malloc(sbuf.st_size); - memcpy(tmp_buf, fmem, sbuf.st_size); + nr_pages = (sbuf.st_size + page_size - 1) / page_size; + force_read_pages(fmem, nr_pages, page_size); ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0, 0, PAGEMAP_NON_WRITTEN_BITS, 0, PAGEMAP_NON_WRITTEN_BITS); ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem && - LEN(vec[0]) == ceilf((float)sbuf.st_size/page_size) && + LEN(vec[0]) == nr_pages && (vec[0].categories & PAGE_IS_FILE), "%s Memory mapped file\n", __func__); -- cgit v1.2.3 From 148e5879532f835118e00c3040acef077b57721a Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:23 +0000 Subject: selftests/mm: fix exit code in pagemap_ioctl Make sure pagemap_ioctl exits with an appropriate value: * If the tests are run, call ksft_finished() to report the right status instead of reporting PASS unconditionally. * Report SKIP if userfaultfd isn't available (in line with other tests) * Report FAIL if we failed to open /proc/self/pagemap, as this file has been added a long time ago and doesn't depend on any CONFIG option (returning -EINVAL from main() is meaningless) Link: https://lkml.kernel.org/r/20260122170224.4056513-9-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Ryan Roberts Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Red Hat) Acked-by: SeongJae Park Reviewed-by: wang lian Reviewed-by: Dev Jain Cc: Usama Anjum Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Paolo Abeni Cc: Shuah Khan Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 1896c7d4f72e..2ca8a7e3c27e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1552,7 +1552,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) ksft_print_header(); if (init_uffd()) - ksft_exit_pass(); + ksft_exit_skip("Failed to initialize userfaultfd\n"); ksft_set_plan(117); @@ -1561,7 +1561,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) pagemap_fd = open(PAGEMAP, O_RDONLY); if (pagemap_fd < 0) - return -EINVAL; + ksft_exit_fail_msg("Failed to open " PAGEMAP "\n"); /* 1. Sanity testing */ sanity_tests_sd(); @@ -1733,5 +1733,5 @@ int main(int __attribute__((unused)) argc, char *argv[]) zeropfn_tests(); close(pagemap_fd); - ksft_exit_pass(); + ksft_finished(); } -- cgit v1.2.3 From fde8353121aa304ee88542f011dd5dc83ced47e4 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:24 +0000 Subject: selftests/mm: report SKIP in pfnmap if a check fails pfnmap currently checks the target file in FIXTURE_SETUP(pfnmap), meaning once for every test, and skips the test if any check fails. The target file is the same for every test so this is a little overkill. More importantly, this approach means that the whole suite will report PASS even if all the tests are skipped because kernel configuration (e.g. CONFIG_STRICT_DEVMEM=y) prevented /dev/mem from being mapped, for instance. Let's ensure that KSFT_SKIP is returned as exit code if any check fails by performing the checks in pfnmap_init(), run once. 
That function also takes care of finding the offset of the pages to be mapped and saves it in a global. The file is now opened only once and the fd saved in a global, but it is still mapped/unmapped for every test, as some of them modify the mapping. Link: https://lkml.kernel.org/r/20260122170224.4056513-10-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pfnmap.c | 84 +++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index 45b5f1cf6019..4f550822385a 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -25,8 +25,12 @@ #include "kselftest_harness.h" #include "vm_util.h" +#define DEV_MEM_NPAGES 2 + static sigjmp_buf sigjmp_buf_env; static char *file = "/dev/mem"; +static off_t file_offset; +static int fd; static void signal_handler(int sig) { @@ -88,7 +92,7 @@ static int find_ram_target(off_t *offset, break; /* We need two pages. */ - if (end > start + 2 * pagesize) { + if (end > start + DEV_MEM_NPAGES * pagesize) { fclose(file); *offset = start; return 0; @@ -97,11 +101,48 @@ static int find_ram_target(off_t *offset, return -ENOENT; } +static void pfnmap_init(void) +{ + size_t pagesize = getpagesize(); + size_t size = DEV_MEM_NPAGES * pagesize; + void *addr; + + if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { + int err = find_ram_target(&file_offset, pagesize); + + if (err) + ksft_exit_skip("Cannot find ram target in '/proc/iomem': %s\n", + strerror(-err)); + } else { + file_offset = 0; + } + + fd = open(file, O_RDONLY); + if (fd < 0) + ksft_exit_skip("Cannot open '%s': %s\n", file, strerror(errno)); + + /* + * Make sure we can map the file, and perform some basic checks; skip + * the whole suite if anything goes wrong. + * A fresh mapping is then created for every test case by + * FIXTURE_SETUP(pfnmap). + */ + addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset); + if (addr == MAP_FAILED) + ksft_exit_skip("Cannot mmap '%s': %s\n", file, strerror(errno)); + + if (!check_vmflag_pfnmap(addr)) + ksft_exit_skip("Invalid file: '%s'. Not pfnmap'ed\n", file); + + if (test_read_access(addr, size, pagesize)) + ksft_exit_skip("Cannot read-access mmap'ed '%s'\n", file); + + munmap(addr, size); +} + FIXTURE(pfnmap) { - off_t offset; size_t pagesize; - int dev_mem_fd; char *addr1; size_t size1; char *addr2; @@ -112,31 +153,10 @@ FIXTURE_SETUP(pfnmap) { self->pagesize = getpagesize(); - if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { - /* We'll require two physical pages throughout our tests ... */ - if (find_ram_target(&self->offset, self->pagesize)) - SKIP(return, - "Cannot find ram target in '/proc/iomem'\n"); - } else { - self->offset = 0; - } - - self->dev_mem_fd = open(file, O_RDONLY); - if (self->dev_mem_fd < 0) - SKIP(return, "Cannot open '%s'\n", file); - - self->size1 = self->pagesize * 2; + self->size1 = DEV_MEM_NPAGES * self->pagesize; self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->offset); - if (self->addr1 == MAP_FAILED) - SKIP(return, "Cannot mmap '%s'\n", file); - - if (!check_vmflag_pfnmap(self->addr1)) - SKIP(return, "Invalid file: '%s'. 
Not pfnmap'ed\n", file); - - /* ... and want to be able to read from them. */ - if (test_read_access(self->addr1, self->size1, self->pagesize)) - SKIP(return, "Cannot read-access mmap'ed '%s'\n", file); + fd, file_offset); + ASSERT_NE(self->addr1, MAP_FAILED); self->size2 = 0; self->addr2 = MAP_FAILED; @@ -148,8 +168,6 @@ FIXTURE_TEARDOWN(pfnmap) munmap(self->addr2, self->size2); if (self->addr1 != MAP_FAILED) munmap(self->addr1, self->size1); - if (self->dev_mem_fd >= 0) - close(self->dev_mem_fd); } TEST_F(pfnmap, madvise_disallowed) @@ -189,7 +207,7 @@ TEST_F(pfnmap, munmap_split) */ self->size2 = self->pagesize; self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->offset); + fd, file_offset); ASSERT_NE(self->addr2, MAP_FAILED); } @@ -259,8 +277,12 @@ int main(int argc, char **argv) if (strcmp(argv[i], "--") == 0) { if (i + 1 < argc && strlen(argv[i + 1]) > 0) file = argv[i + 1]; - return test_harness_run(i, argv); + argc = i; + break; } } + + pfnmap_init(); + return test_harness_run(argc, argv); } -- cgit v1.2.3 From c83109e95c9d78e41b39e65b6490e511f4b8fba2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 12 Jan 2026 23:09:50 +0800 Subject: mm: page_isolation: introduce page_is_unmovable() Patch series "mm: accelerate gigantic folio allocation". Optimize pfn_range_valid_contig() and replace_free_hugepage_folios() in alloc_contig_frozen_pages() to speed up gigantic folio allocation. The allocation time for 120*1G folios drops from 3.605s to 0.431s. This patch (of 5): Factor out the check if a page is unmovable into a new helper, and will be reused in the following patch. No functional change intended, the minor changes are as follows, 1) Avoid unnecessary calls by checking CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION 2) Directly call PageCompound since PageTransCompound may be dropped 3) Using folio_test_hugetlb() Link: https://lkml.kernel.org/r/20260112150954.1802953-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20260112150954.1802953-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 2 + mm/page_isolation.c | 187 ++++++++++++++++++++++------------------- 2 files changed, 101 insertions(+), 88 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 3e2f960e166c..6f8638c9904f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -67,4 +67,6 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn); int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, enum pb_isolate_mode mode); +bool page_is_unmovable(struct zone *zone, struct page *page, + enum pb_isolate_mode mode, unsigned long *step); #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index b5924eff4f8b..c48ff5c00244 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,6 +15,100 @@ #define CREATE_TRACE_POINTS #include +bool page_is_unmovable(struct zone *zone, struct page *page, + enum pb_isolate_mode mode, unsigned long *step) +{ + /* + * Both, bootmem allocations and memory holes are marked + * PG_reserved and are unmovable. We can even have unmovable + * allocations inside ZONE_MOVABLE, for example when + * specifying "movablecore". 
+ */ + if (PageReserved(page)) + return true; + + /* + * If the zone is movable and we have ruled out all reserved + * pages then it should be reasonably safe to assume the rest + * is movable. + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return false; + + /* + * Hugepages are not in LRU lists, but they're movable. + * THPs are on the LRU, but need to be counted as #small pages. + * We need not scan over tail pages because we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page) || PageCompound(page)) { + struct folio *folio = page_folio(page); + + if (folio_test_hugetlb(folio)) { + struct hstate *h; + + if (!IS_ENABLED(CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION)) + return true; + + /* + * The huge page may be freed so can not + * use folio_hstate() directly. + */ + h = size_to_hstate(folio_size(folio)); + if (h && !hugepage_migration_supported(h)) + return true; + + } else if (!folio_test_lru(folio)) { + return true; + } + + *step = folio_nr_pages(folio) - folio_page_idx(folio, page); + return false; + } + + /* + * We can't use page_count without pin a page + * because another CPU can free compound page. + * This check already skips compound tails of THP + * because their page->_refcount is zero at all time. + */ + if (!page_ref_count(page)) { + if (PageBuddy(page)) + *step = (1 << buddy_order(page)); + return false; + } + + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page)) + return false; + + /* + * We treat all PageOffline() pages as movable when offlining + * to give drivers a chance to decrement their reference count + * in MEM_GOING_OFFLINE in order to indicate that these pages + * can be offlined as there are no direct references anymore. + * For actually unmovable PageOffline() where the driver does + * not support this, we will fail later when trying to actually + * move these pages that still have a reference count > 0. + * (false negatives in this function only) + */ + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page)) + return false; + + if (PageLRU(page) || page_has_movable_ops(page)) + return false; + + /* + * If there are RECLAIMABLE pages, we need to check + * it. But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. + */ + return true; +} + /* * This function checks whether the range [start_pfn, end_pfn) includes * unmovable pages or not. The range must fall into a single pageblock and @@ -35,7 +129,6 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e { struct page *page = pfn_to_page(start_pfn); struct zone *zone = page_zone(page); - unsigned long pfn; VM_BUG_ON(pageblock_start_pfn(start_pfn) != pageblock_start_pfn(end_pfn - 1)); @@ -52,96 +145,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e return page; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - page = pfn_to_page(pfn); + while (start_pfn < end_pfn) { + unsigned long step = 1; - /* - * Both, bootmem allocations and memory holes are marked - * PG_reserved and are unmovable. We can even have unmovable - * allocations inside ZONE_MOVABLE, for example when - * specifying "movablecore". - */ - if (PageReserved(page)) + page = pfn_to_page(start_pfn); + if (page_is_unmovable(zone, page, mode, &step)) return page; - /* - * If the zone is movable and we have ruled out all reserved - * pages then it should be reasonably safe to assume the rest - * is movable. 
- */ - if (zone_idx(zone) == ZONE_MOVABLE) - continue; - - /* - * Hugepages are not in LRU lists, but they're movable. - * THPs are on the LRU, but need to be counted as #small pages. - * We need not scan over tail pages because we don't - * handle each tail page individually in migration. - */ - if (PageHuge(page) || PageTransCompound(page)) { - struct folio *folio = page_folio(page); - unsigned int skip_pages; - - if (PageHuge(page)) { - struct hstate *h; - - /* - * The huge page may be freed so can not - * use folio_hstate() directly. - */ - h = size_to_hstate(folio_size(folio)); - if (h && !hugepage_migration_supported(h)) - return page; - } else if (!folio_test_lru(folio)) { - return page; - } - - skip_pages = folio_nr_pages(folio) - folio_page_idx(folio, page); - pfn += skip_pages - 1; - continue; - } - - /* - * We can't use page_count without pin a page - * because another CPU can free compound page. - * This check already skips compound tails of THP - * because their page->_refcount is zero at all time. - */ - if (!page_ref_count(page)) { - if (PageBuddy(page)) - pfn += (1 << buddy_order(page)) - 1; - continue; - } - - /* - * The HWPoisoned page may be not in buddy system, and - * page_count() is not 0. - */ - if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page)) - continue; - - /* - * We treat all PageOffline() pages as movable when offlining - * to give drivers a chance to decrement their reference count - * in MEM_GOING_OFFLINE in order to indicate that these pages - * can be offlined as there are no direct references anymore. - * For actually unmovable PageOffline() where the driver does - * not support this, we will fail later when trying to actually - * move these pages that still have a reference count > 0. - * (false negatives in this function only) - */ - if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page)) - continue; - - if (PageLRU(page) || page_has_movable_ops(page)) - continue; - - /* - * If there are RECLAIMABLE pages, we need to check - * it. But now, memory offline itself doesn't call - * shrink_node_slabs() and it still to be fixed. - */ - return page; + start_pfn += step; } return NULL; } -- cgit v1.2.3 From 9a8e0c31b3121df8ed193437b59969adabc7e721 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 12 Jan 2026 23:09:51 +0800 Subject: mm: page_alloc: optimize pfn_range_valid_contig() The alloc_contig_pages() spends a significant amount of time within pfn_range_valid_contig(). - set_max_huge_pages - 99.98% alloc_pool_huge_folio only_alloc_fresh_hugetlb_folio.isra.0 - alloc_contig_frozen_pages_noprof - 87.00% pfn_range_valid_contig pfn_to_online_page - 12.91% alloc_contig_frozen_range_noprof 4.51% replace_free_hugepage_folios - 4.02% prep_new_page prep_compound_page - 2.98% undo_isolate_page_range - 2.79% unset_migratetype_isolate - 2.75% __move_freepages_block_isolate 2.71% __move_freepages_block - 0.98% start_isolate_page_range 0.66% set_migratetype_isolate To optimize this process, use the new helper page_is_unmovable() to avoid more unnecessary iterations for compound pages, such as THP not on LRU, and high-order buddy pages, which significantly improving the efficiency of contiguous memory allocation. 
A simple test on machine with 114G free memory, allocate 120 * 1G HugeTLB folios(104 successfully returned), time echo 120 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages Before: 0m3.605s After: 0m0.602s Link: https://lkml.kernel.org/r/20260112150954.1802953-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Oscar Salvador Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c70ba9d5cc6..e4104973e22f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7153,18 +7153,20 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, unsigned long nr_pages, bool skip_hugetlb, bool *skipped_hugetlb) { - unsigned long i, end_pfn = start_pfn + nr_pages; + unsigned long end_pfn = start_pfn + nr_pages; struct page *page; - for (i = start_pfn; i < end_pfn; i++) { - page = pfn_to_online_page(i); + while (start_pfn < end_pfn) { + unsigned long step = 1; + + page = pfn_to_online_page(start_pfn); if (!page) return false; if (page_zone(page) != z) return false; - if (PageReserved(page)) + if (page_is_unmovable(z, page, PB_ISOLATE_MODE_OTHER, &step)) return false; /* @@ -7179,9 +7181,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageHuge(page)) { unsigned int order; - if (!IS_ENABLED(CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION)) - return false; - if (skip_hugetlb) { *skipped_hugetlb = true; return false; @@ -7192,17 +7191,9 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if ((order >= MAX_FOLIO_ORDER) || (nr_pages <= (1 << order))) return false; - - /* - * Reaching this point means we've encounted a huge page - * smaller than nr_pages, skip all pfn's for that page. - * - * We can't get here from a tail-PageHuge, as it implies - * we started a scan in the middle of a hugepage larger - * than nr_pages - which the prior check filters for. - */ - i += (1 << order) - 1; } + + start_pfn += step; } return true; } -- cgit v1.2.3 From 5a74b9f1dc3d75635ca8918e53664d5d2ee0fff5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 12 Jan 2026 23:09:52 +0800 Subject: mm: hugetlb: optimize replace_free_hugepage_folios() If no free hugepage folios are available, there is no need to perform any replacement operations. Additionally, gigantic folios should not be replaced under any circumstances. Therefore, we only check for the presence of non-gigantic folios, also adding the gigantic folio check to avoid accidental replacement. To optimize performance, we skip unnecessary iterations over pfn for compound pages and high-order buddy pages to save processing time. 
A simple test on machine with 114G free memory, allocate 120 * 1G HugeTLB folios(104 successfully returned), time echo 120 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages Before: 0m0.602s After: 0m0.431s [wangkefeng.wang@huawei.com: v2] Link: https://lkml.kernel.org/r/20260114135512.2159799-1-wangkefeng.wang@huawei.com [akpm@linux-foundation.org: use single-return-point style, tweak comment] Link: https://lkml.kernel.org/r/20260112150954.1802953-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9c7efad6fa48..120ebd448b42 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2810,23 +2810,62 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { - struct folio *folio; + unsigned long nr = 0; + struct page *page; + struct hstate *h; + LIST_HEAD(list); int ret = 0; - LIST_HEAD(isolate_list); + /* Avoid pfn iterations if no free non-gigantic huge pages */ + for_each_hstate(h) { + if (hstate_is_gigantic(h)) + continue; + + nr += h->free_huge_pages; + if (nr) + break; + } + + if (!nr) + return 0; while (start_pfn < end_pfn) { - folio = pfn_folio(start_pfn); + page = pfn_to_page(start_pfn); + nr = 1; - /* Not to disrupt normal path by vainly holding hugetlb_lock */ - if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { - ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list); - if (ret) - break; + if (PageHuge(page) || PageCompound(page)) { + struct folio *folio = page_folio(page); + + nr = folio_nr_pages(folio) - folio_page_idx(folio, page); + + /* + * Don't disrupt normal path by vainly holding + * hugetlb_lock + */ + if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { + if (order_is_gigantic(folio_order(folio))) { + ret = -ENOMEM; + break; + } + + ret = alloc_and_dissolve_hugetlb_folio(folio, &list); + if (ret) + break; + + putback_movable_pages(&list); + } + } else if (PageBuddy(page)) { + /* + * Buddy order check without zone lock is unsafe and + * the order is maybe invalid, but race should be + * small, and the worst thing is skipping free hugetlb. + */ + const unsigned int order = buddy_order_unsafe(page); - putback_movable_pages(&isolate_list); + if (order <= MAX_PAGE_ORDER) + nr = 1UL << order; } - start_pfn++; + start_pfn += nr; } return ret; -- cgit v1.2.3 From d925730734e9e936146b7ba691aa02f1b60f2c61 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 12 Jan 2026 23:09:53 +0800 Subject: mm: hugetlb_cma: optimize hugetlb_cma_alloc_frozen_folio() Check hugetlb_cma_size which helps to avoid unnecessary gfp check or nodemask traversal. 
Link: https://lkml.kernel.org/r/20260112150954.1802953-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/hugetlb_cma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index 0ddf9755c090..d8fa93825992 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -16,7 +16,7 @@ static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; static bool hugetlb_cma_only; -static unsigned long hugetlb_cma_size __initdata; +static unsigned long hugetlb_cma_size __ro_after_init; void hugetlb_cma_free_frozen_folio(struct folio *folio) { @@ -31,6 +31,9 @@ struct folio *hugetlb_cma_alloc_frozen_folio(int order, gfp_t gfp_mask, struct folio *folio; struct page *page = NULL; + if (!hugetlb_cma_size) + return NULL; + if (hugetlb_cma[nid]) page = cma_alloc_frozen_compound(hugetlb_cma[nid], order); -- cgit v1.2.3 From ae85e5610813c9904ea4a111bf47edd1940ebf63 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 12 Jan 2026 23:09:54 +0800 Subject: mm: hugetlb_cma: mark hugetlb_cma{_only} as __ro_after_init hugetlb_cma and hugetlb_cma_only are initialized once during init and never changed. Link: https://lkml.kernel.org/r/20260112150954.1802953-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Zi Yan Acked-by: Muchun Song Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/hugetlb_cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index d8fa93825992..f83ae4998990 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -13,9 +13,9 @@ #include "hugetlb_cma.h" -static struct cma *hugetlb_cma[MAX_NUMNODES]; +static struct cma *hugetlb_cma[MAX_NUMNODES] __ro_after_init; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_only; +static bool hugetlb_cma_only __ro_after_init; static unsigned long hugetlb_cma_size __ro_after_init; void hugetlb_cma_free_frozen_folio(struct folio *folio) -- cgit v1.2.3 From dd2c6ec24fca9235ccd1b9bfd382d0ddb419e41a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 16 Jan 2026 13:20:53 +0000 Subject: selftests/mm: remove virtual_address_range test This self test is asserting internal implementation details and is highly vulnerable to internal kernel changes as a result. It is currently failing locally from at least v6.17, and it seems that it may have been failing for longer in many configurations/hardware as it skips if e.g. CONFIG_ANON_VMA_NAME is not specified. With these skips and the fact that run_vmtests.sh won't run the tests in certain configurations it is likely we have simply missed this test being broken in CI for a long while. I have tried multiple versions of these tests and am unable to find a working bisect as previous versions of the test fail also. The tests are essentially mmap()'ing a series of mappings with no hint and asserting what the get_unmapped_area*() functions will come up with, with seemingly few checks for what other mappings may already be in place. 
It then appears to be mmap()'ing with a hint, and making a series of similar assertions about the internal implementation details of the hinting logic. Commit 0ef3783d7558 ("selftests/mm: add support to test 4PB VA on PPC64"), commit 3bd6137220bb ("selftests/mm: virtual_address_range: avoid reading from VM_IO mappings"), and especially commit a005145b9c96 ("selftests/mm: virtual_address_range: mmap() without PROT_WRITE") are good examples of the whack-a-mole nature of maintaining this test. The last commit there being particularly pertinent as it was accounting for an internal implementation detail change that really should have no bearing on self-tests, that is commit e93d2521b27f ("x86/vdso: Split virtual clock pages into dedicated mapping"). The purpose of the mm self-tests are to assert attributes about the API exposed to users, and to ensure that expectations are met. This test is emphatically not doing this, rather making a series of assumptions about internal implementation details and asserting them. It therefore, sadly, seems that the best course is to remove this test altogether. Link: https://lkml.kernel.org/r/20260116132053.857887-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: SeongJae Park Cc: David Hildenbrand Cc: Liam Howlett Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 - tools/testing/selftests/mm/Makefile | 3 - tools/testing/selftests/mm/run_vmtests.sh | 12 - tools/testing/selftests/mm/virtual_address_range.c | 260 --------------------- 4 files changed, 276 deletions(-) delete mode 100644 tools/testing/selftests/mm/virtual_address_range.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index c2a8586e51a1..702e5723c35d 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -32,7 +32,6 @@ uffd-unit-tests uffd-wp-mremap mlock-intersect-test mlock-random-test -virtual_address_range gup_test va_128TBswitch map_fixed_noreplace diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index de4afc34e3b1..2fdb05e5a56a 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -136,9 +136,6 @@ endif ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch -ifneq ($(ARCH),riscv64) -TEST_GEN_FILES += virtual_address_range -endif TEST_GEN_FILES += write_to_hugetlbfs endif diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 2dadbfc6e535..452875db532c 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -399,18 +399,6 @@ CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison fi if [ $VADDR64 -ne 0 ]; then - - # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel - # allows high virtual address allocation requests independent - # of platform's physical memory. 
- - if [ -x ./virtual_address_range ]; then - prev_policy=$(cat /proc/sys/vm/overcommit_memory) - echo 1 > /proc/sys/vm/overcommit_memory - CATEGORY="hugevm" run_test ./virtual_address_range - echo $prev_policy > /proc/sys/vm/overcommit_memory - fi - # va high address boundary switch test CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh fi # VADDR64 diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c deleted file mode 100644 index 4f0923825ed7..000000000000 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ /dev/null @@ -1,260 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017, Anshuman Khandual, IBM Corp. - * - * Works on architectures which support 128TB virtual - * address range and beyond. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "vm_util.h" -#include "kselftest.h" - -/* - * Maximum address range mapped with a single mmap() - * call is little bit more than 1GB. Hence 1GB is - * chosen as the single chunk size for address space - * mapping. - */ - -#define SZ_1GB (1024 * 1024 * 1024UL) -#define SZ_1TB (1024 * 1024 * 1024 * 1024UL) - -#define MAP_CHUNK_SIZE SZ_1GB - -/* - * Address space till 128TB is mapped without any hint - * and is enabled by default. Address space beyond 128TB - * till 512TB is obtained by passing hint address as the - * first argument into mmap() system call. - * - * The process heap address space is divided into two - * different areas one below 128TB and one above 128TB - * till it reaches 512TB. One with size 128TB and the - * other being 384TB. - * - * On Arm64 the address space is 256TB and support for - * high mappings up to 4PB virtual address space has - * been added. - * - * On PowerPC64, the address space up to 128TB can be - * mapped without a hint. Addresses beyond 128TB, up to - * 4PB, can be mapped with a hint. 
- * - */ - -#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */ -#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) -#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) -#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL) -#define NR_CHUNKS_3968TB (NR_CHUNKS_128TB * 31UL) - -#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ -#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ - -#ifdef __aarch64__ -#define HIGH_ADDR_MARK ADDR_MARK_256TB -#define HIGH_ADDR_SHIFT 49 -#define NR_CHUNKS_LOW NR_CHUNKS_256TB -#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB -#elif defined(__PPC64__) -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_3968TB -#else -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_384TB -#endif - -static char *hint_addr(void) -{ - int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); - - return (char *) (1UL << bits); -} - -static void validate_addr(char *ptr, int high_addr) -{ - unsigned long addr = (unsigned long) ptr; - - if (high_addr) { - if (addr < HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); - return; - } - - if (addr > HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); -} - -static void mark_range(char *ptr, size_t size) -{ - if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, "virtual_address_range") == -1) { - if (errno == EINVAL) { - /* Depends on CONFIG_ANON_VMA_NAME */ - ksft_test_result_skip("prctl(PR_SET_VMA_ANON_NAME) not supported\n"); - ksft_finished(); - } else { - ksft_exit_fail_perror("prctl(PR_SET_VMA_ANON_NAME) failed\n"); - } - } -} - -static int is_marked_vma(const char *vma_name) -{ - return vma_name && !strcmp(vma_name, "[anon:virtual_address_range]\n"); -} - -static int validate_lower_address_hint(void) -{ - char *ptr; - - ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr == MAP_FAILED) - return 0; - - return 1; -} - -static int validate_complete_va_space(void) -{ - unsigned long start_addr, end_addr, prev_end_addr; - char line[400]; - char prot[6]; - FILE *file; - int fd; - - fd = open("va_dump", O_CREAT | O_WRONLY, 0600); - unlink("va_dump"); - if (fd < 0) { - ksft_test_result_skip("cannot create or open dump file\n"); - ksft_finished(); - } - - file = fopen("/proc/self/maps", "r"); - if (file == NULL) - ksft_exit_fail_msg("cannot open /proc/self/maps\n"); - - prev_end_addr = 0; - while (fgets(line, sizeof(line), file)) { - const char *vma_name = NULL; - int vma_name_start = 0; - unsigned long hop; - - if (sscanf(line, "%lx-%lx %4s %*s %*s %*s %n", - &start_addr, &end_addr, prot, &vma_name_start) != 3) - ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); - - if (vma_name_start) - vma_name = line + vma_name_start; - - /* end of userspace mappings; ignore vsyscall mapping */ - if (start_addr & (1UL << 63)) - return 0; - - /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ - if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) - return 1; - - prev_end_addr = end_addr; - - if (prot[0] != 'r') - continue; - - if (check_vmflag_io((void *)start_addr)) - continue; - - /* - * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. - * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 - * addresses after that. 
If the address was not held by this - * process, write would fail with errno set to EFAULT. - * Anyways, if write returns anything apart from 1, exit the - * program since that would mean a bug in /proc/self/maps. - */ - hop = 0; - while (start_addr + hop < end_addr) { - if (write(fd, (void *)(start_addr + hop), 1) != 1) - return 1; - lseek(fd, 0, SEEK_SET); - - if (is_marked_vma(vma_name)) - munmap((char *)(start_addr + hop), MAP_CHUNK_SIZE); - - hop += MAP_CHUNK_SIZE; - } - } - return 0; -} - -int main(int argc, char *argv[]) -{ - char *ptr[NR_CHUNKS_LOW]; - char **hptr; - char *hint; - unsigned long i, lchunks, hchunks; - - ksft_print_header(); - ksft_set_plan(1); - - for (i = 0; i < NR_CHUNKS_LOW; i++) { - ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); - break; - } - - mark_range(ptr[i], MAP_CHUNK_SIZE); - validate_addr(ptr[i], 0); - } - lchunks = i; - hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *)); - if (hptr == NULL) { - ksft_test_result_skip("Memory constraint not fulfilled\n"); - ksft_finished(); - } - - for (i = 0; i < NR_CHUNKS_HIGH; i++) { - hint = hint_addr(); - hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (hptr[i] == MAP_FAILED) - break; - - mark_range(hptr[i], MAP_CHUNK_SIZE); - validate_addr(hptr[i], 1); - } - hchunks = i; - if (validate_complete_va_space()) { - ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); - ksft_finished(); - } - - for (i = 0; i < lchunks; i++) - munmap(ptr[i], MAP_CHUNK_SIZE); - - for (i = 0; i < hchunks; i++) - munmap(hptr[i], MAP_CHUNK_SIZE); - - free(hptr); - - ksft_test_result_pass("Test\n"); - ksft_finished(); -} -- cgit v1.2.3 From 94a62284ede0250e48c886416041ad65907ee917 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:24 -0800 Subject: selftests/damon/sysfs_memcg_path_leak.sh: use kmemleak Patch series "selftests/damon: improve leak detection and wss estimation reliability". Two DAMON selftets, namely 'sysfs_memcg_leak' and 'sysfs_update_schemes_tried_regions_wss_estimation' frequently show intermittent failures due to their unreliable leak detection and working set size estimation. Make those more reliable. This patch (of 5): sysfs_memcg_path_leak.sh determines if the memory leak has happened by seeing if Slab size on /proc/meminfo increases more than expected after an action. Depending on the system and background workloads, the reasonable expectation varies. For the reason, the test frequently shows intermittent failures. Use kmemleak, which is much more reliable and correct, instead. Link: https://lkml.kernel.org/r/20260117020731.226785-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260117020731.226785-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/damon/sysfs_memcg_path_leak.sh | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh index 64c5d8c518a4..33a7ff43ed6c 100755 --- a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh +++ b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh @@ -14,6 +14,13 @@ then exit $ksft_skip fi +kmemleak="/sys/kernel/debug/kmemleak" +if [ ! 
-f "$kmemleak" ] +then + echo "$kmemleak not found" + exit $ksft_skip +fi + # ensure filter directory echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds" echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts" @@ -22,22 +29,17 @@ echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/nr_filters" filter_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/0" -before_kb=$(grep Slab /proc/meminfo | awk '{print $2}') - -# try to leak 3000 KiB -for i in {1..102400}; +# try to leak 128 times +for i in {1..128}; do echo "012345678901234567890123456789" > "$filter_dir/memcg_path" done -after_kb=$(grep Slab /proc/meminfo | awk '{print $2}') -# expect up to 1500 KiB free from other tasks memory -expected_after_kb_max=$((before_kb + 1500)) - -if [ "$after_kb" -gt "$expected_after_kb_max" ] +echo scan > "$kmemleak" +kmemleak_report=$(cat "$kmemleak") +if [ "$kmemleak_report" = "" ] then - echo "maybe memcg_path are leaking: $before_kb -> $after_kb" - exit 1 -else exit 0 fi +echo "$kmemleak_report" +exit 1 -- cgit v1.2.3 From 891d206e27dc1a684e460b079d2b53e17135d693 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:25 -0800 Subject: selftests/damon/wss_estimation: test for up to 160 MiB working set size DAMON reads and writes Accessed bits of page tables without manual TLB flush for two reasons. First, it minimizes the overhead. Second, real systems that need DAMON are expected to be memory intensive enough to cause periodic TLB flushes. For test setups that use small test workloads, however, the system's TLB could be big enough to cover whole or most accesses of the test workload. In this case, no page table walk happens and DAMON cannot show any access from the test workload. The test workload for DAMON's working set size estimation selftest is such a case. It accesses only 10 MiB working set, and it turned out there are test setups that have TLBs large enough to cover the 10 MiB data accesses. As a result, the test fails depending on the test machine. Make it more reliable by trying larger working sets up to 160 MiB when it fails. 
Link: https://lkml.kernel.org/r/20260117020731.226785-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- ..._update_schemes_tried_regions_wss_estimation.py | 29 +++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py index 90ad7409a7a6..bf48ef8e5241 100755 --- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py +++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py @@ -6,9 +6,8 @@ import time import _damon_sysfs -def main(): - # access two 10 MiB memory regions, 2 second per each - sz_region = 10 * 1024 * 1024 +def pass_wss_estimation(sz_region): + # access two regions of given size, 2 seocnds per each region proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( contexts=[_damon_sysfs.DamonCtx( @@ -36,20 +35,38 @@ def main(): wss_collected.append( kdamonds.kdamonds[0].contexts[0].schemes[0].tried_bytes) + err = kdamonds.stop() + if err is not None: + print('kdamond stop failed: %s' % err) + exit(1) wss_collected.sort() acceptable_error_rate = 0.2 for percentile in [50, 75]: sample = wss_collected[int(len(wss_collected) * percentile / 100)] error_rate = abs(sample - sz_region) / sz_region - print('%d-th percentile (%d) error %f' % - (percentile, sample, error_rate)) + print('%d-th percentile error %f (expect %d, result %d)' % + (percentile, error_rate, sz_region, sample)) if error_rate > acceptable_error_rate: print('the error rate is not acceptable (> %f)' % acceptable_error_rate) print('samples are as below') print('\n'.join(['%d' % wss for wss in wss_collected])) - exit(1) + return False + return True + +def main(): + # DAMON doesn't flush TLB. If the system has large TLB that can cover + # whole test working set, DAMON cannot see the access. Test up to 160 MiB + # test working set. + sz_region_mb = 10 + max_sz_region_mb = 160 + while sz_region_mb <= max_sz_region_mb: + test_pass = pass_wss_estimation(sz_region_mb * 1024 * 1024) + if test_pass is True: + exit(0) + sz_region_mb *= 2 + exit(1) if __name__ == '__main__': main() -- cgit v1.2.3 From 514d1bcb58e0ef93fafa4f9c3035d604a4219867 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:26 -0800 Subject: selftests/damon/access_memory: add repeat mode 'access_memory' is an artificial memory access generator program that is used for a few DAMON selftests. It accesses a given number of regions one by one only once, and exits. Depending on systems, the test workload may exit faster than expected, making the tests unreliable. For reliable control of the artificial memory access pattern, add a mode to make it repeat running. 
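The intended behaviour change amounts to something like the sketch below
(simplified; now_ms() is a placeholder helper, and the real program's timing
code and argument parsing are in the diff that follows):

	#include <time.h>

	/* placeholder helper, not part of the test program */
	static long now_ms(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
	}

	static void do_accesses(char **regions, int nr_regions, int sz_region,
				int access_time_ms, enum access_mode mode)
	{
		int i, j;

		do {
			for (i = 0; i < nr_regions; i++) {
				long start = now_ms();

				/* touch the whole region for access_time_ms */
				while (now_ms() - start < access_time_ms)
					for (j = 0; j < sz_region; j++)
						regions[i][j] = 'a';
			}
			/* ACCESS_MODE_ONCE keeps the old run-once behaviour */
		} while (mode == ACCESS_MODE_REPEAT);
	}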
Link: https://lkml.kernel.org/r/20260117020731.226785-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/access_memory.c | 29 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c index 56b17e8fe1be..567793b11107 100644 --- a/tools/testing/selftests/damon/access_memory.c +++ b/tools/testing/selftests/damon/access_memory.c @@ -8,6 +8,11 @@ #include #include +enum access_mode { + ACCESS_MODE_ONCE, + ACCESS_MODE_REPEAT, +}; + int main(int argc, char *argv[]) { char **regions; @@ -15,10 +20,12 @@ int main(int argc, char *argv[]) int nr_regions; int sz_region; int access_time_ms; + enum access_mode mode = ACCESS_MODE_ONCE; + int i; - if (argc != 4) { - printf("Usage: %s