From 6626734dd2b151753e134730e27d17e64784c345 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2026 15:46:37 +0000 Subject: mm_zone: Generalise has_managed_dma() It would be useful to be able to check for potential DMA pages beyond just ZONE_DMA - generalise the existing has_managed_dma() function to allow checking other zones too. Signed-off-by: Robin Murphy Acked-by: David Hildenbrand (Red Hat) Acked-by: Mike Rapoport (Microsoft) Tested-by: Vladimir Kondratiev Reviewed-by: Baoquan He Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/bd002d2351074e57be1ca08f03f333debac658fb.1768230104.git.robin.murphy@arm.com --- mm/page_alloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822e05f1a964..36ccc85c5073 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7418,20 +7418,16 @@ bool put_page_back_buddy(struct page *page) } #endif -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void) +bool has_managed_zone(enum zone_type zone) { struct pglist_data *pgdat; for_each_online_pgdat(pgdat) { - struct zone *zone = &pgdat->node_zones[ZONE_DMA]; - - if (managed_zone(zone)) + if (managed_zone(&pgdat->node_zones[zone])) return true; } return false; } -#endif /* CONFIG_ZONE_DMA */ #ifdef CONFIG_UNACCEPTED_MEMORY -- cgit v1.2.3 From 12a6ddfc76bb8a6d4508171d806c8632cf50a74a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 24 Dec 2025 12:33:56 -0500 Subject: mm: add missing static initializer for init_mm::mm_cid.lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialize the mm_cid.lock struct member of init_mm. 
Link: https://lkml.kernel.org/r/20251224173358.647691-2-mathieu.desnoyers@efficios.com Fixes: 8cea569ca785 ("sched/mmcid: Use proper data structures") Signed-off-by: Mathieu Desnoyers Reviewed-by: Thomas Gleixner Cc: Aboorva Devarajan Cc: Al Viro Cc: Baolin Wang Cc: Christan König Cc: Christian Brauner Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Liam R . Howlett" Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Martin Liu Cc: Masami Hiramatsu Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: "Paul E. McKenney" Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sweet Tea Dorminy Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wei Yang Cc: Yu Zhao Cc: Peter Zijlstra (Intel) Cc: Signed-off-by: Andrew Morton --- mm/init-mm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/init-mm.c b/mm/init-mm.c index 4600e7605cab..a514f8ce47e3 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -44,6 +44,9 @@ struct mm_struct init_mm = { .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, +#ifdef CONFIG_SCHED_MM_CID + .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(init_mm.mm_cid.lock), +#endif .cpu_bitmap = CPU_BITS_NONE, INIT_MM_CONTEXT(init_mm) }; -- cgit v1.2.3 From 6ac433f8b2590b09ca00863d218665729ac985f7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 24 Dec 2025 12:33:57 -0500 Subject: mm: rename cpu_bitmap field to flexible_array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpu_bitmap flexible array now contains more than just the cpu_bitmap. In preparation for changing the static mm_struct definitions to cover for the additional space required, change the cpu_bitmap type from "unsigned long" to "char", require an unsigned long alignment of the flexible array, and rename the field from "cpu_bitmap" to "flexible_array". 
Introduce the MM_STRUCT_FLEXIBLE_ARRAY_INIT macro to statically initialize the flexible array. This covers the init_mm and efi_mm static definitions. This is a preparation step for fixing the missing mm_cid size for static mm_struct definitions. Link: https://lkml.kernel.org/r/20251224173358.647691-3-mathieu.desnoyers@efficios.com Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID") Signed-off-by: Mathieu Desnoyers Reviewed-by: Thomas Gleixner Cc: Mark Brown Cc: Aboorva Devarajan Cc: Al Viro Cc: Baolin Wang Cc: Christan König Cc: Christian Brauner Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Liam R . Howlett" Cc: Lorenzo Stoakes Cc: Martin Liu Cc: Masami Hiramatsu Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: "Paul E. McKenney" Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sweet Tea Dorminy Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wei Yang Cc: Yu Zhao Cc: Peter Zijlstra (Intel) Cc: Signed-off-by: Andrew Morton --- drivers/firmware/efi/efi.c | 2 +- include/linux/mm_types.h | 13 +++++++++---- mm/init-mm.c | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index f5ff6e84a9b7..17b5f3415465 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -74,10 +74,10 @@ struct mm_struct efi_mm = { .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), .user_ns = &init_user_ns, - .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, #ifdef CONFIG_SCHED_MM_CID .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(efi_mm.mm_cid.lock), #endif + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, }; struct workqueue_struct *efi_rts_wq; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 42af2292951d..110b319a2ffb 100644 --- a/include/linux/mm_types.h +++ 
b/include/linux/mm_types.h @@ -1329,7 +1329,7 @@ struct mm_struct { * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. */ - unsigned long cpu_bitmap[]; + char flexible_array[] __aligned(__alignof__(unsigned long)); }; /* Copy value to the first system word of mm flags, non-atomically. */ @@ -1366,19 +1366,24 @@ static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; +#define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ +{ \ + [0 ... sizeof(cpumask_t)-1] = 0 \ +} + /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { unsigned long cpu_bitmap = (unsigned long)mm; - cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); + cpu_bitmap += offsetof(struct mm_struct, flexible_array); cpumask_clear((struct cpumask *)cpu_bitmap); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { - return (struct cpumask *)&mm->cpu_bitmap; + return (struct cpumask *)&mm->flexible_array; } #ifdef CONFIG_LRU_GEN @@ -1469,7 +1474,7 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) { unsigned long bitmap = (unsigned long)mm; - bitmap += offsetof(struct mm_struct, cpu_bitmap); + bitmap += offsetof(struct mm_struct, flexible_array); /* Skip cpu_bitmap */ bitmap += cpumask_size(); return (struct cpumask *)bitmap; diff --git a/mm/init-mm.c b/mm/init-mm.c index a514f8ce47e3..c5556bb9d5f0 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -47,7 +47,7 @@ struct mm_struct init_mm = { #ifdef CONFIG_SCHED_MM_CID .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(init_mm.mm_cid.lock), #endif - .cpu_bitmap = CPU_BITS_NONE, + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, INIT_MM_CONTEXT(init_mm) }; -- cgit v1.2.3 From b7880cb166ab62c2409046b2347261abf701530e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 9 Jan 2026 
04:13:42 +0000 Subject: migrate: correct lock ordering for hugetlb file folios Syzbot has found a deadlock (analyzed by Lance Yang): 1) Task (5749): Holds folio_lock, then tries to acquire i_mmap_rwsem(read lock). 2) Task (5754): Holds i_mmap_rwsem(write lock), then tries to acquire folio_lock. migrate_pages() -> migrate_hugetlbs() -> unmap_and_move_huge_page() <- Takes folio_lock! -> remove_migration_ptes() -> __rmap_walk_file() -> i_mmap_lock_read() <- Waits for i_mmap_rwsem(read lock)! hugetlbfs_fallocate() -> hugetlbfs_punch_hole() <- Takes i_mmap_rwsem(write lock)! -> hugetlbfs_zero_partial_page() -> filemap_lock_hugetlb_folio() -> filemap_lock_folio() -> __filemap_get_folio <- Waits for folio_lock! The migration path is the one taking locks in the wrong order according to the documentation at the top of mm/rmap.c. So expand the scope of the existing i_mmap_lock to cover the calls to remove_migration_ptes() too. This is (mostly) how it used to be after commit c0d0381ade79. That was removed by 336bf30eb765 for both file & anon hugetlb pages when it should only have been removed for anon hugetlb pages. 
Link: https://lkml.kernel.org/r/20260109041345.3863089-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Fixes: 336bf30eb765 ("hugetlbfs: fix anon huge page migration race") Reported-by: syzbot+2d9c96466c978346b55f@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/68e9715a.050a0220.1186a4.000d.GAE@google.com Debugged-by: Lance Yang Acked-by: David Hildenbrand (Red Hat) Acked-by: Zi Yan Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: Jann Horn Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Brost Cc: Rakie Kim Cc: Rik van Riel Cc: Vlastimil Babka Cc: Ying Huang Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 5169f9717f60..4688b9e38cd2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1458,6 +1458,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; + enum ttu_flags ttu = 0; if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ @@ -1498,8 +1499,6 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, goto put_anon; if (folio_mapped(src)) { - enum ttu_flags ttu = 0; - if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially @@ -1516,16 +1515,17 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, try_to_migrate(src, ttu); page_was_mapped = 1; - - if (ttu & TTU_RMAP_LOCKED) - i_mmap_unlock_write(mapping); } if (!folio_mapped(src)) rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, !rc ? dst : src, 0); + remove_migration_ptes(src, !rc ? dst : src, + ttu ? 
RMP_LOCKED : 0); + + if (ttu & TTU_RMAP_LOCKED) + i_mmap_unlock_write(mapping); unlock_put_anon: folio_unlock(dst); -- cgit v1.2.3 From 605f6586ecf78395f0185ab24c368fb46a06e434 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 12 Jan 2026 15:51:43 +0000 Subject: mm/vma: do not leak memory when .mmap_prepare swaps the file The current implementation of mmap() is set up such that a struct file object is obtained for the input fd in ksys_mmap_pgoff() via fget(), and its reference count is decremented at the end of the function via fput(). If a merge can be achieved, we are fine to simply decrement the refcount on the file. Otherwise, in __mmap_new_file_vma(), we increment the reference count on the file via get_file() such that the fput() in ksys_mmap_pgoff() does not free the now-referenced file object. The introduction of the f_op->mmap_prepare hook changes things, as it becomes possible for a driver to replace the file object right at the beginning of the mmap operation. The current implementation is buggy if this happens because it unconditionally calls get_file() on the mapping's file whether or not it was replaced (and thus whether or not its reference count will be decremented at the end of ksys_mmap_pgoff()). This results in a memory leak, and was exposed in commit ab04945f91bc ("mm: update mem char driver to use mmap_prepare"). This patch solves the problem by explicitly tracking whether we actually need to call get_file() on the file or not, and only doing so if required. 
Link: https://lkml.kernel.org/r/20260112155143.661284-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Fixes: ab04945f91bc ("mm: update mem char driver to use mmap_prepare") Reported-by: syzbot+bf5de69ebb4bdf86f59f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6964a92b.050a0220.eaf7.008a.GAE@google.com/ Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Liam Howlett Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vma.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vma.c b/mm/vma.c index dc92f3dd8514..7a908a964d18 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -37,6 +37,8 @@ struct mmap_state { bool check_ksm_early :1; /* If we map new, hold the file rmap lock on mapping. */ bool hold_file_rmap_lock :1; + /* If .mmap_prepare changed the file, we don't need to pin. */ + bool file_doesnt_need_get :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ @@ -2450,7 +2452,9 @@ static int __mmap_new_file_vma(struct mmap_state *map, struct vma_iterator *vmi = map->vmi; int error; - vma->vm_file = get_file(map->file); + vma->vm_file = map->file; + if (!map->file_doesnt_need_get) + get_file(map->file); if (!map->file->f_op->mmap) return 0; @@ -2638,7 +2642,10 @@ static int call_mmap_prepare(struct mmap_state *map, /* Update fields permitted to be changed. */ map->pgoff = desc->pgoff; - map->file = desc->vm_file; + if (desc->vm_file != map->file) { + map->file_doesnt_need_get = true; + map->file = desc->vm_file; + } map->vm_flags = desc->vm_flags; map->page_prot = desc->page_prot; /* User-defined fields. 
*/ -- cgit v1.2.3 From 90888b4ae103e65e5dfd438adb8d7d7ece91afd2 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Jan 2026 11:56:19 +0000 Subject: mm: remove unnecessary and incorrect mmap lock assert This check was introduced by commit 42fc541404f2 ("mmap locking API: add mmap_assert_locked() and mmap_assert_write_locked()") which replaced a VM_BUG_ON_VMA() over rwsem_is_locked from commit a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages"), i.e. the commit that introduced PUD THPs. These seem to be careful asserts introduced to ensure that locks are held in general, however for a zap we require that VMAs are kept stable, and this is a requirement that has held perfectly well for a long time. These were long before VMA locks and thus there appears to be no reason to think this assert is there for anything other than 'stabilised VMA'. Asserting that the VMA under examination is stable only in the case of a THP PUD is strange and unnecessary. If we wish to be careful and assert such things, we should do so at the zap level. However in any case the current situation is already simply incorrect - a VMA lock suffices here. Remove the assert for now as it is unnecessary, incorrect and unhelpful; subsequent work can introduce an assert in general for zapping if required. 
Link: https://lkml.kernel.org/r/20260114115619.1087466-1-lorenzo.stoakes@oracle.com Fixes: 2ab7f1bbafc9 ("mm/madvise: allow guard page install/remove under VMA lock") Signed-off-by: Lorenzo Stoakes Reported-by: Chris Mason Closes: https://lore.kernel.org/all/20260113220856.2358195-1-clm@meta.com/ Acked-by: David Hildenbrand (Red Hat) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 2a55edc48a65..a0822b564cc0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1963,10 +1963,9 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud)) { - if (next - addr != HPAGE_PUD_SIZE) { - mmap_assert_locked(tlb->mm); + if (next - addr != HPAGE_PUD_SIZE) split_huge_pud(vma, pud, addr); - } else if (zap_huge_pud(tlb, vma, pud, addr)) + else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } -- cgit v1.2.3 From 3937027caecb4f8251e82dd857ba1d749bb5a428 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 23 Dec 2025 22:40:35 +0100 Subject: mm/hugetlb: fix two comments related to huge_pmd_unshare() Ever since we stopped using the page count to detect shared PMD page tables, these comments are outdated. The only reason we have to flush the TLB early is because once we drop the i_mmap_rwsem, the previously shared page table could get freed (to then get reallocated and used for other purpose). So we really have to flush the TLB before that could happen. So let's simplify the comments a bit. The "If we unshared PMDs, the TLB flush was not recorded in mmu_gather." 
part introduced as in commit a4a118f2eead ("hugetlbfs: flush TLBs correctly after huge_pmd_unshare") was confusing: sure it is recorded in the mmu_gather, otherwise tlb_flush_mmu_tlbonly() wouldn't do anything. So let's drop that comment while at it as well. We'll centralize these comments in a single helper as we rework the code next. Link: https://lkml.kernel.org/r/20251223214037.580860-3-david@kernel.org Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Rik van Riel Tested-by: Laurence Oberman Reviewed-by: Lorenzo Stoakes Acked-by: Oscar Salvador Reviewed-by: Harry Yoo Cc: Liu Shixin Cc: Lance Yang Cc: "Uschakow, Stanislav" Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e0ab14020513..67131aa24d77 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5320,17 +5320,10 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_end_vma(tlb, vma); /* - * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We - * could defer the flush until now, since by holding i_mmap_rwsem we - * guaranteed that the last reference would not be dropped. But we must - * do the flushing before we return, as otherwise i_mmap_rwsem will be - * dropped and the last reference to the shared PMDs page might be - * dropped as well. - * - * In theory we could defer the freeing of the PMD pages as well, but - * huge_pmd_unshare() relies on the exact page_count for the PMD page to - * detect sharing, so we cannot defer the release of the page either. - * Instead, do flush now. + * There is nothing protecting a previously-shared page table that we + * unshared through huge_pmd_unshare() from getting freed after we + * release i_mmap_rwsem, so flush the TLB now. 
If huge_pmd_unshare() + * succeeded, flush the range corresponding to the pud. */ if (force_flush) tlb_flush_mmu_tlbonly(tlb); @@ -6552,11 +6545,10 @@ next: cond_resched(); } /* - * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare - * may have cleared our pud entry and done put_page on the page table: - * once we release i_mmap_rwsem, another task can do the final put_page - * and that page table be reused and filled with junk. If we actually - * did unshare a page of pmds, flush the range corresponding to the pud. + * There is nothing protecting a previously-shared page table that we + * unshared through huge_pmd_unshare() from getting freed after we + * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() + * succeeded, flush the range corresponding to the pud. */ if (shared_pmd) flush_hugetlb_tlb_range(vma, range.start, range.end); -- cgit v1.2.3 From a8682d500f691b6dfaa16ae1502d990aeb86e8be Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 23 Dec 2025 22:40:36 +0100 Subject: mm/rmap: fix two comments related to huge_pmd_unshare() PMD page table unsharing no longer touches the refcount of a PMD page table. Also, it is not about dropping the refcount of a "PMD page" but the "PMD page table". Let's just simplify by saying that the PMD page table was unmapped, consequently also unmapping the folio that was mapped into this page. This code should be deduplicated in the future. 
Link: https://lkml.kernel.org/r/20251223214037.580860-4-david@kernel.org Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Rik van Riel Tested-by: Laurence Oberman Reviewed-by: Lorenzo Stoakes Acked-by: Oscar Salvador Cc: Liu Shixin Cc: Harry Yoo Cc: Lance Yang Cc: "Uschakow, Stanislav" Cc: Signed-off-by: Andrew Morton --- mm/rmap.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index f955f02d570e..748f48727a16 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2016,14 +2016,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, flush_tlb_range(vma, range.start, range.end); /* - * The ref count of the PMD page was - * dropped which is part of the way map - * counting is done for shared PMDs. - * Return 'true' here. When there is - * no other sharing, huge_pmd_unshare - * returns false and we will unmap the - * actual page and drop map count - * to zero. + * The PMD table was unmapped, + * consequently unmapping the folio. */ goto walk_done; } @@ -2416,14 +2410,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, range.start, range.end); /* - * The ref count of the PMD page was - * dropped which is part of the way map - * counting is done for shared PMDs. - * Return 'true' here. When there is - * no other sharing, huge_pmd_unshare - * returns false and we will unmap the - * actual page and drop map count - * to zero. + * The PMD table was unmapped, + * consequently unmapping the folio. 
*/ page_vma_mapped_walk_done(&pvmw); break; -- cgit v1.2.3 From 8ce720d5bd91e9dc16db3604aa4b1bf76770a9a1 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 23 Dec 2025 22:40:37 +0100 Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race") we can end up in some situations where we perform so many IPI broadcasts when unsharing hugetlb PMD page tables that it severely regresses some workloads. In particular, when we fork()+exit(), or when we munmap() a large area backed by many shared PMD tables, we perform one IPI broadcast per unshared PMD table. There are two optimizations to be had: (1) When we process (unshare) multiple such PMD tables, such as during exit(), it is sufficient to send a single IPI broadcast (as long as we respect locking rules) instead of one per PMD table. Locking prevents any of these PMD tables from getting reused before we drop the lock. (2) When we are not the last sharer (> 2 users including us), there is no need to send the IPI broadcast. The shared PMD tables cannot become exclusive (fully unshared) before an IPI is broadcast by the last sharer. Concurrent GUP-fast could walk into a PMD table just before we unshared it. It could then succeed in grabbing a page from the shared page table even after munmap() etc succeeded (and suppressed an IPI). But there is no difference compared to GUP-fast just sleeping for a while after grabbing the page and re-enabling IRQs. Most importantly, GUP-fast will never walk into page tables that are no longer shared, because the last sharer will issue an IPI broadcast. 
(if ever required, checking whether the PUD changed in GUP-fast after grabbing the page like we do in the PTE case could handle this) So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather infrastructure so we can implement these optimizations and demystify the code at least a bit. Extend the mmu_gather infrastructure to be able to deal with our special hugetlb PMD table sharing implementation. To make initialization of the mmu_gather easier when working on a single VMA (in particular, when dealing with hugetlb), provide tlb_gather_mmu_vma(). We'll consolidate the handling for (full) unsharing of PMD tables in tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track in "struct mmu_gather" whether we had (full) unsharing of PMD tables. Because locking is very special (concurrent unsharing+reuse must be prevented), we disallow deferring flushing to tlb_finish_mmu() and instead require an explicit earlier call to tlb_flush_unshared_tables(). From hugetlb code, we call huge_pmd_unshare_flush() where we make sure that the expected lock protecting us from concurrent unsharing+reuse is still held. Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that tlb_flush_unshared_tables() was properly called earlier. Document it all properly. Notes about tlb_remove_table_sync_one() interaction with unsharing: There are two fairly tricky things: (1) tlb_remove_table_sync_one() is a NOP on architectures without CONFIG_MMU_GATHER_RCU_TABLE_FREE. Here, the assumption is that the previous TLB flush would send an IPI to all relevant CPUs. Careful: some architectures like x86 only send IPIs to all relevant CPUs when tlb->freed_tables is set. The relevant architectures should be selecting MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable kernels and it might have been problematic before this patch. Also, the arch flushing behavior (independent of IPIs) is different when tlb->freed_tables is set. 
Do we have to enlighten them to also take care of tlb->unshared_tables? So far we didn't care, so hopefully we are fine. Of course, we could be setting tlb->freed_tables as well, but that might then unnecessarily flush too much, because the semantics of tlb->freed_tables are a bit fuzzy. This patch changes nothing in this regard. (2) tlb_remove_table_sync_one() is not a NOP on architectures with CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync. Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB) we still issue IPIs during TLB flushes and don't actually need the second tlb_remove_table_sync_one(). This optimization can be implemented on top of this, by checking e.g., in tlb_remove_table_sync_one() whether we really need IPIs. But as described in (1), it really must honor tlb->freed_tables then to send IPIs to all relevant CPUs. Notes on TLB flushing changes: (1) Flushing for non-shared PMD tables We're converting from flush_hugetlb_tlb_range() to tlb_remove_huge_tlb_entry(). Given that we properly initialize the MMU gather in tlb_gather_mmu_vma() to be hugetlb aware, similar to __unmap_hugepage_range(), that should be fine. (2) Flushing for shared PMD tables We're converting from various things (flush_hugetlb_tlb_range(), tlb_flush_pmd_range(), flush_tlb_range()) to tlb_flush_pmd_range(). tlb_flush_pmd_range() achieves the same that tlb_remove_huge_tlb_entry() would achieve in these scenarios. Note that tlb_remove_huge_tlb_entry() also calls __tlb_remove_tlb_entry(), however that is only implemented on powerpc, which does not support PMD table sharing. Similar to (1), tlb_gather_mmu_vma() should make sure that TLB flushing keeps on working as expected. Further, note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a concern, as we are holding the i_mmap_lock the whole time, preventing concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed separately as a cleanup later. 
There are plenty more cleanups to be had, but they have to wait until this is fixed. [david@kernel.org: fix kerneldoc] Link: https://lkml.kernel.org/r/f223dd74-331c-412d-93fc-69e360a5006c@kernel.org Link: https://lkml.kernel.org/r/20251223214037.580860-5-david@kernel.org Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race") Signed-off-by: David Hildenbrand (Red Hat) Reported-by: Uschakow, Stanislav" Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/ Tested-by: Laurence Oberman Acked-by: Harry Yoo Reviewed-by: Lorenzo Stoakes Cc: Lance Yang Cc: Liu Shixin Cc: Oscar Salvador Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 77 ++++++++++++++++++++++++++++- include/linux/hugetlb.h | 15 ++++-- include/linux/mm_types.h | 1 + mm/hugetlb.c | 123 +++++++++++++++++++++++++++------------------- mm/mmu_gather.c | 33 +++++++++++++ mm/rmap.c | 25 +++++++--- 6 files changed, 208 insertions(+), 66 deletions(-) (limited to 'mm') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 1fff717cae51..4d679d2a206b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -46,7 +46,8 @@ * * The mmu_gather API consists of: * - * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() + * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_gather_mmu_vma() / + * tlb_finish_mmu() * * start and finish a mmu_gather * @@ -364,6 +365,20 @@ struct mmu_gather { unsigned int vma_huge : 1; unsigned int vma_pfn : 1; + /* + * Did we unshare (unmap) any shared page tables? For now only + * used for hugetlb PMD table sharing. + */ + unsigned int unshared_tables : 1; + + /* + * Did we unshare any page tables such that they are now exclusive + * and could get reused+modified by the new owner? When setting this + * flag, "unshared_tables" will be set as well. For now only used + * for hugetlb PMD table sharing. 
+ */ + unsigned int fully_unshared_tables : 1; + unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER @@ -400,6 +415,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; + tlb->unshared_tables = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an @@ -484,7 +500,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || - tlb->cleared_puds || tlb->cleared_p4ds)) + tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables)) return; tlb_flush(tlb); @@ -773,6 +789,63 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) } #endif +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt, + unsigned long addr) +{ + /* + * The caller must make sure that concurrent unsharing + exclusive + * reuse is impossible until tlb_flush_unshared_tables() was called. + */ + VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt)); + ptdesc_pmd_pts_dec(pt); + + /* Clearing a PUD pointing at a PMD table with PMD leaves. */ + tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE); + + /* + * If the page table is now exclusively owned, we fully unshared + * a page table. + */ + if (!ptdesc_pmd_is_shared(pt)) + tlb->fully_unshared_tables = true; + tlb->unshared_tables = true; +} + +static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb) +{ + /* + * As soon as the caller drops locks to allow for reuse of + * previously-shared tables, these tables could get modified and + * even reused outside of hugetlb context, so we have to make sure that + * any page table walkers (incl. TLB, GUP-fast) are aware of that + * change. + * + * Even if we are not fully unsharing a PMD table, we must + * flush the TLB for the unsharer now. 
+ */ + if (tlb->unshared_tables) + tlb_flush_mmu_tlbonly(tlb); + + /* + * Similarly, we must make sure that concurrent GUP-fast will not + * walk previously-shared page tables that are getting modified+reused + * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast. + * + * We only perform this when we are the last sharer of a page table, + * as the IPI will reach all CPUs: any GUP-fast. + * + * Note that on configs where tlb_remove_table_sync_one() is a NOP, + * the expectation is that the tlb_flush_mmu_tlbonly() would have issued + * required IPIs already for us. + */ + if (tlb->fully_unshared_tables) { + tlb_remove_table_sync_one(); + tlb->fully_unshared_tables = false; + } +} +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ + #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 03c8725efa28..e51b8ef0cebd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); @@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write( return NULL; } -static inline int huge_pmd_unshare(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline int huge_pmd_unshare(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } +static 
inline void huge_pmd_unshare_flush(struct mmu_gather *tlb, + struct vm_area_struct *vma) +{ +} + static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index aa4639888f89..78950eb8926d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1530,6 +1530,7 @@ static inline unsigned int mm_cid_size(void) struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma); extern void tlb_finish_mmu(struct mmu_gather *tlb); struct vm_fault; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67131aa24d77..a1832da0f623 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5112,7 +5112,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; - bool shared_pmd = false; + struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); @@ -5122,6 +5122,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, * range. 
*/ flush_cache_range(vma, range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); @@ -5138,8 +5139,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; - if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { - shared_pmd = true; + if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; @@ -5150,15 +5150,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); + tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr); } - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, old_end - len, old_end); + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); + mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); return len + old_addr - old_end; } @@ -5177,7 +5178,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); bool adjust_reservation; unsigned long last_addr_mask; - bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -5200,10 +5200,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(tlb, vma, address, ptep)) { spin_unlock(ptl); - tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); - force_flush = true; address |= last_addr_mask; continue; } @@ -5319,14 +5317,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } tlb_end_vma(tlb, vma); - /* - * There is nothing protecting a previously-shared page table that we - * unshared through 
huge_pmd_unshare() from getting freed after we - * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() - * succeeded, flush the range corresponding to the pud. - */ - if (force_flush) - tlb_flush_mmu_tlbonly(tlb); + huge_pmd_unshare_flush(tlb, vma); } void __hugetlb_zap_begin(struct vm_area_struct *vma, @@ -6425,11 +6416,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); - bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + struct mmu_gather tlb; /* * In the case of shared PMDs, the area to flush could be beyond @@ -6442,6 +6433,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); @@ -6468,7 +6460,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, } } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(&tlb, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. 
Warn about it if it @@ -6477,7 +6469,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma, WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); - shared_pmd = true; address |= last_addr_mask; continue; } @@ -6538,22 +6529,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; + tlb_remove_huge_tlb_entry(h, &tlb, ptep, address); } next: spin_unlock(ptl); cond_resched(); } - /* - * There is nothing protecting a previously-shared page table that we - * unshared through huge_pmd_unshare() from getting freed after we - * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() - * succeeded, flush the range corresponding to the pud. - */ - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, start, end); + + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new @@ -6564,6 +6549,7 @@ next: i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); return pages > 0 ? (pages << h->order) : pages; } @@ -6920,18 +6906,27 @@ out: return pte; } -/* - * unmap huge page backed by shared pte. +/** + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * @addr: the address we are trying to unshare. + * @ptep: pointer into the (pmd) page table. + * + * Called with the page table lock held, the i_mmap_rwsem held in write mode + * and the hugetlb vma lock held in write mode. * - * Called with page table lock held. + * Note: The caller must call huge_pmd_unshare_flush() before dropping the + * i_mmap_rwsem. 
* - * returns: 1 successfully unmapped a shared pte page - * 0 the underlying pte page is not shared, or it is the last user + * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it + * was not a shared PMD table. */ -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); @@ -6943,18 +6938,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); pud_clear(pud); - /* - * Once our caller drops the rmap lock, some other process might be - * using this page table as a normal, non-hugetlb page table. - * Wait for pending gup_fast() in other threads to finish before letting - * that happen. - */ - tlb_remove_table_sync_one(); - ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); + + tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); + mm_dec_nr_pmds(mm); return 1; } +/* + * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * + * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table + * unsharing with concurrent page table walkers. + * + * This function must be called after a sequence of huge_pmd_unshare() + * calls while still holding the i_mmap_rwsem. + */ +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + /* + * We must synchronize page table unsharing such that nobody will + * try reusing a previously-shared page table while it might still + * be in use by previous sharers (TLB, GUP_fast). 
+ */ + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + + tlb_flush_unshared_tables(tlb); +} + #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, @@ -6963,12 +6976,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; } -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { return 0; } +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ +} + void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { @@ -7235,6 +7252,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; + struct mmu_gather tlb; unsigned long address; spinlock_t *ptl; pte_t *ptep; @@ -7246,6 +7264,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, return; flush_cache_range(vma, start, end); + tlb_gather_mmu_vma(&tlb, vma); + /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. @@ -7264,10 +7284,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - huge_pmd_unshare(mm, vma, address, ptep); + huge_pmd_unshare(&tlb, vma, address, ptep); spin_unlock(ptl); } - flush_hugetlb_tlb_range(vma, start, end); + huge_pmd_unshare_flush(&tlb, vma); if (take_locks) { i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); @@ -7277,6 +7297,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, * Documentation/mm/mmu_notifier.rst. 
*/ mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); } /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 247e3f9db6c7..7468ec388455 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -426,6 +427,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #endif tlb->vma_pfn = 0; + tlb->fully_unshared_tables = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } @@ -459,6 +461,31 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) __tlb_gather_mmu(tlb, mm, true); } +/** + * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a + * single VMA + * @tlb: the mmu_gather structure to initialize + * @vma: the vm_area_struct + * + * Called to initialize an (on-stack) mmu_gather structure for operating on + * a single VMA. In contrast to tlb_gather_mmu(), calling this function will + * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(), + * this function will *not* call flush_cache_range(). + * + * For hugetlb VMAs, this function will also initialize the mmu_gather + * page_size accordingly, not requiring a separate call to + * tlb_change_page_size(). + * + */ +void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + tlb_gather_mmu(tlb, vma->vm_mm); + tlb_update_vma_flags(tlb, vma); + if (is_vm_hugetlb_page(vma)) + /* All entries have the same size. */ + tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma))); +} + /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish @@ -468,6 +495,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) */ void tlb_finish_mmu(struct mmu_gather *tlb) { + /* + * We expect an earlier huge_pmd_unshare_flush() call to sort this out, + * due to complicated locking requirements with page table unsharing. 
+ */ + VM_WARN_ON_ONCE(tlb->fully_unshared_tables); + /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB diff --git a/mm/rmap.c b/mm/rmap.c index 748f48727a16..7b9879ef442d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -76,7 +76,7 @@ #include #include -#include +#include #define CREATE_TRACE_POINTS #include @@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + + tlb_gather_mmu_vma(&tlb, vma); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. @@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, goto walk_done; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) @@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * fail if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. 
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); -- cgit v1.2.3 From 35e247032606f06c2f19d90a6562bc315206b7a7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Jan 2026 11:00:06 +0000 Subject: mm: do not copy page tables unnecessarily for VM_UFFD_WP Commit ab04b530e7e8 ("mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one") aggregates flags checks in vma_needs_copy(), including VM_UFFD_WP. However in doing so, it incorrectly performed this check against src_vma. This check was done on the assumption that all relevant flags are copied upon fork. However the userfaultfd logic is very innovative in that it implements custom logic on fork in dup_userfaultfd(), including a rather well hidden case where lacking UFFD_FEATURE_EVENT_FORK causes VM_UFFD_WP to not be propagated to the destination VMA. And indeed, vma_needs_copy(), prior to this patch, did check this property on dst_vma, not src_vma. Since all the other relevant flags are copied on fork, we can simply fix this by checking against dst_vma. While we're here, we fix a comment against VM_COPY_ON_FORK (noting that it did indeed already reference dst_vma) to make it abundantly clear that we must check against the destination VMA. 
Link: https://lkml.kernel.org/r/20260114110006.1047071-1-lorenzo.stoakes@oracle.com Fixes: ab04b530e7e8 ("mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one") Signed-off-by: Lorenzo Stoakes Reported-by: Chris Mason Closes: https://lore.kernel.org/all/20260113231257.3002271-1-clm@meta.com/ Acked-by: David Hildenbrand (Red Hat) Acked-by: Pedro Falcato Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++++- mm/memory.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6f959d8ca4b4..f0d5be9dc736 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -608,7 +608,11 @@ enum { /* * Flags which should result in page tables being copied on fork. These are * flags which indicate that the VMA maps page tables which cannot be - * reconsistuted upon page fault, so necessitate page table copying upon + * reconsistuted upon page fault, so necessitate page table copying upon fork. + * + * Note that these flags should be compared with the DESTINATION VMA not the + * source, as VM_UFFD_WP may not be propagated to destination, while all other + * flags will be. * * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be * reasonably reconstructed on page fault. diff --git a/mm/memory.c b/mm/memory.c index a0822b564cc0..da360a6eb8a4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1465,7 +1465,11 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { - if (src_vma->vm_flags & VM_COPY_ON_FORK) + /* + * We check against dst_vma as while sane VMA flags will have been + * copied, VM_UFFD_WP may be set only on dst_vma. 
+ */ + if (dst_vma->vm_flags & VM_COPY_ON_FORK) return true; /* * The presence of an anon_vma indicates an anonymous VMA has page -- cgit v1.2.3 From 9bc9ccbf4c935852e4916081dbce4c25a585ec7d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 16 Jan 2026 06:10:11 -0800 Subject: mm/kfence: fix potential deadlock in reboot notifier The reboot notifier callback can deadlock when calling cancel_delayed_work_sync() if toggle_allocation_gate() is blocked in wait_event_idle() waiting for allocations that might not happen on the shutdown path. The issue is that cancel_delayed_work_sync() waits for the work to complete, but the work is waiting for kfence_allocation_gate > 0 which requires allocations to happen (each allocation increases the gate by 1) - allocations that may have stopped during shutdown. Fix this by: 1. Using cancel_delayed_work() (non-sync) to avoid blocking. Now the callback succeeds and returns. 2. Adding wake_up() to unblock any waiting toggle_allocation_gate() 3. Adding !kfence_enabled to the wait condition so the wake succeeds The static_branch_disable() IPI will still execute after the wake, but at this early point in shutdown (reboot notifier runs with INT_MAX priority), the system is still functional and CPUs can respond to IPIs.
Link: https://lkml.kernel.org/r/20260116-kfence_fix-v1-1-4165a055933f@debian.org Fixes: ce2bba89566b ("mm/kfence: add reboot notifier to disable KFENCE on shutdown") Signed-off-by: Breno Leitao Reported-by: Chris Mason Closes: https://lore.kernel.org/all/20260113140234.677117-1-clm@meta.com/ Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Breno Leitao Cc: Chris Mason Cc: Dmitriy Vyukov Signed-off-by: Andrew Morton --- mm/kfence/core.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 577a1699c553..da0f5b6f5744 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -823,6 +823,9 @@ static struct notifier_block kfence_check_canary_notifier = { static struct delayed_work kfence_timer; #ifdef CONFIG_KFENCE_STATIC_KEYS +/* Wait queue to wake up allocation-gate timer task. */ +static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); + static int kfence_reboot_callback(struct notifier_block *nb, unsigned long action, void *data) { @@ -832,7 +835,12 @@ static int kfence_reboot_callback(struct notifier_block *nb, */ WRITE_ONCE(kfence_enabled, false); /* Cancel any pending timer work */ - cancel_delayed_work_sync(&kfence_timer); + cancel_delayed_work(&kfence_timer); + /* + * Wake up any blocked toggle_allocation_gate() so it can complete + * early while the system is still able to handle IPIs. + */ + wake_up(&allocation_wait); return NOTIFY_OK; } @@ -842,9 +850,6 @@ static struct notifier_block kfence_reboot_notifier = { .priority = INT_MAX, /* Run early to stop timers ASAP */ }; -/* Wait queue to wake up allocation-gate timer task. */ -static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); - static void wake_up_kfence_timer(struct irq_work *work) { wake_up(&allocation_wait); @@ -873,7 +878,9 @@ static void toggle_allocation_gate(struct work_struct *work) /* Enable static key, and await allocation to happen. 
*/ static_branch_enable(&kfence_allocation_key); - wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0); + wait_event_idle(allocation_wait, + atomic_read(&kfence_allocation_gate) > 0 || + !READ_ONCE(kfence_enabled)); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); -- cgit v1.2.3 From 16aca2c98a6fdf071e5a1a765a295995d7c7e346 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 16 Jan 2026 20:52:47 +0000 Subject: mm: restore per-memcg proactive reclaim with !CONFIG_NUMA Commit 2b7226af730c ("mm/memcg: make memory.reclaim interface generic") moved proactive reclaim logic from memory.reclaim handler to a generic user_proactive_reclaim() helper to be used for per-node proactive reclaim. However, user_proactive_reclaim() was only defined under CONFIG_NUMA, with a stub always returning 0 otherwise. This broke memory.reclaim on !CONFIG_NUMA configs, causing it to report success without actually attempting reclaim. Move the definition of user_proactive_reclaim() outside CONFIG_NUMA, and instead define a stub for __node_reclaim() in the !CONFIG_NUMA case. __node_reclaim() is only called from user_proactive_reclaim() when a write is made to sys/devices/system/node/nodeX/reclaim, which is only defined with CONFIG_NUMA. 
Link: https://lkml.kernel.org/r/20260116205247.928004-1-yosry.ahmed@linux.dev Fixes: 2b7226af730c ("mm/memcg: make memory.reclaim interface generic") Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Qi Zheng Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Cc: Signed-off-by: Andrew Morton --- mm/internal.h | 8 -------- mm/vmscan.c | 13 +++++++++++-- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index e430da900430..f35dbcf99a86 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -538,16 +538,8 @@ extern unsigned long highest_memmap_pfn; bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); -#ifdef CONFIG_NUMA int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat); -#else -static inline int user_proactive_reclaim(char *buf, - struct mem_cgroup *memcg, pg_data_t *pgdat) -{ - return 0; -} -#endif /* * in mm/rmap.c: diff --git a/mm/vmscan.c b/mm/vmscan.c index 670fe9fae5ba..614ccf39fe3f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7707,6 +7707,17 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) return ret; } +#else + +static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, + unsigned long nr_pages, + struct scan_control *sc) +{ + return 0; +} + +#endif + enum { MEMORY_RECLAIM_SWAPPINESS = 0, MEMORY_RECLAIM_SWAPPINESS_MAX, @@ -7814,8 +7825,6 @@ int user_proactive_reclaim(char *buf, return 0; } -#endif - /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list -- cgit v1.2.3 From 99a3e3a1cfc93b8fe318c0a3a5cfb01f1d4ad53c Mon Sep 17 00:00:00 2001 From: Swaraj Gaikwad Date: Tue, 13 Jan 2026 
20:36:39 +0530 Subject: slab: fix kmalloc_nolock() context check for PREEMPT_RT On PREEMPT_RT kernels, local_lock becomes a sleeping lock. The current check in kmalloc_nolock() only verifies we're not in NMI or hard IRQ context, but misses the case where preemption is disabled. When a BPF program runs from a tracepoint with preemption disabled (preempt_count > 0), kmalloc_nolock() proceeds to call local_lock_irqsave() which attempts to acquire a sleeping lock, triggering: BUG: sleeping function called from invalid context in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 6128 preempt_count: 2, expected: 0 Fix this by checking !preemptible() on PREEMPT_RT, which directly expresses the constraint that we cannot take a sleeping lock when preemption is disabled. This encompasses the previous checks for NMI and hard IRQ contexts while also catching cases where preemption is disabled. Fixes: af92793e52c3 ("slab: Introduce kmalloc_nolock() and kfree_nolock().") Reported-by: syzbot+b1546ad4a95331b2101e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b1546ad4a95331b2101e Signed-off-by: Swaraj Gaikwad Acked-by: Sebastian Andrzej Siewior Acked-by: Alexei Starovoitov Acked-by: Harry Yoo Link: https://patch.msgid.link/20260113150639.48407-1-swarajgaikwad1925@gmail.co Cc: Signed-off-by: Vlastimil Babka --- mm/slub.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 861592ac5425..f77b7407c51b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5694,8 +5694,12 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) if (unlikely(!size)) return ZERO_SIZE_PTR; - if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) - /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) + /* + * kmalloc_nolock() in PREEMPT_RT is not supported from + * non-preemptible context because local_lock becomes a + * sleeping lock 
on RT. + */ return NULL; retry: if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) -- cgit v1.2.3