diff options
Diffstat (limited to 'mm/huge_memory.c')
| -rw-r--r-- | mm/huge_memory.c | 1301 |
1 files changed, 796 insertions, 505 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1b81680b4225..f7c565f11a98 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,11 +37,11 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/compat.h> +#include <linux/pgalloc.h> #include <linux/pgalloc_tag.h> #include <linux/pagewalk.h> #include <asm/tlb.h> -#include <asm/pgalloc.h> #include "internal.h" #include "swap.h" @@ -214,7 +214,8 @@ retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; - zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) & + ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_folio) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); @@ -1076,28 +1077,103 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } +static struct deferred_split *split_queue_node(int nid) +{ + struct pglist_data *pgdata = NODE_DATA(nid); + + return &pgdata->deferred_split_queue; +} + #ifdef CONFIG_MEMCG static inline -struct deferred_split *get_deferred_split_queue(struct folio *folio) +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct mem_cgroup *memcg = folio_memcg(folio); - struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); + if (mem_cgroup_disabled()) + return NULL; + if (split_queue_node(folio_nid(folio)) == queue) + return NULL; + return container_of(queue, struct mem_cgroup, deferred_split_queue); +} - if (memcg) - return &memcg->deferred_split_queue; - else - return &pgdat->deferred_split_queue; +static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) +{ + return memcg ? &memcg->deferred_split_queue : split_queue_node(nid); } #else static inline -struct deferred_split *get_deferred_split_queue(struct folio *folio) +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); + return NULL; +} - return &pgdat->deferred_split_queue; +static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) +{ + return split_queue_node(nid); } #endif +static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg) +{ + struct deferred_split *queue; + +retry: + queue = memcg_split_queue(nid, memcg); + spin_lock(&queue->split_queue_lock); + /* + * There is a period between setting memcg to dying and reparenting + * deferred split queue, and during this period the THPs in the deferred + * split queue will be hidden from the shrinker side. + */ + if (unlikely(memcg_is_dying(memcg))) { + spin_unlock(&queue->split_queue_lock); + memcg = parent_mem_cgroup(memcg); + goto retry; + } + + return queue; +} + +static struct deferred_split * +split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags) +{ + struct deferred_split *queue; + +retry: + queue = memcg_split_queue(nid, memcg); + spin_lock_irqsave(&queue->split_queue_lock, *flags); + if (unlikely(memcg_is_dying(memcg))) { + spin_unlock_irqrestore(&queue->split_queue_lock, *flags); + memcg = parent_mem_cgroup(memcg); + goto retry; + } + + return queue; +} + +static struct deferred_split *folio_split_queue_lock(struct folio *folio) +{ + return split_queue_lock(folio_nid(folio), folio_memcg(folio)); +} + +static struct deferred_split * +folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) +{ + return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); +} + +static inline void split_queue_unlock(struct deferred_split *queue) +{ + spin_unlock(&queue->split_queue_lock); +} + +static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, + unsigned long flags) +{ + spin_unlock_irqrestore(&queue->split_queue_lock, flags); +} + static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) @@ -1126,7 +1202,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, + ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad, off >> PAGE_SHIFT, flags, vm_flags); /* @@ -1163,7 +1239,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add if (ret) return ret; - return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, + return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); } @@ -1217,7 +1293,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, return folio; } -static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, struct vm_area_struct *vma, unsigned long haddr) { pmd_t entry; @@ -1228,6 +1304,13 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, folio_add_lru_vma(folio, vma); set_pmd_at(vma->vm_mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, haddr, pmd); + deferred_split_folio(folio, false); +} + +static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr) +{ + map_anon_folio_pmd_nopf(folio, pmd, vma, haddr); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); @@ -1270,9 +1353,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); - deferred_split_folio(folio, false); spin_unlock(vmf->ptl); } @@ -1287,6 +1369,44 @@ release: } +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; + spinlock_t *ptl; + softleaf_t entry; + struct page *page; + struct folio *folio; + + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + + ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) { + spin_unlock(ptl); + return 0; + } + + entry = softleaf_from_pmd(vmf->orig_pmd); + page = softleaf_to_page(entry); + folio = page_folio(page); + vmf->page = page; + vmf->pte = NULL; + if (folio_trylock(folio)) { + folio_get(folio); + spin_unlock(ptl); + ret = page_pgmap(page)->ops->migrate_to_ram(vmf); + folio_unlock(folio); + folio_put(folio); + } else { + spin_unlock(ptl); + } + + return ret; +} + /* * always: directly stall for all thp allocations * defer: wake kswapd and fail if not immediately available @@ -1641,17 +1761,86 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +/** + * touch_pmd - Mark page table pmd entry as accessed and dirty (for write) + * @vma: The VMA covering @addr + * @addr: The virtual address + * @pmd: pmd pointer into the page table mapping @addr + * @write: Whether it's a write access + * + * Return: whether the pmd entry is changed + */ +bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write) { - pmd_t _pmd; + pmd_t entry; - _pmd = pmd_mkyoung(*pmd); + entry = pmd_mkyoung(*pmd); if (write) - _pmd = pmd_mkdirty(_pmd); + entry = pmd_mkdirty(entry); if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, write)) + pmd, entry, write)) { update_mmu_cache_pmd(vma, addr, pmd); + return true; + } + + return false; +} + +static void copy_huge_non_present_pmd( + struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t pmd, pgtable_t pgtable) +{ + softleaf_t entry = softleaf_from_pmd(pmd); + struct folio *src_folio; + + VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd)); + + if (softleaf_is_migration_write(entry) || + softleaf_is_migration_read_exclusive(entry)) { + entry = make_readable_migration_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } else if (softleaf_is_device_private(entry)) { + /* + * For device private entries, since there are no + * read exclusive entries, writable = !readable + */ + if (softleaf_is_device_private_write(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + + src_folio = softleaf_to_folio(entry); + VM_WARN_ON(!folio_test_large(src_folio)); + + folio_get(src_folio); + /* + * folio_try_dup_anon_rmap_pmd does not fail for + * device private entries. + */ + folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, + dst_vma, src_vma); + } + + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_swp_clear_uffd_wp(pmd); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -1699,31 +1888,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - if (unlikely(is_swap_pmd(pmd))) { - swp_entry_t entry = pmd_to_swp_entry(pmd); - - VM_BUG_ON(!is_pmd_migration_entry(pmd)); - if (!is_readable_migration_entry(entry)) { - entry = make_readable_migration_entry( - swp_offset(entry)); - pmd = swp_entry_to_pmd(entry); - if (pmd_swp_soft_dirty(*src_pmd)) - pmd = pmd_swp_mksoft_dirty(pmd); - if (pmd_swp_uffd_wp(*src_pmd)) - pmd = pmd_swp_mkuffd_wp(pmd); - set_pmd_at(src_mm, addr, src_pmd, pmd); - } - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm_inc_nr_ptes(dst_mm); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); - if (!userfaultfd_wp(dst_vma)) - pmd = pmd_swp_clear_uffd_wp(pmd); - set_pmd_at(dst_mm, addr, dst_pmd, pmd); + if (unlikely(thp_migration_supported() && + pmd_is_valid_softleaf(pmd))) { + copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, + dst_vma, src_vma, pmd, pgtable); ret = 0; goto out_unlock; } -#endif if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); @@ -1841,18 +2012,14 @@ unlock: } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void huge_pmd_set_accessed(struct vm_fault *vmf) +bool huge_pmd_set_accessed(struct vm_fault *vmf) { bool write = vmf->flags & FAULT_FLAG_WRITE; - vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) - goto unlock; - - touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); + return false; -unlock: - spin_unlock(vmf->ptl); + return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); } static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) @@ -1877,7 +2044,7 @@ static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) if (ret) goto release; (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); goto unlock; release: folio_put(folio); @@ -2113,7 +2280,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); + !pmd_is_migration_entry(orig_pmd)); goto out; } @@ -2211,15 +2378,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (thp_migration_supported()) { - swp_entry_t entry; + } else if (pmd_is_valid_softleaf(orig_pmd)) { + const softleaf_t entry = softleaf_from_pmd(orig_pmd); - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); - entry = pmd_to_swp_entry(orig_pmd); - folio = pfn_swap_entry_folio(entry); + folio = softleaf_to_folio(entry); flush_needed = 0; - } else - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + + if (!thp_migration_supported()) + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + } if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); @@ -2239,6 +2406,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_mark_accessed(folio); } + if (folio_is_device_private(folio)) { + folio_remove_rmap_pmd(folio, &folio->page, vma); + WARN_ON_ONCE(folio_mapcount(folio) < 0); + folio_put(folio); + } + spin_unlock(ptl); if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); @@ -2263,20 +2436,23 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, static pmd_t move_soft_dirty_pmd(pmd_t pmd) { -#ifdef CONFIG_MEM_SOFT_DIRTY - if (unlikely(is_pmd_migration_entry(pmd))) - pmd = pmd_swp_mksoft_dirty(pmd); - else if (pmd_present(pmd)) - pmd = pmd_mksoft_dirty(pmd); -#endif + if (pgtable_supports_soft_dirty()) { + if (unlikely(pmd_is_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); + } + return pmd; } static pmd_t clear_uffd_wp_pmd(pmd_t pmd) { + if (pmd_none(pmd)) + return pmd; if (pmd_present(pmd)) pmd = pmd_clear_uffd_wp(pmd); - else if (is_swap_pmd(pmd)) + else pmd = pmd_swp_clear_uffd_wp(pmd); return pmd; @@ -2333,6 +2509,42 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, return false; } +static void change_non_present_huge_pmd(struct mm_struct *mm, + unsigned long addr, pmd_t *pmd, bool uffd_wp, + bool uffd_wp_resolve) +{ + softleaf_t entry = softleaf_from_pmd(*pmd); + const struct folio *folio = softleaf_to_folio(entry); + pmd_t newpmd; + + VM_WARN_ON(!pmd_is_valid_softleaf(*pmd)); + if (softleaf_is_migration_write(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + if (folio_test_anon(folio)) + entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*pmd)) + newpmd = pmd_swp_mksoft_dirty(newpmd); + } else if (softleaf_is_device_private_write(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); + } else { + newpmd = *pmd; + } + + if (uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); +} + /* * Returns * - 0 if PMD could not be locked @@ -2361,42 +2573,14 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - if (is_swap_pmd(*pmd)) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = pfn_swap_entry_folio(entry); - pmd_t newpmd; - - VM_BUG_ON(!is_pmd_migration_entry(*pmd)); - if (is_writable_migration_entry(entry)) { - /* - * A protection check is difficult so - * just be safe and disable write - */ - if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry(swp_offset(entry)); - else - entry = make_readable_migration_entry(swp_offset(entry)); - newpmd = swp_entry_to_pmd(entry); - if (pmd_swp_soft_dirty(*pmd)) - newpmd = pmd_swp_mksoft_dirty(newpmd); - } else { - newpmd = *pmd; - } - - if (uffd_wp) - newpmd = pmd_swp_mkuffd_wp(newpmd); - else if (uffd_wp_resolve) - newpmd = pmd_swp_clear_uffd_wp(newpmd); - if (!pmd_same(*pmd, newpmd)) - set_pmd_at(mm, addr, pmd, newpmd); + if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) { + change_non_present_huge_pmd(mm, addr, pmd, uffd_wp, + uffd_wp_resolve); goto unlock; } -#endif if (prot_numa) { - struct folio *folio; - bool toptier; + /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -2408,19 +2592,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_protnone(*pmd)) goto unlock; - folio = pmd_folio(*pmd); - toptier = node_is_toptier(folio_nid(folio)); - /* - * Skip scanning top tier node if normal numa - * balancing is disabled - */ - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - toptier) + if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma, + vma_is_single_threaded_private(vma))) goto unlock; - - if (folio_use_access_time(folio)) - folio_xchg_access_time(folio, - jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical @@ -2533,7 +2707,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm pmd_t _dst_pmd, src_pmdval; struct page *src_page; struct folio *src_folio; - struct anon_vma *src_anon_vma; spinlock_t *src_ptl, *dst_ptl; pgtable_t src_pgtable; struct mmu_notifier_range range; @@ -2555,7 +2728,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm if (!pmd_trans_huge(src_pmdval)) { spin_unlock(src_ptl); - if (is_pmd_migration_entry(src_pmdval)) { + if (pmd_is_migration_entry(src_pmdval)) { pmd_migration_entry_wait(mm, &src_pmdval); return -EAGAIN; } @@ -2582,23 +2755,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); - if (src_folio) { + if (src_folio) folio_lock(src_folio); - /* - * split_huge_page walks the anon_vma chain without the page - * lock. Serialize against it with the anon_vma lock, the page - * lock is not enough. - */ - src_anon_vma = folio_get_anon_vma(src_folio); - if (!src_anon_vma) { - err = -EAGAIN; - goto unlock_folio; - } - anon_vma_lock_write(src_anon_vma); - } else - src_anon_vma = NULL; - dst_ptl = pmd_lockptr(mm, dst_pmd); double_pt_lock(src_ptl, dst_ptl); if (unlikely(!pmd_same(*src_pmd, src_pmdval) || @@ -2643,11 +2802,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); unlock_ptls: double_pt_unlock(src_ptl, dst_ptl); - if (src_anon_vma) { - anon_vma_unlock_write(src_anon_vma); - put_anon_vma(src_anon_vma); - } -unlock_folio: /* unblock rmap walks */ if (src_folio) folio_unlock(src_folio); @@ -2667,8 +2821,9 @@ unlock_folio: spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; + ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))) + if (likely(pmd_is_huge(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -2834,7 +2989,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; + bool soft_dirty, uffd_wp = false, young = false, write = false; bool anon_exclusive = false, dirty = false; unsigned long addr; pte_t *pte; @@ -2843,7 +2998,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); + + VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2857,11 +3013,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, zap_deposited_table(mm, pmd); if (!vma_is_dax(vma) && vma_is_special_huge(vma)) return; - if (unlikely(is_pmd_migration_entry(old_pmd))) { - swp_entry_t entry; + if (unlikely(pmd_is_migration_entry(old_pmd))) { + const softleaf_t old_entry = softleaf_from_pmd(old_pmd); - entry = pmd_to_swp_entry(old_pmd); - folio = pfn_swap_entry_folio(entry); + folio = softleaf_to_folio(old_entry); } else if (is_huge_zero_pmd(old_pmd)) { return; } else { @@ -2891,20 +3046,54 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - pmd_migration = is_pmd_migration_entry(*pmd); - if (unlikely(pmd_migration)) { - swp_entry_t entry; + if (pmd_is_migration_entry(*pmd)) { + softleaf_t entry; old_pmd = *pmd; - entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); - write = is_writable_migration_entry(entry); + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); + folio = page_folio(page); + + soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); + + write = softleaf_is_migration_write(entry); if (PageAnon(page)) - anon_exclusive = is_readable_exclusive_migration_entry(entry); - young = is_migration_entry_young(entry); - dirty = is_migration_entry_dirty(entry); + anon_exclusive = softleaf_is_migration_read_exclusive(entry); + young = softleaf_is_migration_young(entry); + dirty = softleaf_is_migration_dirty(entry); + } else if (pmd_is_device_private_entry(*pmd)) { + softleaf_t entry; + + old_pmd = *pmd; + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); + folio = page_folio(page); + soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); + + write = softleaf_is_device_private_write(entry); + anon_exclusive = PageAnonExclusive(page); + + /* + * Device private THP should be treated the same as regular + * folios w.r.t anon exclusive handling. See the comments for + * folio handling and anon_exclusive below. + */ + if (freeze && anon_exclusive && + folio_try_share_anon_rmap_pmd(folio, page)) + freeze = false; + if (!freeze) { + rmap_t rmap_flags = RMAP_NONE; + + folio_ref_add(folio, HPAGE_PMD_NR - 1); + if (anon_exclusive) + rmap_flags |= RMAP_EXCLUSIVE; + + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, + vma, haddr, rmap_flags); + } } else { /* * Up to this point the pmd is present and huge and userland has @@ -2988,11 +3177,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * Note that NUMA hinting access restrictions are not transferred to * avoid any possibility of altering permissions across VMAs. */ - if (freeze || pmd_migration) { - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry; - swp_entry_t swp_entry; + if (freeze || pmd_is_migration_entry(old_pmd)) { + pte_t entry; + swp_entry_t swp_entry; + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); @@ -3011,7 +3200,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + set_pte_at(mm, addr, pte + i, entry); + } + } else if (pmd_is_device_private_entry(old_pmd)) { + pte_t entry; + swp_entry_t swp_entry; + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + /* + * anon_exclusive was already propagated to the relevant + * pages corresponding to the pte entries when freeze + * is false. + */ + if (write) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page + i)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page + i)); + /* + * Young and dirty bits are not progated via swp_entry + */ + entry = swp_entry_to_pte(swp_entry); + if (soft_dirty) + entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_pte_at(mm, addr, pte + i, entry); } @@ -3038,7 +3253,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap(pte); - if (!pmd_migration) + if (!pmd_is_migration_entry(*pmd)) folio_remove_rmap_pmd(folio, page, vma); if (freeze) put_page(page); @@ -3051,7 +3266,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze) { VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) + if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); } @@ -3230,6 +3445,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); lockdep_assert_held(&lruvec->lru_lock); + if (folio_is_device_private(folio)) + return; + if (list) { /* page reclaim is reclaiming a huge page */ VM_WARN_ON(folio_test_lru(folio)); @@ -3263,6 +3481,14 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) caller_pins; } +static bool page_range_has_hwpoisoned(struct page *page, long nr_pages) +{ + for (; nr_pages; page++, nr_pages--) + if (PageHWPoison(page)) + return true; + return false; +} + /* * It splits @folio into @new_order folios and copies the @folio metadata to * all the resulting folios. @@ -3270,17 +3496,24 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) static void __split_folio_to_order(struct folio *folio, int old_order, int new_order) { + /* Scan poisoned pages when split a poisoned folio to large folios */ + const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order; long new_nr_pages = 1 << new_order; long nr_pages = 1 << old_order; long i; + folio_clear_has_hwpoisoned(folio); + + /* Check first new_nr_pages since the loop below skips them */ + if (handle_hwpoison && + page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages)) + folio_set_has_hwpoisoned(folio); /* * Skip the first new_nr_pages, since the new folio from them have all * the flags from the original folio. */ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { struct page *new_head = &folio->page + i; - /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head(). @@ -3322,18 +3555,13 @@ static void __split_folio_to_order(struct folio *folio, int old_order, (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); + if (handle_hwpoison && + page_range_has_hwpoisoned(new_head, new_nr_pages)) + folio_set_has_hwpoisoned(new_folio); + new_folio->mapping = folio->mapping; new_folio->index = folio->index + i; - /* - * page->private should not be set in tail pages. Fix up and warn once - * if private is unexpectedly set. - */ - if (unlikely(new_folio->private)) { - VM_WARN_ON_ONCE_PAGE(true, new_head); - new_folio->private = NULL; - } - if (folio_test_swapcache(folio)) new_folio->swap.val = folio->swap.val + i; @@ -3369,8 +3597,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order, ClearPageCompound(&folio->page); } -/* - * It splits an unmapped @folio to lower order smaller folios in two ways. +/** + * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in + * two ways: uniform split or non-uniform split. * @folio: the to-be-split folio * @new_order: the smallest order of the after split folios (since buddy * allocator like split generates folios with orders from @folio's @@ -3379,66 +3608,56 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * will be split until its order becomes @new_order. * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller * @mapping: @folio->mapping - * @uniform_split: if the split is uniform or not (buddy allocator like split) + * @split_type: if the split is uniform or not (buddy allocator like split) * * * 1. uniform split: the given @folio into multiple @new_order small folios, * where all small folios have the same order. This is done when - * uniform_split is true. + * split_type is SPLIT_TYPE_UNIFORM. * 2. buddy allocator like (non-uniform) split: the given @folio is split into * half and one of the half (containing the given page) is split into half - * until the given @page's order becomes @new_order. This is done when - * uniform_split is false. + * until the given @folio's order becomes @new_order. This is done when + * split_type is SPLIT_TYPE_NON_UNIFORM. * * The high level flow for these two methods are: - * 1. uniform split: a single __split_folio_to_order() is called to split the - * @folio into @new_order, then we traverse all the resulting folios one by - * one in PFN ascending order and perform stats, unfreeze, adding to list, - * and file mapping index operations. - * 2. non-uniform split: in general, folio_order - @new_order calls to - * __split_folio_to_order() are made in a for loop to split the @folio - * to one lower order at a time. The resulting small folios are processed - * like what is done during the traversal in 1, except the one containing - * @page, which is split in next for loop. + * + * 1. uniform split: @xas is split with no expectation of failure and a single + * __split_folio_to_order() is called to split the @folio into @new_order + * along with stats update. + * 2. non-uniform split: folio_order - @new_order calls to + * __split_folio_to_order() are expected to be made in a for loop to split + * the @folio to one lower order at a time. The folio containing @split_at + * is split in each iteration. @xas is split into half in each iteration and + * can fail. A failed @xas split leaves split folios as is without merging + * them back. * * After splitting, the caller's folio reference will be transferred to the - * folio containing @page. The caller needs to unlock and/or free after-split - * folios if necessary. + * folio containing @split_at. The caller needs to unlock and/or free + * after-split folios if necessary. * - * For !uniform_split, when -ENOMEM is returned, the original folio might be - * split. The caller needs to check the input folio. + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ static int __split_unmapped_folio(struct folio *folio, int new_order, struct page *split_at, struct xa_state *xas, - struct address_space *mapping, bool uniform_split) + struct address_space *mapping, enum split_type split_type) { - int order = folio_order(folio); - int start_order = uniform_split ? new_order : order - 1; - bool stop_split = false; - struct folio *next; + const bool is_anon = folio_test_anon(folio); + int old_order = folio_order(folio); + int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1; int split_order; - int ret = 0; - - if (folio_test_anon(folio)) - mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - - folio_clear_has_hwpoisoned(folio); /* * split to new_order one order at a time. For uniform split, * folio is split to new_order directly. */ for (split_order = start_order; - split_order >= new_order && !stop_split; + split_order >= new_order; split_order--) { - struct folio *end_folio = folio_next(folio); - int old_order = folio_order(folio); - struct folio *new_folio; + int nr_new_folios = 1UL << (old_order - split_order); /* order-1 anonymous folio is not supported */ - if (folio_test_anon(folio) && split_order == 1) - continue; - if (uniform_split && split_order != new_order) + if (is_anon && split_order == 1) continue; if (mapping) { @@ -3447,79 +3666,81 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, * irq is disabled to allocate enough memory, whereas * non-uniform split can handle ENOMEM. */ - if (uniform_split) + if (split_type == SPLIT_TYPE_UNIFORM) xas_split(xas, folio, old_order); else { xas_set_order(xas, folio->index, split_order); xas_try_split(xas, folio, old_order); - if (xas_error(xas)) { - ret = xas_error(xas); - stop_split = true; - } + if (xas_error(xas)) + return xas_error(xas); } } - if (!stop_split) { - folio_split_memcg_refs(folio, old_order, split_order); - split_page_owner(&folio->page, old_order, split_order); - pgalloc_tag_split(folio, old_order, split_order); + folio_split_memcg_refs(folio, old_order, split_order); + split_page_owner(&folio->page, old_order, split_order); + pgalloc_tag_split(folio, old_order, split_order); + __split_folio_to_order(folio, old_order, split_order); - __split_folio_to_order(folio, old_order, split_order); + if (is_anon) { + mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1); + mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios); } - /* - * Iterate through after-split folios and update folio stats. - * But in buddy allocator like split, the folio - * containing the specified page is skipped until its order - * is new_order, since the folio will be worked on in next - * iteration. + * If uniform split, the process is complete. + * If non-uniform, continue splitting the folio at @split_at + * as long as the next @split_order is >= @new_order. */ - for (new_folio = folio; new_folio != end_folio; new_folio = next) { - next = folio_next(new_folio); - /* - * for buddy allocator like split, new_folio containing - * @split_at page could be split again, thus do not - * change stats yet. Wait until new_folio's order is - * @new_order or stop_split is set to true by the above - * xas_split() failure. - */ - if (new_folio == page_folio(split_at)) { - folio = new_folio; - if (split_order != new_order && !stop_split) - continue; - } - if (folio_test_anon(new_folio)) - mod_mthp_stat(folio_order(new_folio), - MTHP_STAT_NR_ANON, 1); - } + folio = page_folio(split_at); + old_order = split_order; } - return ret; + return 0; } -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns) +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns) { if (folio_test_anon(folio)) { /* order-1 is not supported for anonymous THP. */ VM_WARN_ONCE(warns && new_order == 1, "Cannot split to order-1 folio"); - return new_order != 1; - } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - !mapping_large_folio_support(folio->mapping)) { - /* - * No split if the file system does not support large folio. - * Note that we might still have THPs in such mappings due to - * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping - * does not actually support large folios properly. - */ - VM_WARN_ONCE(warns, - "Cannot split file folio to non-0 order"); - return false; + if (new_order == 1) + return false; + } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) { + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { + /* + * We can always split a folio down to a single page + * (new_order == 0) uniformly. + * + * For any other scenario + * a) uniform split targeting a large folio + * (new_order > 0) + * b) any non-uniform split + * we must confirm that the file system supports large + * folios. + * + * Note that we might still have THPs in such + * mappings, which is created from khugepaged when + * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that + * case, the mapping does not actually support large + * folios properly. + */ + VM_WARN_ONCE(warns, + "Cannot split file folio to non-0 order"); + return false; + } } - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) { + /* + * swapcache folio could only be split to order 0 + * + * non-uniform split creates after-split folios with orders from + * folio_order(folio) - 1 to new_order, making it not suitable for any + * swapcache folio split. Only uniform split to order-0 can be used + * here. + */ + if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) { VM_WARN_ONCE(warns, "Cannot split swapcache folio to non-0 order"); return false; @@ -3528,40 +3749,160 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, return true; } -/* See comments in non_uniform_split_supported() */ -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns) +static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order, + struct page *split_at, struct xa_state *xas, + struct address_space *mapping, bool do_lru, + struct list_head *list, enum split_type split_type, + pgoff_t end, int *nr_shmem_dropped, int extra_pins) { - if (folio_test_anon(folio)) { - VM_WARN_ONCE(warns && new_order == 1, - "Cannot split to order-1 folio"); - return new_order != 1; - } else if (new_order) { - if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - !mapping_large_folio_support(folio->mapping)) { - VM_WARN_ONCE(warns, - "Cannot split file folio to non-0 order"); - return false; + struct folio *end_folio = folio_next(folio); + struct folio *new_folio, *next; + int old_order = folio_order(folio); + int ret = 0; + struct deferred_split *ds_queue; + + VM_WARN_ON_ONCE(!mapping && end); + /* Prevent deferred_split_scan() touching ->_refcount */ + ds_queue = folio_split_queue_lock(folio); + if (folio_ref_freeze(folio, 1 + extra_pins)) { + struct swap_cluster_info *ci = NULL; + struct lruvec *lruvec; + int expected_refs; + + if (old_order > 1) { + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. + */ + list_del_init(&folio->_deferred_list); + } + if (folio_test_partially_mapped(folio)) { + folio_clear_partially_mapped(folio); + mod_mthp_stat(old_order, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } } - } + split_queue_unlock(ds_queue); + if (mapping) { + int nr = folio_nr_pages(folio); - if (new_order && folio_test_swapcache(folio)) { - VM_WARN_ONCE(warns, - "Cannot split swapcache folio to non-0 order"); - return false; + if (folio_test_pmd_mappable(folio) && + new_order < HPAGE_PMD_ORDER) { + if (folio_test_swapbacked(folio)) { + lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } + } + } + + if (folio_test_swapcache(folio)) { + if (mapping) { + VM_WARN_ON_ONCE_FOLIO(mapping, folio); + return -EINVAL; + } + + ci = swap_cluster_get_and_lock(folio); + } + + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ + if (do_lru) + lruvec = folio_lruvec_lock(folio); + + ret = __split_unmapped_folio(folio, new_order, split_at, xas, + mapping, split_type); + + /* + * Unfreeze after-split folios and put them back to the right + * list. @folio should be kept frozon until page cache + * entries are updated with all the other after-split folios + * to prevent others seeing stale page cache entries. + * As a result, new_folio starts from the next folio of + * @folio. + */ + for (new_folio = folio_next(folio); new_folio != end_folio; + new_folio = next) { + unsigned long nr_pages = folio_nr_pages(new_folio); + + next = folio_next(new_folio); + + zone_device_private_split_cb(folio, new_folio); + + expected_refs = folio_expected_ref_count(new_folio) + 1; + folio_ref_unfreeze(new_folio, expected_refs); + + if (do_lru) + lru_add_split_folio(folio, new_folio, lruvec, list); + + /* + * Anonymous folio with swap cache. + * NOTE: shmem in swap cache is not supported yet. + */ + if (ci) { + __swap_cache_replace_folio(ci, folio, new_folio); + continue; + } + + /* Anonymous folio without swap cache */ + if (!mapping) + continue; + + /* Add the new folio to the page cache. */ + if (new_folio->index < end) { + __xa_store(&mapping->i_pages, new_folio->index, + new_folio, 0); + continue; + } + + VM_WARN_ON_ONCE(!nr_shmem_dropped); + /* Drop folio beyond EOF: ->index >= end */ + if (shmem_mapping(mapping) && nr_shmem_dropped) + *nr_shmem_dropped += nr_pages; + else if (folio_test_clear_dirty(new_folio)) + folio_account_cleaned( + new_folio, inode_to_wb(mapping->host)); + __filemap_remove_folio(new_folio, NULL); + folio_put_refs(new_folio, nr_pages); + } + + zone_device_private_split_cb(folio, NULL); + /* + * Unfreeze @folio only after all page cache entries, which + * used to point to it, have been updated with new folios. + * Otherwise, a parallel folio_try_get() can grab @folio + * and its caller can see stale page cache entries. + */ + expected_refs = folio_expected_ref_count(folio) + 1; + folio_ref_unfreeze(folio, expected_refs); + + if (do_lru) + unlock_page_lruvec(lruvec); + + if (ci) + swap_cluster_unlock(ci); + } else { + split_queue_unlock(ds_queue); + return -EAGAIN; } - return true; + return ret; } -/* - * __folio_split: split a folio at @split_at to a @new_order folio +/** + * __folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL - * @uniform_split: perform uniform split or not (non-uniform split) + * @split_type: perform uniform split or not (non-uniform split) * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. * It is in charge of checking whether the split is supported or not and @@ -3572,25 +3913,24 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * 1. for uniform split, @lock_at points to one of @folio's subpages; * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. * - * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, - struct list_head *list, bool uniform_split) + struct list_head *list, enum split_type split_type) { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; - int order = folio_order(folio); + int old_order = folio_order(folio); struct folio *new_folio, *next; int nr_shmem_dropped = 0; int remap_flags = 0; int extra_pins, ret; - pgoff_t end; + pgoff_t end = 0; bool is_hzp; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); @@ -3599,14 +3939,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (folio != page_folio(split_at) || folio != page_folio(lock_at)) return -EINVAL; - if (new_order >= folio_order(folio)) - return -EINVAL; + /* + * Folios that just got truncated cannot get split. Signal to the + * caller that there was a race. + * + * TODO: this will also currently refuse shmem folios that are in the + * swapcache. + */ + if (!is_anon && !folio->mapping) + return -EBUSY; - if (uniform_split && !uniform_split_supported(folio, new_order, true)) + if (new_order >= old_order) return -EINVAL; - if (!uniform_split && - !non_uniform_split_supported(folio, new_order, true)) + if (!folio_split_supported(folio, new_order, split_type, /* warn = */ true)) return -EINVAL; is_hzp = is_huge_zero_folio(folio); @@ -3632,29 +3978,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order, ret = -EBUSY; goto out; } - mapping = NULL; anon_vma_lock_write(anon_vma); + mapping = NULL; } else { unsigned int min_order; gfp_t gfp; mapping = folio->mapping; - - /* Truncated ? */ - /* - * TODO: add support for large shmem folio in swap cache. - * When shmem is in swap cache, mapping is NULL and - * folio_test_swapcache() is true. - */ - if (!mapping) { - ret = -EBUSY; - goto out; - } - min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) { - VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", - min_order); ret = -EINVAL; goto out; } @@ -3667,9 +3999,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto out; } - if (uniform_split) { + if (split_type == SPLIT_TYPE_UNIFORM) { xas_set_order(&xas, folio->index, new_order); - xas_split_alloc(&xas, folio, folio_order(folio), gfp); + xas_split_alloc(&xas, folio, old_order, gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -3717,127 +4049,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, } } - /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&ds_queue->split_queue_lock); - if (folio_ref_freeze(folio, 1 + extra_pins)) { - struct swap_cluster_info *ci = NULL; - struct lruvec *lruvec; - int expected_refs; - - if (folio_order(folio) > 1 && - !list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - if (folio_test_partially_mapped(folio)) { - folio_clear_partially_mapped(folio); - mod_mthp_stat(folio_order(folio), - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - /* - * Reinitialize page_deferred_list after removing the - * page from the split_queue, otherwise a subsequent - * split will see list corruption when checking the - * page_deferred_list. - */ - list_del_init(&folio->_deferred_list); - } - spin_unlock(&ds_queue->split_queue_lock); - if (mapping) { - int nr = folio_nr_pages(folio); - - if (folio_test_pmd_mappable(folio) && - new_order < HPAGE_PMD_ORDER) { - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, - NR_SHMEM_THPS, -nr); - } else { - __lruvec_stat_mod_folio(folio, - NR_FILE_THPS, -nr); - filemap_nr_thps_dec(mapping); - } - } - } - - if (folio_test_swapcache(folio)) { - if (mapping) { - VM_WARN_ON_ONCE_FOLIO(mapping, folio); - ret = -EINVAL; - goto fail; - } - - ci = swap_cluster_get_and_lock(folio); - } - - /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ - lruvec = folio_lruvec_lock(folio); - - ret = __split_unmapped_folio(folio, new_order, split_at, &xas, - mapping, uniform_split); - - /* - * Unfreeze after-split folios and put them back to the right - * list. @folio should be kept frozon until page cache - * entries are updated with all the other after-split folios - * to prevent others seeing stale page cache entries. - * As a result, new_folio starts from the next folio of - * @folio. - */ - for (new_folio = folio_next(folio); new_folio != end_folio; - new_folio = next) { - unsigned long nr_pages = folio_nr_pages(new_folio); - - next = folio_next(new_folio); - - expected_refs = folio_expected_ref_count(new_folio) + 1; - folio_ref_unfreeze(new_folio, expected_refs); - - lru_add_split_folio(folio, new_folio, lruvec, list); - - /* - * Anonymous folio with swap cache. - * NOTE: shmem in swap cache is not supported yet. - */ - if (ci) { - __swap_cache_replace_folio(ci, folio, new_folio); - continue; - } - - /* Anonymous folio without swap cache */ - if (!mapping) - continue; - - /* Add the new folio to the page cache. */ - if (new_folio->index < end) { - __xa_store(&mapping->i_pages, new_folio->index, - new_folio, 0); - continue; - } - - /* Drop folio beyond EOF: ->index >= end */ - if (shmem_mapping(mapping)) - nr_shmem_dropped += nr_pages; - else if (folio_test_clear_dirty(new_folio)) - folio_account_cleaned( - new_folio, inode_to_wb(mapping->host)); - __filemap_remove_folio(new_folio, NULL); - folio_put_refs(new_folio, nr_pages); - } - /* - * Unfreeze @folio only after all page cache entries, which - * used to point to it, have been updated with new folios. - * Otherwise, a parallel folio_try_get() can grab @folio - * and its caller can see stale page cache entries. - */ - expected_refs = folio_expected_ref_count(folio) + 1; - folio_ref_unfreeze(folio, expected_refs); - - unlock_page_lruvec(lruvec); - - if (ci) - swap_cluster_unlock(ci); - } else { - spin_unlock(&ds_queue->split_queue_lock); - ret = -EAGAIN; - } + ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping, + true, list, split_type, end, &nr_shmem_dropped, + extra_pins); fail: if (mapping) xas_unlock(&xas); @@ -3847,9 +4061,10 @@ fail: if (nr_shmem_dropped) shmem_uncharge(mapping->host, nr_shmem_dropped); - if (!ret && is_anon) + if (!ret && is_anon && !folio_is_device_private(folio)) remap_flags = RMP_USE_SHARED_ZEROPAGE; - remap_page(folio, 1 << order, remap_flags); + + remap_page(folio, 1 << old_order, remap_flags); /* * Unlock all after-split folios except the one containing @@ -3880,9 +4095,51 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - if (order == HPAGE_PMD_ORDER) + if (old_order == HPAGE_PMD_ORDER) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); - count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); + count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); + return ret; +} + +/** + * folio_split_unmapped() - split a large anon folio that is already unmapped + * @folio: folio to split + * @new_order: the order of folios after split + * + * This function is a helper for splitting folios that have already been + * unmapped. The use case is that the device or the CPU can refuse to migrate + * THP pages in the middle of migration, due to allocation issues on either + * side. + * + * anon_vma_lock is not required to be held, mmap_read_lock() or + * mmap_write_lock() should be held. @folio is expected to be locked by the + * caller. device-private and non device-private folios are supported along + * with folios that are in the swapcache. @folio should also be unmapped and + * isolated from LRU (if applicable) + * + * Upon return, the folio is not remapped, split folios are not added to LRU, + * free_folio_and_swap_cache() is not called, and new folios remain locked. + * + * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to + * insufficient reference count or extra pins). + */ +int folio_split_unmapped(struct folio *folio, unsigned int new_order) +{ + int extra_pins, ret = 0; + + VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio); + + if (!can_split_folio(folio, 1, &extra_pins)) + return -EAGAIN; + + local_irq_disable(); + ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL, + NULL, false, NULL, SPLIT_TYPE_UNIFORM, + 0, NULL, extra_pins); + local_irq_enable(); return ret; } @@ -3933,22 +4190,22 @@ out: * Returns -EINVAL when trying to split to an order that is incompatible * with the folio. Splitting to order 0 is compatible with all folios. */ -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { struct folio *folio = page_folio(page); - return __folio_split(folio, new_order, &folio->page, page, list, true); + return __folio_split(folio, new_order, &folio->page, page, list, + SPLIT_TYPE_UNIFORM); } -/* - * folio_split: split a folio at @split_at to a @new_order folio +/** + * folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio - * - * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be - * split but not to @new_order, the caller needs to check) + * @list: after-split folios are added to @list if not null, otherwise to LRU + * list * * It has the same prerequisites and returns as * split_huge_page_to_list_to_order(). @@ -3962,12 +4219,15 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. * * After split, folio is left locked for caller. + * + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, - false); + SPLIT_TYPE_NON_UNIFORM); } int min_order_for_split(struct folio *folio) @@ -3986,12 +4246,7 @@ int min_order_for_split(struct folio *folio) int split_folio_to_list(struct folio *folio, struct list_head *list) { - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - return split_huge_page_to_list_to_order(&folio->page, list, ret); + return split_huge_page_to_list_to_order(&folio->page, list, 0); } /* @@ -4014,10 +4269,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio) bool unqueued = false; WARN_ON_ONCE(folio_ref_count(folio)); - WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); + WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); - ds_queue = get_deferred_split_queue(folio); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { @@ -4028,7 +4282,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) list_del_init(&folio->_deferred_list); unqueued = true; } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); return unqueued; /* useful for debug warnings */ } @@ -4036,10 +4290,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) /* partially_mapped=false won't clear PG_partially_mapped folio flag */ void deferred_split_folio(struct folio *folio, bool partially_mapped) { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = folio_memcg(folio); -#endif + struct deferred_split *ds_queue; unsigned long flags; /* @@ -4062,7 +4313,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) if (folio_test_swapcache(folio)) return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { folio_set_partially_mapped(folio); @@ -4077,15 +4328,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); } if (list_empty(&folio->_deferred_list)) { + struct mem_cgroup *memcg; + + memcg = folio_split_queue_memcg(folio, ds_queue); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; -#ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), - deferred_split_shrinker->id); -#endif + shrinker_id(deferred_split_shrinker)); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, @@ -4109,6 +4361,9 @@ static bool thp_underused(struct folio *folio) if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) return false; + if (folio_contain_hwpoisoned_page(folio)) + return false; + for (i = 0; i < folio_nr_pages(folio); i++) { if (pages_identical(folio_page(folio, i), ZERO_PAGE(0))) { if (++num_zero_pages > khugepaged_max_ptes_none) @@ -4128,43 +4383,42 @@ static bool thp_underused(struct folio *folio) static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct pglist_data *pgdata = NODE_DATA(sc->nid); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue; unsigned long flags; - LIST_HEAD(list); - struct folio *folio, *next, *prev = NULL; - int split = 0, removed = 0; + struct folio *folio, *next; + int split = 0, i; + struct folio_batch fbatch; -#ifdef CONFIG_MEMCG - if (sc->memcg) - ds_queue = &sc->memcg->deferred_split_queue; -#endif + folio_batch_init(&fbatch); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); +retry: + ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_entry_safe(folio, next, &ds_queue->split_queue, _deferred_list) { if (folio_try_get(folio)) { - list_move(&folio->_deferred_list, &list); - } else { + folio_batch_add(&fbatch, folio); + } else if (folio_test_partially_mapped(folio)) { /* We lost race with folio_put() */ - if (folio_test_partially_mapped(folio)) { - folio_clear_partially_mapped(folio); - mod_mthp_stat(folio_order(folio), - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - list_del_init(&folio->_deferred_list); - ds_queue->split_queue_len--; + folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } + list_del_init(&folio->_deferred_list); + ds_queue->split_queue_len--; if (!--sc->nr_to_scan) break; + if (!folio_batch_space(&fbatch)) + break; } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); - list_for_each_entry_safe(folio, next, &list, _deferred_list) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { bool did_split = false; bool underused = false; + struct deferred_split *fqueue; + folio = fbatch.folios[i]; if (!folio_test_partially_mapped(folio)) { /* * See try_to_map_unused_to_zeropage(): we cannot @@ -4187,38 +4441,27 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } folio_unlock(folio); next: + if (did_split || !folio_test_partially_mapped(folio)) + continue; /* - * split_folio() removes folio from list on success. * Only add back to the queue if folio is partially mapped. * If thp_underused returns false, or if split_folio fails * in the case it was underused, then consider it used and * don't add it back to split_queue. */ - if (did_split) { - ; /* folio already removed from list */ - } else if (!folio_test_partially_mapped(folio)) { - list_del_init(&folio->_deferred_list); - removed++; - } else { - /* - * That unlocked list_del_init() above would be unsafe, - * unless its folio is separated from any earlier folios - * left on the list (which may be concurrently unqueued) - * by one safe folio with refcount still raised. - */ - swap(folio, prev); + fqueue = folio_split_queue_lock_irqsave(folio, &flags); + if (list_empty(&folio->_deferred_list)) { + list_add_tail(&folio->_deferred_list, &fqueue->split_queue); + fqueue->split_queue_len++; } - if (folio) - folio_put(folio); + split_queue_unlock_irqrestore(fqueue, flags); } + folios_put(&fbatch); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - list_splice_tail(&list, &ds_queue->split_queue); - ds_queue->split_queue_len -= removed; - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - - if (prev) - folio_put(prev); + if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) { + cond_resched(); + goto retry; + } /* * Stop shrinker if we didn't split any page, but the queue is empty. @@ -4229,6 +4472,33 @@ next: return split; } +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct deferred_split *ds_queue = &memcg->deferred_split_queue; + struct deferred_split *parent_ds_queue = &parent->deferred_split_queue; + int nid; + + spin_lock_irq(&ds_queue->split_queue_lock); + spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING); + + if (!ds_queue->split_queue_len) + goto unlock; + + list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue); + parent_ds_queue->split_queue_len += ds_queue->split_queue_len; + ds_queue->split_queue_len = 0; + + for_each_node(nid) + set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker)); + +unlock: + spin_unlock(&parent_ds_queue->split_queue_lock); + spin_unlock_irq(&ds_queue->split_queue_lock); +} +#endif + #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { @@ -4590,7 +4860,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, return 0; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); - pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + if (unlikely(!pmd_present(*pvmw->pmd))) + pmdval = pmdp_huge_get_and_clear(vma->vm_mm, address, pvmw->pmd); + else + pmdval = pmdp_invalidate(vma, address, pvmw->pmd); /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); @@ -4632,30 +4905,48 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) unsigned long address = pvmw->address; unsigned long haddr = address & HPAGE_PMD_MASK; pmd_t pmde; - swp_entry_t entry; + softleaf_t entry; if (!(pvmw->pmd && !pvmw->pte)) return; - entry = pmd_to_swp_entry(*pvmw->pmd); + entry = softleaf_from_pmd(*pvmw->pmd); folio_get(folio); pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); + if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_writable_migration_entry(entry)) + if (softleaf_is_migration_write(entry)) pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); - if (!is_migration_entry_young(entry)) + if (!softleaf_is_migration_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ - if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pmde = pmd_mkdirty(pmde); + if (folio_is_device_private(folio)) { + swp_entry_t entry; + + if (pmd_write(pmde)) + entry = make_writable_device_private_entry( + page_to_pfn(new)); + else + entry = make_readable_device_private_entry( + page_to_pfn(new)); + pmde = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*pvmw->pmd)) + pmde = pmd_swp_mksoft_dirty(pmde); + if (pmd_swp_uffd_wp(*pvmw->pmd)) + pmde = pmd_swp_mkuffd_wp(pmde); + } + if (folio_test_anon(folio)) { rmap_t rmap_flags = RMAP_NONE; - if (!is_readable_migration_entry(entry)) + if (!softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); |
