Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--	mm/khugepaged.c	176
1 file changed, 82 insertions, 94 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b486c1d19b2d..7ab2d1a42df3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -39,7 +39,6 @@ enum scan_result {
 	SCAN_PTE_NON_PRESENT,
 	SCAN_PTE_UFFD_WP,
 	SCAN_PTE_MAPPED_HUGEPAGE,
-	SCAN_PAGE_RO,
 	SCAN_LACK_REFERENCED_PAGE,
 	SCAN_PAGE_NULL,
 	SCAN_SCAN_ABORT,
@@ -105,14 +104,6 @@ struct collapse_control {
 };
 
 /**
- * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
- * @slot: hash lookup from mm to mm_slot
- */
-struct khugepaged_mm_slot {
-	struct mm_slot slot;
-};
-
-/**
  * struct khugepaged_scan - cursor for scanning
  * @mm_head: the head of the mm list to scan
  * @mm_slot: the current mm_slot we are scanning
@@ -122,7 +113,7 @@ struct khugepaged_mm_slot {
  */
 struct khugepaged_scan {
 	struct list_head mm_head;
-	struct khugepaged_mm_slot *mm_slot;
+	struct mm_slot *mm_slot;
 	unsigned long address;
 };
 
@@ -385,7 +376,10 @@ int hugepage_madvise(struct vm_area_struct *vma,
 
 int __init khugepaged_init(void)
 {
-	mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
+	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+					  sizeof(struct mm_slot),
+					  __alignof__(struct mm_slot),
+					  0, NULL);
 	if (!mm_slot_cache)
 		return -ENOMEM;
 
@@ -410,7 +404,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
 static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
 {
 	return hpage_collapse_test_exit(mm) ||
-	       test_bit(MMF_DISABLE_THP, &mm->flags);
+	       mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
 }
 
 static bool hugepage_pmd_enabled(void)
@@ -439,21 +433,18 @@ static bool hugepage_pmd_enabled(void)
 
 void __khugepaged_enter(struct mm_struct *mm)
 {
-	struct khugepaged_mm_slot *mm_slot;
 	struct mm_slot *slot;
 	int wakeup;
 
 	/* __khugepaged_exit() must not run from under us */
 	VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
-	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+	if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
 		return;
 
-	mm_slot = mm_slot_alloc(mm_slot_cache);
-	if (!mm_slot)
+	slot = mm_slot_alloc(mm_slot_cache);
+	if (!slot)
 		return;
 
-	slot = &mm_slot->slot;
-
 	spin_lock(&khugepaged_mm_lock);
 	mm_slot_insert(mm_slots_hash, mm, slot);
 	/*
@@ -472,24 +463,21 @@ void __khugepaged_enter(struct mm_struct *mm)
 void khugepaged_enter_vma(struct vm_area_struct *vma,
 			  vm_flags_t vm_flags)
 {
-	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
+	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
 	    hugepage_pmd_enabled()) {
-		if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
-					    PMD_ORDER))
+		if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
 
 void __khugepaged_exit(struct mm_struct *mm)
 {
-	struct khugepaged_mm_slot *mm_slot;
 	struct mm_slot *slot;
 	int free = 0;
 
 	spin_lock(&khugepaged_mm_lock);
 	slot = mm_slot_lookup(mm_slots_hash, mm);
-	mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
-	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+	if (slot && khugepaged_scan.mm_slot != slot) {
 		hash_del(&slot->hash);
 		list_del(&slot->mm_node);
 		free = 1;
@@ -497,10 +485,10 @@ void __khugepaged_exit(struct mm_struct *mm)
 	spin_unlock(&khugepaged_mm_lock);
 
 	if (free) {
-		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
-		mm_slot_free(mm_slot_cache, mm_slot);
+		mm_flags_clear(MMF_VM_HUGEPAGE, mm);
+		mm_slot_free(mm_slot_cache, slot);
 		mmdrop(mm);
-	} else if (mm_slot) {
+	} else if (slot) {
 		/*
 		 * This is required to serialize against
 		 * hpage_collapse_test_exit() (which is guaranteed to run
@@ -549,19 +537,19 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
 }
 
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
-					unsigned long address,
+					unsigned long start_addr,
 					pte_t *pte,
 					struct collapse_control *cc,
 					struct list_head *compound_pagelist)
 {
 	struct page *page = NULL;
 	struct folio *folio = NULL;
+	unsigned long addr = start_addr;
 	pte_t *_pte;
 	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
-	bool writable = false;
 
 	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
-	     _pte++, address += PAGE_SIZE) {
+	     _pte++, addr += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
 		if (pte_none(pteval) || (pte_present(pteval) &&
 				is_zero_pfn(pte_pfn(pteval)))) {
@@ -584,7 +572,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			result = SCAN_PTE_UFFD_WP;
 			goto out;
 		}
-		page = vm_normal_page(vma, address, pteval);
+		page = vm_normal_page(vma, addr, pteval);
 		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
 			result = SCAN_PAGE_NULL;
 			goto out;
@@ -669,28 +657,23 @@ next:
 		 */
 		if (cc->is_khugepaged &&
 		    (pte_young(pteval) || folio_test_young(folio) ||
-		     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
-								     address)))
+		     folio_test_referenced(folio) ||
+		     mmu_notifier_test_young(vma->vm_mm, addr)))
 			referenced++;
-
-		if (pte_write(pteval))
-			writable = true;
 	}
 
-	if (unlikely(!writable)) {
-		result = SCAN_PAGE_RO;
-	} else if (unlikely(cc->is_khugepaged && !referenced)) {
+	if (unlikely(cc->is_khugepaged && !referenced)) {
 		result = SCAN_LACK_REFERENCED_PAGE;
 	} else {
 		result = SCAN_SUCCEED;
 		trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
-						    referenced, writable, result);
+						    referenced, result);
 		return result;
 	}
 out:
 	release_pte_pages(pte, _pte, compound_pagelist);
 	trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
-					    referenced, writable, result);
+					    referenced, result);
 	return result;
 }
 
@@ -921,7 +904,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 				   struct collapse_control *cc)
 {
 	struct vm_area_struct *vma;
-	unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+	enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
+						 TVA_FORCED_COLLAPSE;
 
 	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
 		return SCAN_ANY_PROCESS;
@@ -932,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 
 	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
 		return SCAN_ADDRESS_RANGE;
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
@@ -1003,21 +987,21 @@ static int check_pmd_still_valid(struct mm_struct *mm,
  */
 static int __collapse_huge_page_swapin(struct mm_struct *mm,
 				       struct vm_area_struct *vma,
-				       unsigned long haddr, pmd_t *pmd,
+				       unsigned long start_addr, pmd_t *pmd,
 				       int referenced)
 {
 	int swapped_in = 0;
 	vm_fault_t ret = 0;
-	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+	unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
 	int result;
 	pte_t *pte = NULL;
 	spinlock_t *ptl;
 
-	for (address = haddr; address < end; address += PAGE_SIZE) {
+	for (addr = start_addr; addr < end; addr += PAGE_SIZE) {
 		struct vm_fault vmf = {
 			.vma = vma,
-			.address = address,
-			.pgoff = linear_page_index(vma, address),
+			.address = addr,
+			.pgoff = linear_page_index(vma, addr),
 			.flags = FAULT_FLAG_ALLOW_RETRY,
 			.pmd = pmd,
 		};
@@ -1027,7 +1011,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 		 * Here the ptl is only used to check pte_same() in
 		 * do_swap_page(), so readonly version is enough.
 		 */
-		pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
+		pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
 		if (!pte) {
 			mmap_read_unlock(mm);
 			result = SCAN_PMD_NULL;
@@ -1270,7 +1254,7 @@ out_nolock:
 
 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				   struct vm_area_struct *vma,
-				   unsigned long address, bool *mmap_locked,
+				   unsigned long start_addr, bool *mmap_locked,
 				   struct collapse_control *cc)
 {
 	pmd_t *pmd;
@@ -1279,27 +1263,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	int none_or_zero = 0, shared = 0;
 	struct page *page = NULL;
 	struct folio *folio = NULL;
-	unsigned long _address;
+	unsigned long addr;
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
-	bool writable = false;
 
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
 
-	result = find_pmd_or_thp_or_none(mm, address, &pmd);
+	result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
 	if (result != SCAN_SUCCEED)
 		goto out;
 
 	memset(cc->node_load, 0, sizeof(cc->node_load));
 	nodes_clear(cc->alloc_nmask);
-	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
 	if (!pte) {
 		result = SCAN_PMD_NULL;
 		goto out;
 	}
 
-	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
-	     _pte++, _address += PAGE_SIZE) {
+	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
+	     _pte++, addr += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
 		if (is_swap_pte(pteval)) {
 			++unmapped;
@@ -1346,10 +1329,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			result = SCAN_PTE_UFFD_WP;
 			goto out_unmap;
 		}
-		if (pte_write(pteval))
-			writable = true;
 
-		page = vm_normal_page(vma, _address, pteval);
+		page = vm_normal_page(vma, addr, pteval);
 		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
 			result = SCAN_PAGE_NULL;
 			goto out_unmap;
@@ -1418,12 +1399,10 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		if (cc->is_khugepaged &&
 		    (pte_young(pteval) || folio_test_young(folio) ||
 		     folio_test_referenced(folio) ||
-		     mmu_notifier_test_young(vma->vm_mm, _address)))
+		     mmu_notifier_test_young(vma->vm_mm, addr)))
 			referenced++;
 	}
-	if (!writable) {
-		result = SCAN_PAGE_RO;
-	} else if (cc->is_khugepaged &&
+	if (cc->is_khugepaged &&
 		   (!referenced ||
 		    (unmapped && referenced < HPAGE_PMD_NR / 2))) {
 		result = SCAN_LACK_REFERENCED_PAGE;
@@ -1433,20 +1412,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (result == SCAN_SUCCEED) {
-		result = collapse_huge_page(mm, address, referenced,
+		result = collapse_huge_page(mm, start_addr, referenced,
 					    unmapped, cc);
 		/* collapse_huge_page will return with the mmap_lock released */
 		*mmap_locked = false;
 	}
 out:
-	trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
+	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
 				     none_or_zero, result, unmapped);
 	return result;
 }
 
-static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
+static void collect_mm_slot(struct mm_slot *slot)
 {
-	struct mm_slot *slot = &mm_slot->slot;
 	struct mm_struct *mm = slot->mm;
 
 	lockdep_assert_held(&khugepaged_mm_lock);
@@ -1459,11 +1437,11 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
 		/*
 		 * Not strictly needed because the mm exited already.
		 *
-		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+		 * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
 		 */
 
 		/* khugepaged_mm_lock actually not necessary for the below */
-		mm_slot_free(mm_slot_cache, mm_slot);
+		mm_slot_free(mm_slot_cache, slot);
 		mmdrop(mm);
 	}
 }
@@ -1472,15 +1450,32 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 			pmd_t *pmdp, struct folio *folio, struct page *page)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	struct vm_fault vmf = {
 		.vma = vma,
 		.address = addr,
 		.flags = 0,
-		.pmd = pmdp,
 	};
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+	pud_t *pudp;
 
 	mmap_assert_locked(vma->vm_mm);
 
+	if (!pmdp) {
+		pgdp = pgd_offset(mm, addr);
+		p4dp = p4d_alloc(mm, pgdp, addr);
+		if (!p4dp)
+			return SCAN_FAIL;
+		pudp = pud_alloc(mm, p4dp, addr);
+		if (!pudp)
+			return SCAN_FAIL;
+		pmdp = pmd_alloc(mm, pudp, addr);
+		if (!pmdp)
+			return SCAN_FAIL;
+	}
+
+	vmf.pmd = pmdp;
 	if (do_set_pmd(&vmf, folio, page))
 		return SCAN_FAIL;
 
@@ -1533,9 +1528,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
	 * in the page cache with a single hugepage. If a mm were to fault-in
	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
-	 * analogously elide sysfs THP settings here.
+	 * analogously elide sysfs THP settings here and force collapse.
	 */
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
		return SCAN_VMA_CHECK;
 
	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -1556,6 +1551,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
	switch (result) {
	case SCAN_SUCCEED:
		break;
+	case SCAN_PMD_NULL:
	case SCAN_PMD_NONE:
		/*
		 * All pte entries have been removed and pmd cleared.
@@ -2388,7 +2384,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
	__acquires(&khugepaged_mm_lock)
 {
	struct vma_iterator vmi;
-	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
@@ -2399,14 +2394,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
	*result = SCAN_FAIL;
 
	if (khugepaged_scan.mm_slot) {
-		mm_slot = khugepaged_scan.mm_slot;
-		slot = &mm_slot->slot;
+		slot = khugepaged_scan.mm_slot;
	} else {
-		slot = list_entry(khugepaged_scan.mm_head.next,
+		slot = list_first_entry(&khugepaged_scan.mm_head,
					struct mm_slot, mm_node);
-		mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
		khugepaged_scan.address = 0;
-		khugepaged_scan.mm_slot = mm_slot;
+		khugepaged_scan.mm_slot = slot;
	}
	spin_unlock(&khugepaged_mm_lock);
 
@@ -2432,8 +2425,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
			progress++;
			break;
		}
-		if (!thp_vma_allowable_order(vma, vma->vm_flags,
-					     TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+		if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
 skip:
			progress++;
			continue;
@@ -2505,7 +2497,7 @@ breakouterloop:
 breakouterloop_mmap_lock:
 
	spin_lock(&khugepaged_mm_lock);
-	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+	VM_BUG_ON(khugepaged_scan.mm_slot != slot);
	/*
	 * Release the current mm_slot if this mm is about to die, or
	 * if we scanned all vmas of this mm.
@@ -2516,18 +2508,15 @@ breakouterloop_mmap_lock:
		 * khugepaged runs here, khugepaged_exit will find
		 * mm_slot not pointing to the exiting mm.
		 */
-		if (slot->mm_node.next != &khugepaged_scan.mm_head) {
-			slot = list_entry(slot->mm_node.next,
-					  struct mm_slot, mm_node);
-			khugepaged_scan.mm_slot =
-				mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+		if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
+			khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
			khugepaged_scan.address = 0;
		} else {
			khugepaged_scan.mm_slot = NULL;
			khugepaged_full_scans++;
		}
 
-		collect_mm_slot(mm_slot);
+		collect_mm_slot(slot);
	}
 
	return progress;
@@ -2614,7 +2603,7 @@ static void khugepaged_wait_work(void)
 
 static int khugepaged(void *none)
 {
-	struct khugepaged_mm_slot *mm_slot;
+	struct mm_slot *slot;
 
	set_freezable();
	set_user_nice(current, MAX_NICE);
@@ -2625,10 +2614,10 @@ static int khugepaged(void *none)
	}
 
	spin_lock(&khugepaged_mm_lock);
-	mm_slot = khugepaged_scan.mm_slot;
+	slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
-	if (mm_slot)
-		collect_mm_slot(mm_slot);
+	if (slot)
+		collect_mm_slot(slot);
	spin_unlock(&khugepaged_mm_lock);
	return 0;
 }
@@ -2767,7 +2756,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
	BUG_ON(vma->vm_start > start);
	BUG_ON(vma->vm_end < end);
 
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
		return -EINVAL;
 
	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
@@ -2832,7 +2821,6 @@ handle_result:
		case SCAN_PMD_NULL:
		case SCAN_PTE_NON_PRESENT:
		case SCAN_PTE_UFFD_WP:
-		case SCAN_PAGE_RO:
		case SCAN_LACK_REFERENCED_PAGE:
		case SCAN_PAGE_NULL:
		case SCAN_PAGE_COUNT:
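
A note on the khugepaged_init() hunk: KMEM_CACHE() names the slab cache by stringifying the struct tag, so it cannot survive the removal of struct khugepaged_mm_slot. A sketch of what changed, based on the KMEM_CACHE() definition in include/linux/slab.h:

	/*
	 * Before the patch, KMEM_CACHE(khugepaged_mm_slot, 0) expanded to
	 * roughly:
	 *
	 *	kmem_cache_create("khugepaged_mm_slot",
	 *			  sizeof(struct khugepaged_mm_slot),
	 *			  __alignof__(struct khugepaged_mm_slot),
	 *			  0, NULL);
	 *
	 * With the one-member wrapper gone, the open-coded call keeps the
	 * old "khugepaged_mm_slot" name (still visible in /proc/slabinfo)
	 * while sizing the cache for the generic struct mm_slot:
	 */
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct mm_slot),
					  __alignof__(struct mm_slot),
					  0, NULL);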
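
On the set_huge_pmd() hunk: the function previously required the caller to hand in a valid pmdp; the new NULL branch builds the missing page-table path itself. The walk relies on the *_alloc() helpers returning the existing entry when a level is already present and allocating only when it is not, so the calls (annotated below, same as in the hunk) are safe against a partially populated tree:

	pgdp = pgd_offset(mm, addr);	  /* pgd level always exists; just compute the entry */
	p4dp = p4d_alloc(mm, pgdp, addr); /* returns the existing p4d, or allocates one */
	pudp = pud_alloc(mm, p4dp, addr); /* likewise for the pud level */
	pmdp = pmd_alloc(mm, pudp, addr); /* and for the pmd that do_set_pmd() populates */

Each *_alloc() returns NULL only on allocation failure, hence the SCAN_FAIL checks in the hunk.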
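
On the khugepaged_scan_mm_slot() hunks: the cursor advance over the mm list now uses the list.h helpers instead of open-coded list_entry() arithmetic. A side-by-side sketch of the two forms ("next" is a hypothetical local used only for illustration):

	struct mm_slot *next;

	/* Before: test and advance through the raw list_head. */
	if (slot->mm_node.next != &khugepaged_scan.mm_head)
		next = list_entry(slot->mm_node.next, struct mm_slot, mm_node);

	/* After: list_is_last()/list_next_entry() express the same thing directly. */
	if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head))
		next = list_next_entry(slot, mm_node);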