Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	391
1 files changed, 257 insertions, 134 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..74b45e258323 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 			add_mm_counter(mm, i, rss[i]);
 }
 
-/*
- * This function is called to print an error when a bad pte
- * is found. For example, we might have a PFN-mapped pte in
- * a region that doesn't allow it.
- *
- * The calling function must still handle the error.
- */
-static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
-			  pte_t pte, struct page *page)
+static bool is_bad_page_map_ratelimited(void)
 {
-	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
-	p4d_t *p4d = p4d_offset(pgd, addr);
-	pud_t *pud = pud_offset(p4d, addr);
-	pmd_t *pmd = pmd_offset(pud, addr);
-	struct address_space *mapping;
-	pgoff_t index;
 	static unsigned long resume;
 	static unsigned long nr_shown;
 	static unsigned long nr_unshown;
@@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	if (nr_shown == 60) {
 		if (time_before(jiffies, resume)) {
 			nr_unshown++;
-			return;
+			return true;
 		}
 		if (nr_unshown) {
 			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
@@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	}
 	if (nr_shown++ == 0)
 		resume = jiffies + 60 * HZ;
+	return false;
+}
+
+static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long long pgdv, p4dv, pudv, pmdv;
+	p4d_t p4d, *p4dp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pgd_t *pgdp;
+
+	/*
+	 * Although this looks like a fully lockless pgtable walk, it is not:
+	 * see locking requirements for print_bad_page_map().
+	 */
+	pgdp = pgd_offset(mm, addr);
+	pgdv = pgd_val(*pgdp);
+
+	if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
+		pr_alert("pgd:%08llx\n", pgdv);
+		return;
+	}
+
+	p4dp = p4d_offset(pgdp, addr);
+	p4d = p4dp_get(p4dp);
+	p4dv = p4d_val(p4d);
+
+	if (!p4d_present(p4d) || p4d_leaf(p4d)) {
+		pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
+		return;
+	}
+
+	pudp = pud_offset(p4dp, addr);
+	pud = pudp_get(pudp);
+	pudv = pud_val(pud);
+
+	if (!pud_present(pud) || pud_leaf(pud)) {
+		pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
+		return;
+	}
+
+	pmdp = pmd_offset(pudp, addr);
+	pmd = pmdp_get(pmdp);
+	pmdv = pmd_val(pmd);
+
+	/*
+	 * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
+	 * because the table should already be mapped by the caller and
+	 * doing another map would be bad. print_bad_page_map() should
+	 * already take care of printing the PTE.
+	 */
+	pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
+		 p4dv, pudv, pmdv);
+}
+
+/*
+ * This function is called to print an error when a bad page table entry (e.g.,
+ * corrupted page table entry) is found. For example, we might have a
+ * PFN-mapped pte in a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ *
+ * This function must be called during a proper page table walk, as it will
+ * re-walk the page table to dump information: the caller MUST prevent page
+ * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
+ * page table lock.
+ */
+static void print_bad_page_map(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long long entry, struct page *page,
+		enum pgtable_level level)
+{
+	struct address_space *mapping;
+	pgoff_t index;
+
+	if (is_bad_page_map_ratelimited())
+		return;
 
 	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
 	index = linear_page_index(vma, addr);
-	pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
-		 current->comm,
-		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
+	pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
+		 pgtable_level_to_str(level), entry);
+	__print_bad_page_map_pgtable(vma->vm_mm, addr);
 	if (page)
-		dump_page(page, "bad pte");
+		dump_page(page, "bad page map");
 	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
 		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
 	pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
@@ -549,18 +611,39 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	dump_stack();
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 }
+#define print_bad_pte(vma, addr, pte, page) \
+	print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
 
-/*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+/**
+ * __vm_normal_page() - Get the "struct page" associated with a page table entry.
+ * @vma: The VMA mapping the page table entry.
+ * @addr: The address where the page table entry is mapped.
+ * @pfn: The PFN stored in the page table entry.
+ * @special: Whether the page table entry is marked "special".
+ * @level: The page table level for error reporting purposes only.
+ * @entry: The page table entry value for error reporting purposes only.
  *
  * "Special" mappings do not wish to be associated with a "struct page" (either
  * it doesn't exist, or it exists but they don't want to touch it). In this
- * case, NULL is returned here. "Normal" mappings do have a struct page.
+ * case, NULL is returned here. "Normal" mappings do have a struct page and
+ * are ordinarily refcounted.
+ *
+ * Page mappings of the shared zero folios are always considered "special", as
+ * they are not ordinarily refcounted: neither the refcount nor the mapcount
+ * of these folios is adjusted when mapping them into user page tables.
+ * Selected page table walkers (such as GUP) can still identify mappings of the
+ * shared zero folios and work with the underlying "struct page".
  *
- * There are 2 broad cases. Firstly, an architecture may define a pte_special()
- * pte bit, in which case this function is trivial. Secondly, an architecture
- * may not have a spare pte bit, which requires a more complicated scheme,
- * described below.
+ * There are 2 broad cases. Firstly, an architecture may define a "special"
+ * page table entry bit, such as pte_special(), in which case this function is
+ * trivial. Secondly, an architecture may not have a spare page table
+ * entry bit, which requires a more complicated scheme, described below.
+ *
+ * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
+ * page table entries that actually map "normal" pages: however, that page
+ * cannot be looked up through the PFN stored in the page table entry, but
+ * instead will be looked up through vm_ops->find_normal_page(). So far, this
+ * only applies to PTEs.
  *
  * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
  * special mapping (even if there are underlying and valid "struct pages").
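/*
 * Illustrative sketch only, not part of this commit: how a caller would
 * satisfy the locking rules documented for print_bad_page_map() above --
 * page table teardown must be prevented (mmap, VMA or rmap lock held) and
 * the leaf page table lock must be held while reporting. check_one_pte()
 * and its corruption test are made up for illustration; print_bad_pte() is
 * the compatibility macro added by the hunk above.
 */
static void check_one_pte(struct vm_area_struct *vma, pmd_t *pmd,
			  unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *ptep, pte;

	/* Caller is assumed to already hold the mmap, VMA or rmap lock. */
	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!ptep)
		return;

	pte = ptep_get(ptep);
	if (pte_present(pte) && pte_pfn(pte) > highest_memmap_pfn)
		print_bad_pte(vma, addr, pte, NULL);

	pte_unmap_unlock(ptep, ptl);
}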
@@ -585,72 +668,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
  *
  * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
  * page" backing, however the difference is that _all_ pages with a struct
- * page (that is, those where pfn_valid is true) are refcounted and considered
- * normal pages by the VM. The only exception are zeropages, which are
- * *never* refcounted.
+ * page (that is, those where pfn_valid is true, except the shared zero
+ * folios) are refcounted and considered normal pages by the VM.
  *
  * The disadvantage is that pages are refcounted (which can be slower and
  * simply not an option for some PFNMAP users). The advantage is that we
  * don't have to follow the strict linearity rule of PFNMAP mappings in
  * order to support COWable mappings.
  *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ *	   NULL if this is a "special" mapping.
  */
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-			    pte_t pte)
+static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long pfn, bool special,
+		unsigned long long entry, enum pgtable_level level)
 {
-	unsigned long pfn = pte_pfn(pte);
-
-	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
-		if (likely(!pte_special(pte)))
-			goto check_pfn;
-		if (vma->vm_ops && vma->vm_ops->find_special_page)
-			return vma->vm_ops->find_special_page(vma, addr);
-		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
-			return NULL;
-		if (is_zero_pfn(pfn))
-			return NULL;
-
-		print_bad_pte(vma, addr, pte, NULL);
-		return NULL;
-	}
-
-	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
-
-	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-		if (vma->vm_flags & VM_MIXEDMAP) {
-			if (!pfn_valid(pfn))
-				return NULL;
-			if (is_zero_pfn(pfn))
-				return NULL;
-			goto out;
-		} else {
-			unsigned long off;
-			off = (addr - vma->vm_start) >> PAGE_SHIFT;
-			if (pfn == vma->vm_pgoff + off)
+	if (unlikely(special)) {
+#ifdef CONFIG_FIND_NORMAL_PAGE
+		if (vma->vm_ops && vma->vm_ops->find_normal_page)
+			return vma->vm_ops->find_normal_page(vma, addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
+		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
 			return NULL;
-			if (!is_cow_mapping(vma->vm_flags))
+		if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
 			return NULL;
+
+		print_bad_page_map(vma, addr, entry, NULL, level);
+		return NULL;
 		}
-	}
+	/*
+	 * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
+	 * mappings (incl. shared zero folios) are marked accordingly.
+	 */
+	} else {
+		if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
+			if (vma->vm_flags & VM_MIXEDMAP) {
+				/* If it has a "struct page", it's "normal". */
+				if (!pfn_valid(pfn))
+					return NULL;
+			} else {
+				unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
 
-	if (is_zero_pfn(pfn))
-		return NULL;
+				/* Only CoW'ed anon folios are "normal". */
+				if (pfn == vma->vm_pgoff + off)
+					return NULL;
+				if (!is_cow_mapping(vma->vm_flags))
+					return NULL;
+			}
+		}
+
+		if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
+			return NULL;
+	}
 
-check_pfn:
 	if (unlikely(pfn > highest_memmap_pfn)) {
-		print_bad_pte(vma, addr, pte, NULL);
+		/* Corrupted page table entry. */
+		print_bad_page_map(vma, addr, entry, NULL, level);
 		return NULL;
 	}
-
 	/*
 	 * NOTE! We still have PageReserved() pages in the page tables.
-	 * eg. VDSO mappings can cause them to exist.
+	 * For example, VDSO mappings can cause them to exist.
 	 */
-out:
-	VM_WARN_ON_ONCE(is_zero_pfn(pfn));
+	VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
 	return pfn_to_page(pfn);
 }
 
+/**
+ * vm_normal_page() - Get the "struct page" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ *	   NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+			    pte_t pte)
+{
+	return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte),
+				pte_val(pte), PGTABLE_LEVEL_PTE);
+}
+
+/**
+ * vm_normal_folio() - Get the "struct folio" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ *	   NULL if this is a "special" mapping.
+ */
 struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
 			      pte_t pte)
 {
@@ -662,43 +777,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
 }
 
 #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/**
+ * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ *	   NULL if this is a "special" mapping.
+ */
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 				pmd_t pmd)
 {
-	unsigned long pfn = pmd_pfn(pmd);
-
-	/* Currently it's only used for huge pfnmaps */
-	if (unlikely(pmd_special(pmd)))
-		return NULL;
-
-	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-		if (vma->vm_flags & VM_MIXEDMAP) {
-			if (!pfn_valid(pfn))
-				return NULL;
-			goto out;
-		} else {
-			unsigned long off;
-			off = (addr - vma->vm_start) >> PAGE_SHIFT;
-			if (pfn == vma->vm_pgoff + off)
-				return NULL;
-			if (!is_cow_mapping(vma->vm_flags))
-				return NULL;
-		}
-	}
-
-	if (is_huge_zero_pfn(pfn))
-		return NULL;
-	if (unlikely(pfn > highest_memmap_pfn))
-		return NULL;
-
-	/*
-	 * NOTE! We still have PageReserved() pages in the page tables.
-	 * eg. VDSO mappings can cause them to exist.
-	 */
-out:
-	return pfn_to_page(pfn);
+	return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd),
+				pmd_val(pmd), PGTABLE_LEVEL_PMD);
 }
 
+/**
+ * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ *	   NULL if this is a "special" mapping.
+ */
 struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
 				  unsigned long addr, pmd_t pmd)
 {
@@ -708,6 +817,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
 		return page_folio(page);
 	return NULL;
 }
+
+/**
+ * vm_normal_page_pud() - Get the "struct page" associated with a PUD
+ * @vma: The VMA mapping the @pud.
+ * @addr: The address where the @pud is mapped.
+ * @pud: The PUD.
+ *
+ * Get the "struct page" associated with a PUD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ *	   NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page_pud(struct vm_area_struct *vma,
+		unsigned long addr, pud_t pud)
+{
+	return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
+				pud_val(pud), PGTABLE_LEVEL_PUD);
+}
 #endif
 
 /**
@@ -2138,8 +2266,7 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
 			return -EINVAL;
 		return 0;
 	}
-	if (folio_test_anon(folio) || folio_test_slab(folio) ||
-			page_has_type(page))
+	if (folio_test_anon(folio) || page_has_type(page))
 		return -EINVAL;
 	flush_dcache_folio(folio);
 	return 0;
@@ -4387,8 +4514,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
 	 * and suitable for swapping THP.
 	 */
-	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
-			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+					  BIT(PMD_ORDER) - 1);
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
 	orders = thp_swap_suitable_orders(swp_offset(entry),
 					  vmf->address, orders);
@@ -4532,9 +4659,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (unlikely(!si))
 		goto out;
 
-	folio = swap_cache_get_folio(entry, vma, vmf->address);
+	folio = swap_cache_get_folio(entry);
 	if (folio)
-		page = folio_file_page(folio, swp_offset(entry));
+		swap_update_readahead(folio, vma, vmf->address);
 	swapcache = folio;
 
 	if (!folio) {
@@ -4571,7 +4698,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 			memcg1_swapin(entry, nr_pages);
 
-			shadow = get_shadow_from_swap_cache(entry);
+			shadow = swap_cache_get_shadow(entry);
 			if (shadow)
 				workingset_refault(folio, shadow);
 
@@ -4605,20 +4732,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			ret = VM_FAULT_MAJOR;
 			count_vm_event(PGMAJFAULT);
 			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
-			page = folio_file_page(folio, swp_offset(entry));
-		} else if (PageHWPoison(page)) {
-			/*
-			 * hwpoisoned dirty swapcache pages are kept for killing
-			 * owner processes (which may be unknown at hwpoison time)
-			 */
-			ret = VM_FAULT_HWPOISON;
-			goto out_release;
 		}
 
 		ret |= folio_lock_or_retry(folio, vmf);
 		if (ret & VM_FAULT_RETRY)
 			goto out_release;
 
+		page = folio_file_page(folio, swp_offset(entry));
 		if (swapcache) {
 			/*
 			 * Make sure folio_free_swap() or swapoff did not release the
@@ -4627,10 +4747,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			 * swapcache, we need to check that the page's swap has not
 			 * changed.
 			 */
-			if (unlikely(!folio_test_swapcache(folio) ||
-				     page_swap_entry(page).val != entry.val))
+			if (unlikely(!folio_matches_swap_entry(folio, entry)))
 				goto out_page;
 
+			if (unlikely(PageHWPoison(page))) {
+				/*
+				 * hwpoisoned dirty swapcache pages are kept for killing
+				 * owner processes (which may be unknown at hwpoison time)
+				 */
+				ret = VM_FAULT_HWPOISON;
+				goto out_page;
+			}
+
 			/*
 			 * KSM sometimes has to copy on read faults, for example, if
 			 * folio->index of non-ksm folios would be nonlinear inside the
@@ -4935,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	 * for this vma. Then filter out the orders that can't be allocated over
 	 * the faulting address and still be fully contained in the vma.
 	 */
-	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
-			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+					  BIT(PMD_ORDER) - 1);
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
 
 	if (!orders)
@@ -5204,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
 	 * It is too late to allocate a small folio, we already have a large
 	 * folio in the pagecache: especially s390 KVM cannot tolerate any
 	 * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
-	 * PMD mappings if THPs are disabled.
+	 * PMD mappings if THPs are disabled. As we already have a THP,
+	 * behave as if we are forcing a collapse.
 	 */
-	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
+						     /* forced_collapse=*/ true))
 		return ret;
 
 	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
@@ -5386,13 +5516,8 @@ fallback:
 
 	nr_pages = folio_nr_pages(folio);
 
-	/*
-	 * Using per-page fault to maintain the uffd semantics, and same
-	 * approach also applies to non shmem/tmpfs faults to avoid
-	 * inflating the RSS of the process.
-	 */
-	if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
-	    unlikely(needs_fallback)) {
+	/* Using per-page fault to maintain the uffd semantics */
+	if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) {
 		nr_pages = 1;
 	} else if (nr_pages > 1) {
 		pgoff_t idx = folio_page_idx(folio, page);
@@ -6126,8 +6251,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 retry_pud:
 	if (pud_none(*vmf.pud) &&
-	    thp_vma_allowable_order(vma, vm_flags,
-				TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
+	    thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -6161,8 +6285,7 @@ retry_pud:
 		goto retry_pud;
 
 	if (pmd_none(*vmf.pmd) &&
-	    thp_vma_allowable_order(vma, vm_flags,
-				TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+	    thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
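/*
 * For orientation only: enum pgtable_level and pgtable_level_to_str() are
 * referenced by the hunks above but are defined outside mm/memory.c by the
 * same series. The shape below is an assumption made for readability, not
 * the actual definition from that patch set.
 */
enum pgtable_level {
	PGTABLE_LEVEL_PTE,
	PGTABLE_LEVEL_PMD,
	PGTABLE_LEVEL_PUD,
};

static inline const char *pgtable_level_to_str(enum pgtable_level level)
{
	switch (level) {
	case PGTABLE_LEVEL_PTE:
		return "pte";
	case PGTABLE_LEVEL_PMD:
		return "pmd";
	case PGTABLE_LEVEL_PUD:
		return "pud";
	default:
		return "unknown";
	}
}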