Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  391
1 file changed, 257 insertions(+), 134 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..74b45e258323 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
add_mm_counter(mm, i, rss[i]);
}
-/*
- * This function is called to print an error when a bad pte
- * is found. For example, we might have a PFN-mapped pte in
- * a region that doesn't allow it.
- *
- * The calling function must still handle the error.
- */
-static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte, struct page *page)
+static bool is_bad_page_map_ratelimited(void)
{
- pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
- p4d_t *p4d = p4d_offset(pgd, addr);
- pud_t *pud = pud_offset(p4d, addr);
- pmd_t *pmd = pmd_offset(pud, addr);
- struct address_space *mapping;
- pgoff_t index;
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
@@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
- return;
+ return true;
}
if (nr_unshown) {
pr_alert("BUG: Bad page map: %lu messages suppressed\n",
@@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
+ return false;
+}
+
+static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long long pgdv, p4dv, pudv, pmdv;
+ p4d_t p4d, *p4dp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pgd_t *pgdp;
+
+ /*
+ * Although this looks like a fully lockless pgtable walk, it is not:
+ * see locking requirements for print_bad_page_map().
+ */
+ pgdp = pgd_offset(mm, addr);
+ pgdv = pgd_val(*pgdp);
+
+ if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
+ pr_alert("pgd:%08llx\n", pgdv);
+ return;
+ }
+
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ p4dv = p4d_val(p4d);
+
+ if (!p4d_present(p4d) || p4d_leaf(p4d)) {
+ pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
+ return;
+ }
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ pudv = pud_val(pud);
+
+ if (!pud_present(pud) || pud_leaf(pud)) {
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
+ return;
+ }
+
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ pmdv = pmd_val(pmd);
+
+ /*
+ * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
+ * because the table should already be mapped by the caller and
+ * doing another map would be bad. print_bad_page_map() should
+ * already take care of printing the PTE.
+ */
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
+ p4dv, pudv, pmdv);
+}
+
+/*
+ * This function is called to print an error when a bad page table entry (e.g.,
+ * corrupted page table entry) is found. For example, we might have a
+ * PFN-mapped pte in a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ *
+ * This function must be called during a proper page table walk, as it will
+ * re-walk the page table to dump information: the caller MUST prevent page
+ * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
+ * page table lock.
+ */
+static void print_bad_page_map(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long long entry, struct page *page,
+ enum pgtable_level level)
+{
+ struct address_space *mapping;
+ pgoff_t index;
+
+ if (is_bad_page_map_ratelimited())
+ return;
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
+ pgtable_level_to_str(level), entry);
+ __print_bad_page_map_pgtable(vma->vm_mm, addr);
if (page)
- dump_page(page, "bad pte");
+ dump_page(page, "bad page map");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
@@ -549,18 +611,39 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
+#define print_bad_pte(vma, addr, pte, page) \
+ print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
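[Editorial sketch, not part of the patch: a hypothetical caller, illustrating the locking
requirements stated above. The walker is assumed to already hold the mmap (or VMA/rmap)
lock; the leaf page table lock is taken via pte_offset_map_lock(), so print_bad_pte()
may legitimately re-walk the tables. Function name and the corruption check are made up
for illustration.]

static void example_report_bad_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *ptep, pte;

	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!ptep)
		return;
	pte = ptep_get(ptep);
	/* Example check only: a PFN beyond the memmap indicates corruption. */
	if (pte_present(pte) && unlikely(pte_pfn(pte) > highest_memmap_pfn))
		print_bad_pte(vma, addr, pte, NULL);	/* leaf PTL is held here */
	pte_unmap_unlock(ptep, ptl);
}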
-/*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+/**
+ * __vm_normal_page() - Get the "struct page" associated with a page table entry.
+ * @vma: The VMA mapping the page table entry.
+ * @addr: The address where the page table entry is mapped.
+ * @pfn: The PFN stored in the page table entry.
+ * @special: Whether the page table entry is marked "special".
+ * @level: The page table level for error reporting purposes only.
+ * @entry: The page table entry value for error reporting purposes only.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
- * case, NULL is returned here. "Normal" mappings do have a struct page.
+ * case, NULL is returned here. "Normal" mappings do have a struct page and
+ * are ordinarily refcounted.
+ *
+ * Page mappings of the shared zero folios are always considered "special", as
+ * they are not ordinarily refcounted: neither the refcount nor the mapcount
+ * of these folios is adjusted when mapping them into user page tables.
+ * Selected page table walkers (such as GUP) can still identify mappings of the
+ * shared zero folios and work with the underlying "struct page".
*
- * There are 2 broad cases. Firstly, an architecture may define a pte_special()
- * pte bit, in which case this function is trivial. Secondly, an architecture
- * may not have a spare pte bit, which requires a more complicated scheme,
- * described below.
+ * There are 2 broad cases. Firstly, an architecture may define a "special"
+ * page table entry bit, such as pte_special(), in which case this function is
+ * trivial. Secondly, an architecture may not have a spare page table
+ * entry bit, which requires a more complicated scheme, described below.
+ *
+ * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
+ * page table entries that actually map "normal" pages: however, that page
+ * cannot be looked up through the PFN stored in the page table entry, but
+ * instead will be looked up through vm_ops->find_normal_page(). So far, this
+ * only applies to PTEs.
*
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
* special mapping (even if there are underlying and valid "struct pages").
@@ -585,72 +668,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
*
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
- * page (that is, those where pfn_valid is true) are refcounted and considered
- * normal pages by the VM. The only exception are zeropages, which are
- * *never* refcounted.
+ * page (that is, those where pfn_valid is true, except the shared zero
+ * folios) are refcounted and considered normal pages by the VM.
*
* The disadvantage is that pages are refcounted (which can be slower and
* simply not an option for some PFNMAP users). The advantage is that we
* don't have to follow the strict linearity rule of PFNMAP mappings in
* order to support COWable mappings.
*
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
*/
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, bool special,
+ unsigned long long entry, enum pgtable_level level)
{
- unsigned long pfn = pte_pfn(pte);
-
if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
- if (likely(!pte_special(pte)))
- goto check_pfn;
- if (vma->vm_ops && vma->vm_ops->find_special_page)
- return vma->vm_ops->find_special_page(vma, addr);
- if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
- return NULL;
- if (is_zero_pfn(pfn))
- return NULL;
-
- print_bad_pte(vma, addr, pte, NULL);
- return NULL;
- }
-
- /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- if (is_zero_pfn(pfn))
- return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
+ if (unlikely(special)) {
+#ifdef CONFIG_FIND_NORMAL_PAGE
+ if (vma->vm_ops && vma->vm_ops->find_normal_page)
+ return vma->vm_ops->find_normal_page(vma, addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
- if (!is_cow_mapping(vma->vm_flags))
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
return NULL;
+
+ print_bad_page_map(vma, addr, entry, NULL, level);
+ return NULL;
}
- }
+ /*
+ * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
+ * mappings (incl. shared zero folios) are marked accordingly.
+ */
+ } else {
+ if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ /* If it has a "struct page", it's "normal". */
+ if (!pfn_valid(pfn))
+ return NULL;
+ } else {
+ unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (is_zero_pfn(pfn))
- return NULL;
+ /* Only CoW'ed anon folios are "normal". */
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
+ return NULL;
+ }
-check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
- print_bad_pte(vma, addr, pte, NULL);
+ /* Corrupted page table entry. */
+ print_bad_page_map(vma, addr, entry, NULL, level);
return NULL;
}
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
+ * For example, VDSO mappings can cause them to exist.
*/
-out:
- VM_WARN_ON_ONCE(is_zero_pfn(pfn));
+ VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
return pfn_to_page(pfn);
}
+/**
+ * vm_normal_page() - Get the "struct page" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte),
+ pte_val(pte), PGTABLE_LEVEL_PTE);
+}
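[Editorial sketch, hypothetical caller not from the patch: under the PTE lock, a walker
resolves the mapped page and converts it to its folio; a NULL return means a "special"
mapping (shared zeropage, raw PFN mapping) that must be skipped. Function name is
illustrative only.]

static struct folio *example_folio_at(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr)
{
	struct folio *folio = NULL;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	struct page *page;

	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!ptep)
		return NULL;
	pte = ptep_get(ptep);
	if (pte_present(pte)) {
		page = vm_normal_page(vma, addr, pte);
		if (page)
			folio = page_folio(page);
	}
	pte_unmap_unlock(ptep, ptl);
	return folio;
}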
+
+/**
+ * vm_normal_folio() - Get the "struct folio" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
@@ -662,43 +777,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
}
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/**
+ * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct page" associated with a PMD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
- unsigned long pfn = pmd_pfn(pmd);
-
- /* Currently it's only used for huge pfnmaps */
- if (unlikely(pmd_special(pmd)))
- return NULL;
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
- return NULL;
- }
- }
-
- if (is_huge_zero_pfn(pfn))
- return NULL;
- if (unlikely(pfn > highest_memmap_pfn))
- return NULL;
-
- /*
- * NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
- */
-out:
- return pfn_to_page(pfn);
+ return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd),
+ pmd_val(pmd), PGTABLE_LEVEL_PMD);
}
+/**
+ * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct folio" associated with a PMD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd)
{
@@ -708,6 +817,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
return page_folio(page);
return NULL;
}
+
+/**
+ * vm_normal_page_pud() - Get the "struct page" associated with a PUD
+ * @vma: The VMA mapping the @pud.
+ * @addr: The address where the @pud is mapped.
+ * @pud: The PUD.
+ *
+ * Get the "struct page" associated with a PUD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t pud)
+{
+ return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
+ pud_val(pud), PGTABLE_LEVEL_PUD);
+}
#endif
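[Editorial sketch, hypothetical and not from the patch: a huge-PUD walker would take the
PUD lock, re-read the entry, and only then resolve the page, getting NULL back for
special huge PFN mappings. Assumes pud_lock()/pudp_get() as used elsewhere in mm;
function name is made up.]

static struct page *example_page_at_pud(struct vm_area_struct *vma,
		pud_t *pudp, unsigned long addr)
{
	spinlock_t *ptl = pud_lock(vma->vm_mm, pudp);
	pud_t pud = pudp_get(pudp);
	struct page *page = NULL;

	if (pud_present(pud) && pud_leaf(pud))
		page = vm_normal_page_pud(vma, addr, pud);
	spin_unlock(ptl);
	return page;
}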
/**
@@ -2138,8 +2266,7 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
return -EINVAL;
return 0;
}
- if (folio_test_anon(folio) || folio_test_slab(folio) ||
- page_has_type(page))
+ if (folio_test_anon(folio) || page_has_type(page))
return -EINVAL;
flush_dcache_folio(folio);
return 0;
@@ -4387,8 +4514,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
orders = thp_swap_suitable_orders(swp_offset(entry),
vmf->address, orders);
@@ -4532,9 +4659,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!si))
goto out;
- folio = swap_cache_get_folio(entry, vma, vmf->address);
+ folio = swap_cache_get_folio(entry);
if (folio)
- page = folio_file_page(folio, swp_offset(entry));
+ swap_update_readahead(folio, vma, vmf->address);
swapcache = folio;
if (!folio) {
@@ -4571,7 +4698,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
memcg1_swapin(entry, nr_pages);
- shadow = get_shadow_from_swap_cache(entry);
+ shadow = swap_cache_get_shadow(entry);
if (shadow)
workingset_refault(folio, shadow);
@@ -4605,20 +4732,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
- page = folio_file_page(folio, swp_offset(entry));
- } else if (PageHWPoison(page)) {
- /*
- * hwpoisoned dirty swapcache pages are kept for killing
- * owner processes (which may be unknown at hwpoison time)
- */
- ret = VM_FAULT_HWPOISON;
- goto out_release;
}
ret |= folio_lock_or_retry(folio, vmf);
if (ret & VM_FAULT_RETRY)
goto out_release;
+ page = folio_file_page(folio, swp_offset(entry));
if (swapcache) {
/*
* Make sure folio_free_swap() or swapoff did not release the
@@ -4627,10 +4747,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* swapcache, we need to check that the page's swap has not
* changed.
*/
- if (unlikely(!folio_test_swapcache(folio) ||
- page_swap_entry(page).val != entry.val))
+ if (unlikely(!folio_matches_swap_entry(folio, entry)))
goto out_page;
+ if (unlikely(PageHWPoison(page))) {
+ /*
+ * hwpoisoned dirty swapcache pages are kept for killing
+ * owner processes (which may be unknown at hwpoison time)
+ */
+ ret = VM_FAULT_HWPOISON;
+ goto out_page;
+ }
+
/*
* KSM sometimes has to copy on read faults, for example, if
* folio->index of non-ksm folios would be nonlinear inside the
@@ -4935,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
* for this vma. Then filter out the orders that can't be allocated over
* the faulting address and still be fully contained in the vma.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
if (!orders)
@@ -5204,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
* It is too late to allocate a small folio, we already have a large
* folio in the pagecache: especially s390 KVM cannot tolerate any
* PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
- * PMD mappings if THPs are disabled.
+ * PMD mappings if THPs are disabled. As we already have a THP,
+ * behave as if we are forcing a collapse.
*/
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
+ /* forced_collapse=*/ true))
return ret;
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
@@ -5386,13 +5516,8 @@ fallback:
nr_pages = folio_nr_pages(folio);
- /*
- * Using per-page fault to maintain the uffd semantics, and same
- * approach also applies to non shmem/tmpfs faults to avoid
- * inflating the RSS of the process.
- */
- if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
- unlikely(needs_fallback)) {
+ /* Using per-page fault to maintain the uffd semantics */
+ if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -6126,8 +6251,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -6161,8 +6285,7 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;