Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  407
1 file changed, 250 insertions(+), 157 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 74b45e258323..2a55edc48a65 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -60,11 +60,12 @@
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/shmem_fs.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
@@ -75,13 +76,13 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/pgalloc.h>
+#include <linux/uaccess.h>
#include <trace/events/kmem.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
-#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
@@ -108,7 +109,7 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return false;
- return pte_marker_uffd_wp(vmf->orig_pte);
+ return pte_is_uffd_wp_marker(vmf->orig_pte);
}
/*
@@ -901,7 +902,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
static int try_restore_exclusive_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
- struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte));
+ const softleaf_t entry = softleaf_from_pte(orig_pte);
+ struct page *page = softleaf_to_page(entry);
struct folio *folio = page_folio(page);
if (folio_trylock(folio)) {
@@ -926,12 +928,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
{
vm_flags_t vm_flags = dst_vma->vm_flags;
pte_t orig_pte = ptep_get(src_pte);
+ softleaf_t entry = softleaf_from_pte(orig_pte);
pte_t pte = orig_pte;
struct folio *folio;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(orig_pte);
- if (likely(!non_swap_entry(entry))) {
+ if (likely(softleaf_is_swap(entry))) {
if (swap_duplicate(entry) < 0)
return -EIO;
@@ -949,12 +951,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
set_pte_at(src_mm, addr, src_pte, pte);
}
rss[MM_SWAPENTS]++;
- } else if (is_migration_entry(entry)) {
- folio = pfn_swap_entry_folio(entry);
+ } else if (softleaf_is_migration(entry)) {
+ folio = softleaf_to_folio(entry);
rss[mm_counter(folio)]++;
- if (!is_readable_migration_entry(entry) &&
+ if (!softleaf_is_migration_read(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both parent and child
@@ -963,15 +965,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
entry = make_readable_migration_entry(
swp_offset(entry));
- pte = swp_entry_to_pte(entry);
+ pte = softleaf_to_pte(entry);
if (pte_swp_soft_dirty(orig_pte))
pte = pte_swp_mksoft_dirty(pte);
if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
- } else if (is_device_private_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
+ } else if (softleaf_is_device_private(entry)) {
+ page = softleaf_to_page(entry);
folio = page_folio(page);
/*
@@ -995,7 +997,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* when a device driver is involved (you cannot easily
* save and restore device driver state).
*/
- if (is_writable_device_private_entry(entry) &&
+ if (softleaf_is_device_private_write(entry) &&
is_cow_mapping(vm_flags)) {
entry = make_readable_device_private_entry(
swp_offset(entry));
@@ -1004,7 +1006,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
- } else if (is_device_exclusive_entry(entry)) {
+ } else if (softleaf_is_device_exclusive(entry)) {
/*
* Make device exclusive entries present by restoring the
* original entry then copying as for a present pte. Device
@@ -1015,7 +1017,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
return -EBUSY;
return -ENOENT;
- } else if (is_pte_marker_entry(entry)) {
+ } else if (softleaf_is_marker(entry)) {
pte_marker marker = copy_pte_marker(entry, dst_vma);
if (marker)
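
The hunks above all follow one conversion pattern: the non-present PTE is decoded once into a softleaf_t and then classified with typed predicates, instead of repeatedly going through the swp_entry_t helpers. A minimal sketch of that pattern, using only the softleaf helpers that appear in this diff (the wrapper function itself is hypothetical and not part of the patch):

static const char *nonpresent_pte_kind(pte_t ptent)
{
	/* Decode the software leaf entry once... */
	const softleaf_t entry = softleaf_from_pte(ptent);

	/* ...then dispatch on typed predicates rather than raw swp helpers. */
	if (softleaf_is_swap(entry))
		return "swap";
	if (softleaf_is_migration(entry))
		return "migration";		/* folio via softleaf_to_folio() */
	if (softleaf_is_device_private(entry))
		return "device private";	/* page via softleaf_to_page() */
	if (softleaf_is_device_exclusive(entry))
		return "device exclusive";
	if (softleaf_is_marker(entry))
		return "marker";		/* uffd-wp, guard, poison, ... */
	return "unknown";
}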
@@ -1216,7 +1218,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
spinlock_t *src_ptl, *dst_ptl;
int progress, max_nr, ret = 0;
int rss[NR_MM_COUNTERS];
- swp_entry_t entry = (swp_entry_t){0};
+ softleaf_t entry = softleaf_mk_none();
struct folio *prealloc = NULL;
int nr;
@@ -1280,7 +1282,7 @@ again:
dst_vma, src_vma,
addr, rss);
if (ret == -EIO) {
- entry = pte_to_swp_entry(ptep_get(src_pte));
+ entry = softleaf_from_pte(ptep_get(src_pte));
break;
} else if (ret == -EBUSY) {
break;
@@ -1373,8 +1375,9 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
+ if (pmd_is_huge(*src_pmd)) {
int err;
+
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
addr, dst_vma, src_vma);
@@ -1462,18 +1465,12 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
+ if (src_vma->vm_flags & VM_COPY_ON_FORK)
+ return true;
/*
- * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
- * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable
- * contains uffd-wp protection information, that's something we can't
- * retrieve from page cache, and skip copying will lose those info.
+ * The presence of an anon_vma indicates an anonymous VMA has page
+ * tables which naturally cannot be reconstituted on page fault.
*/
- if (userfaultfd_wp(dst_vma))
- return true;
-
- if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
- return true;
-
if (src_vma->anon_vma)
return true;
@@ -1593,7 +1590,9 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
{
bool was_installed = false;
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
+ if (!uffd_supports_wp_marker())
+ return false;
+
/* Zap on anonymous always means dropping everything */
if (vma_is_anonymous(vma))
return false;
@@ -1610,7 +1609,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
pte++;
addr += PAGE_SIZE;
}
-#endif
+
return was_installed;
}
@@ -1716,14 +1715,14 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
unsigned int max_nr, unsigned long addr,
struct zap_details *details, int *rss, bool *any_skipped)
{
- swp_entry_t entry;
+ softleaf_t entry;
int nr = 1;
*any_skipped = true;
- entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
+ entry = softleaf_from_pte(ptent);
+ if (softleaf_is_device_private(entry) ||
+ softleaf_is_device_exclusive(entry)) {
+ struct page *page = softleaf_to_page(entry);
struct folio *folio = page_folio(page);
if (unlikely(!should_zap_folio(details, folio)))
@@ -1738,7 +1737,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
rss[mm_counter(folio)]--;
folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
- } else if (!non_swap_entry(entry)) {
+ } else if (softleaf_is_swap(entry)) {
/* Genuine swap entries, hence private anon pages */
if (!should_zap_cows(details))
return 1;
@@ -1746,20 +1745,20 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
nr = swap_pte_batch(pte, max_nr, ptent);
rss[MM_SWAPENTS] -= nr;
free_swap_and_cache_nr(entry, nr);
- } else if (is_migration_entry(entry)) {
- struct folio *folio = pfn_swap_entry_folio(entry);
+ } else if (softleaf_is_migration(entry)) {
+ struct folio *folio = softleaf_to_folio(entry);
if (!should_zap_folio(details, folio))
return 1;
rss[mm_counter(folio)]--;
- } else if (pte_marker_entry_uffd_wp(entry)) {
+ } else if (softleaf_is_uffd_wp_marker(entry)) {
/*
* For anon: always drop the marker; for file: only
* drop the marker if explicitly requested.
*/
if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
return 1;
- } else if (is_guard_swp_entry(entry)) {
+ } else if (softleaf_is_guard_marker(entry)) {
/*
* Ordinary zapping should not remove guard PTE
* markers. Only do so if we should remove PTE markers
@@ -1767,7 +1766,8 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
*/
if (!zap_drop_markers(details))
return 1;
- } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
+ } else if (softleaf_is_hwpoison(entry) ||
+ softleaf_is_poison_marker(entry)) {
if (!should_zap_cows(details))
return 1;
} else {
@@ -1920,7 +1920,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
+ if (pmd_is_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false);
else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
@@ -2022,8 +2022,7 @@ void unmap_page_range(struct mmu_gather *tlb,
static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr,
- struct zap_details *details, bool mm_wr_locked)
+ unsigned long end_addr, struct zap_details *details)
{
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
@@ -2069,7 +2068,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
* @tree_end: The maximum index to check
- * @mm_wr_locked: lock flag
*
* Unmap all pages in the vma list.
*
@@ -2084,8 +2082,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
*/
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, unsigned long tree_end,
- bool mm_wr_locked)
+ unsigned long end_addr, unsigned long tree_end)
{
struct mmu_notifier_range range;
struct zap_details details = {
@@ -2101,8 +2098,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
unsigned long start = start_addr;
unsigned long end = end_addr;
hugetlb_zap_begin(vma, &start, &end);
- unmap_single_vma(tlb, vma, start, end, &details,
- mm_wr_locked);
+ unmap_single_vma(tlb, vma, start, end, &details);
hugetlb_zap_end(vma, &details);
vma = mas_find(mas, tree_end - 1);
} while (vma && likely(!xa_is_zero(vma)));
@@ -2138,7 +2134,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb,
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(tlb, vma, address, end, details, false);
+ unmap_single_vma(tlb, vma, address, end, details);
mmu_notifier_invalidate_range_end(&range);
if (is_vm_hugetlb_page(vma)) {
/*
@@ -2899,6 +2895,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
+static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
+ unsigned long end, unsigned long vm_start, unsigned long vm_end,
+ unsigned long pfn, pgoff_t *vm_pgoff_p)
+{
+ /*
+ * There's a horrible special case to handle copy-on-write
+ * behaviour that some programs depend on. We mark the "original"
+ * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+ * See vm_normal_page() for details.
+ */
+ if (is_cow_mapping(vm_flags)) {
+ if (addr != vm_start || end != vm_end)
+ return -EINVAL;
+ *vm_pgoff_p = pfn;
+ }
+
+ return 0;
+}
+
static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
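
The comment in get_remap_pgoff() above defers to vm_normal_page() for how the "original" un-COW'ed pages are recognised. A simplified paraphrase of that check (the helper name is made up for illustration, and this assumes the non-pte_special path): with vm_pgoff set to the base PFN, a page that has not been COW'ed still satisfies the linear pfn/address relationship.

static bool cow_pfnmap_pte_is_original(struct vm_area_struct *vma,
				       unsigned long addr, unsigned long pfn)
{
	/* Offset of this address from the start of the VMA, in pages. */
	const unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

	/*
	 * remap_pfn_range() stored the base PFN in vm_pgoff for CoW
	 * mappings, so an un-COW'ed PTE still maps pfn == vm_pgoff + off;
	 * vm_normal_page() treats such PTEs as having no struct page.
	 */
	return pfn == vma->vm_pgoff + off;
}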
@@ -2911,31 +2926,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
return -EINVAL;
- /*
- * Physically remapped pages are special. Tell the
- * rest of the world about it:
- * VM_IO tells people not to look at these pages
- * (accesses can have side effects).
- * VM_PFNMAP tells the core MM that the base pages are just
- * raw PFN mappings, and do not have a "struct page" associated
- * with them.
- * VM_DONTEXPAND
- * Disable vma merging and expanding with mremap().
- * VM_DONTDUMP
- * Omit vma from core dump, even when VM_IO turned off.
- *
- * There's a horrible special case to handle copy-on-write
- * behaviour that some programs depend on. We mark the "original"
- * un-COW'ed pages by matching them up with "vma->vm_pgoff".
- * See vm_normal_page() for details.
- */
- if (is_cow_mapping(vma->vm_flags)) {
- if (addr != vma->vm_start || end != vma->vm_end)
- return -EINVAL;
- vma->vm_pgoff = pfn;
- }
-
- vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+ VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS);
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -2956,7 +2947,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
* Variant of remap_pfn_range that does not call track_pfn_remap. The caller
* must have pre-validated the caching bits of the pgprot_t.
*/
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
@@ -3001,23 +2992,9 @@ void pfnmap_track_ctx_release(struct kref *ref)
pfnmap_untrack(ctx->pfn, ctx->size);
kfree(ctx);
}
-#endif /* __HAVE_PFNMAP_TRACKING */
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-#ifdef __HAVE_PFNMAP_TRACKING
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
struct pfnmap_track_ctx *ctx = NULL;
int err;
@@ -3053,15 +3030,78 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
return err;
}
+static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_track(vma, addr, pfn, size, prot);
+}
#else
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
}
#endif
+
+void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
+{
+ /*
+ * We set addr=VMA start, end=VMA end here, so this won't fail, but we
+ * check it again on complete and will fail there if specified addr is
+ * invalid.
+ */
+ get_remap_pgoff(desc->vm_flags, desc->start, desc->end,
+ desc->start, desc->end, pfn, &desc->pgoff);
+ desc->vm_flags |= VM_REMAP_FLAGS;
+}
+
+static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size)
+{
+ unsigned long end = addr + PAGE_ALIGN(size);
+ int err;
+
+ err = get_remap_pgoff(vma->vm_flags, addr, end,
+ vma->vm_start, vma->vm_end,
+ pfn, &vma->vm_pgoff);
+ if (err)
+ return err;
+
+ vm_flags_set(vma, VM_REMAP_FLAGS);
+ return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ int err;
+
+ err = remap_pfn_range_prepare_vma(vma, addr, pfn, size);
+ if (err)
+ return err;
+
+ return do_remap_pfn_range(vma, addr, pfn, size, prot);
+}
EXPORT_SYMBOL(remap_pfn_range);
+int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return do_remap_pfn_range(vma, addr, pfn, size, prot);
+}
+
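
The split above lets callers that only have a struct vm_area_desc (an ->mmap_prepare style hook) set up VM_REMAP_FLAGS and vm_pgoff early, and populate the PTEs later once a real VMA exists. A rough driver-side sketch; the hook pairing, example_base_pfn, and the completion callback signature are illustrative assumptions, not something this patch defines:

static unsigned long example_base_pfn;	/* hypothetical: PFN backing the device region */

static int example_mmap_prepare(struct vm_area_desc *desc)
{
	/* Sets VM_REMAP_FLAGS and, for CoW mappings, desc->pgoff = pfn. */
	remap_pfn_range_prepare(desc, example_base_pfn);
	return 0;
}

static int example_mmap_complete(struct file *file, struct vm_area_struct *vma)
{
	/* Now that the VMA exists, actually install the PFN mappings. */
	return remap_pfn_range_complete(vma, vma->vm_start, example_base_pfn,
					vma->vm_end - vma->vm_start,
					vma->vm_page_prot);
}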
/**
* vm_iomap_memory - remap memory to userspace
* @vma: user vma to map to
@@ -4327,7 +4367,7 @@ static inline bool should_try_to_free_swap(struct folio *folio,
* If we want to map a page that's in the swapcache writable, we
* have to detect via the refcount if we're really the exclusive
* user. Try freeing the swapcache to get rid of the swapcache
- * reference only in case it's likely that we'll be the exlusive user.
+ * reference only in case it's likely that we'll be the exclusive user.
*/
return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
folio_ref_count(folio) == (1 + folio_nr_pages(folio));
@@ -4345,7 +4385,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
*
* This should also cover the case where e.g. the pte changed
* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
- * So is_pte_marker() check is not enough to safely drop the pte.
+ * So pte_is_marker() check is not enough to safely drop the pte.
*/
if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
@@ -4379,8 +4419,8 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
{
- swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
- unsigned long marker = pte_marker_get(entry);
+ const softleaf_t entry = softleaf_from_pte(vmf->orig_pte);
+ const pte_marker marker = softleaf_to_marker(entry);
/*
* PTE markers should never be empty. If anything weird happened,
@@ -4397,7 +4437,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
if (marker & PTE_MARKER_GUARD)
return VM_FAULT_SIGSEGV;
- if (pte_marker_entry_uffd_wp(entry))
+ if (softleaf_is_uffd_wp_marker(entry))
return pte_marker_handle_uffd_wp(vmf);
/* This is an unknown pte marker */
@@ -4408,13 +4448,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *folio;
- swp_entry_t entry;
+ softleaf_t entry;
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
if (!folio)
return NULL;
- entry = pte_to_swp_entry(vmf->orig_pte);
+ entry = softleaf_from_pte(vmf->orig_pte);
if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
GFP_KERNEL, entry)) {
folio_put(folio);
@@ -4432,7 +4472,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
{
unsigned long addr;
- swp_entry_t entry;
+ softleaf_t entry;
int idx;
pte_t pte;
@@ -4442,7 +4482,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
return false;
- entry = pte_to_swp_entry(pte);
+ entry = softleaf_from_pte(pte);
if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
return false;
@@ -4488,7 +4528,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
unsigned long orders;
struct folio *folio;
unsigned long addr;
- swp_entry_t entry;
+ softleaf_t entry;
spinlock_t *ptl;
pte_t *pte;
gfp_t gfp;
@@ -4509,7 +4549,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
if (!zswap_never_enabled())
goto fallback;
- entry = pte_to_swp_entry(vmf->orig_pte);
+ entry = softleaf_from_pte(vmf->orig_pte);
/*
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
@@ -4588,7 +4628,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
rmap_t rmap_flags = RMAP_NONE;
bool need_clear_cache = false;
bool exclusive = false;
- swp_entry_t entry;
+ softleaf_t entry;
pte_t pte;
vm_fault_t ret = 0;
void *shadow = NULL;
@@ -4600,15 +4640,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!pte_unmap_same(vmf))
goto out;
- entry = pte_to_swp_entry(vmf->orig_pte);
- if (unlikely(non_swap_entry(entry))) {
- if (is_migration_entry(entry)) {
+ entry = softleaf_from_pte(vmf->orig_pte);
+ if (unlikely(!softleaf_is_swap(entry))) {
+ if (softleaf_is_migration(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
- } else if (is_device_exclusive_entry(entry)) {
- vmf->page = pfn_swap_entry_to_page(entry);
+ } else if (softleaf_is_device_exclusive(entry)) {
+ vmf->page = softleaf_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
- } else if (is_device_private_entry(entry)) {
+ } else if (softleaf_is_device_private(entry)) {
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
/*
* migrate_to_ram is not yet ready to operate
@@ -4619,7 +4659,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out;
}
- vmf->page = pfn_swap_entry_to_page(entry);
+ vmf->page = softleaf_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (unlikely(!vmf->pte ||
@@ -4643,9 +4683,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
} else {
pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- } else if (is_hwpoison_entry(entry)) {
+ } else if (softleaf_is_hwpoison(entry)) {
ret = VM_FAULT_HWPOISON;
- } else if (is_pte_marker_entry(entry)) {
+ } else if (softleaf_is_marker(entry)) {
ret = handle_pte_marker(vmf);
} else {
print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
@@ -5404,7 +5444,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
/**
* set_pte_range - Set a range of PTEs to point to pages in a folio.
- * @vmf: Fault decription.
+ * @vmf: Fault description.
* @folio: The folio that contains @page.
* @page: The first page to create a PTE for.
* @nr: The number of PTEs to create.
@@ -5501,8 +5541,25 @@ fallback:
return ret;
}
+ if (!needs_fallback && vma->vm_file) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t file_end;
+
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+
+ /*
+ * Do not allow mapping PTEs beyond i_size, or a PMD across i_size,
+ * in order to preserve SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs, which has long been
+ * intentionally mapped with PMDs across i_size.
+ */
+ needs_fallback = !shmem_mapping(mapping) &&
+ file_end < folio_next_index(folio);
+ }
+
if (pmd_none(*vmf->pmd)) {
- if (folio_test_pmd_mappable(folio)) {
+ if (!needs_fallback && folio_test_pmd_mappable(folio)) {
ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
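
A worked example of the i_size check added above, with assumed numbers (4 KiB pages, non-shmem mapping):

	/*
	 * Assume i_size_read(mapping->host) == 1 MiB, so
	 *	file_end = DIV_ROUND_UP(1 MiB, 4 KiB) = 256.
	 * A PMD-sized (2 MiB) folio starting at index 0 has
	 *	folio_next_index(folio) == 512.
	 * Since 256 < 512 and the mapping is not shmem, needs_fallback is
	 * set, do_set_pmd() is skipped, and the range is mapped with PTEs,
	 * so touching pages past EOF still raises SIGBUS.
	 */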
@@ -6116,6 +6173,45 @@ split:
}
/*
+ * The page faults may be spurious because of the racy access to the
+ * page table. For example, a non-populated virtual page is accessed
+ * on 2 CPUs simultaneously, thus the page faults are triggered on
+ * both CPUs. However, it's possible that one CPU (say CPU A) cannot
+ * find the reason for the page fault if the other CPU (say CPU B) has
+ * changed the page table before the PTE is checked on CPU A. Most of
+ * the time, the spurious page faults can be ignored safely. However,
+ * if the page fault is for a write access, it's possible that a
+ * stale read-only TLB entry exists in the local CPU and needs to be
+ * flushed on some architectures. This is called spurious page
+ * fault fixing.
+ *
+ * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
+ * by default and used as such on most architectures, while
+ * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and
+ * used as such on most architectures.
+ */
+static void fix_spurious_fault(struct vm_fault *vmf,
+ enum pgtable_level ptlevel)
+{
+ /* Skip spurious TLB flush for retried page fault */
+ if (vmf->flags & FAULT_FLAG_TRIED)
+ return;
+ /*
+ * This is needed only for protection faults but the arch code
+ * is not yet telling us if this is a protection fault or not.
+ * This still avoids useless tlb flushes for .text page faults
+ * with threads.
+ */
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (ptlevel == PGTABLE_LEVEL_PTE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+ vmf->pte);
+ else
+ flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
+ vmf->pmd);
+ }
+}
+/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
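
The new comment above describes the generic fallbacks for the two flush helpers used by fix_spurious_fault(). A sketch of what those defaults amount to (architectures may override both; the authoritative definitions live in include/linux/pgtable.h, so treat this as an assumption): the PTE case falls back to a per-page flush, the PMD case to a no-op.

#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address, ptep) \
	flush_tlb_page(vma, address)
#endif

#ifndef flush_tlb_fix_spurious_fault_pmd
#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0)
#endif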
@@ -6196,23 +6292,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
- vmf->flags & FAULT_FLAG_WRITE)) {
+ vmf->flags & FAULT_FLAG_WRITE))
update_mmu_cache_range(vmf, vmf->vma, vmf->address,
vmf->pte, 1);
- } else {
- /* Skip spurious TLB flush for retried page fault */
- if (vmf->flags & FAULT_FLAG_TRIED)
- goto unlock;
- /*
- * This is needed only for protection faults but the arch code
- * is not yet telling us if this is a protection fault or not.
- * This still avoids useless tlb flushes for .text page faults
- * with threads.
- */
- if (vmf->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
- vmf->pte);
- }
+ else
+ fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -6287,34 +6371,43 @@ retry_pud:
if (pmd_none(*vmf.pmd) &&
thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
- if (!(ret & VM_FAULT_FALLBACK))
+ if (ret & VM_FAULT_FALLBACK)
+ goto fallback;
+ else
return ret;
- } else {
- vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
+ }
- if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(vmf.orig_pmd));
- if (is_pmd_migration_entry(vmf.orig_pmd))
- pmd_migration_entry_wait(mm, vmf.pmd);
- return 0;
- }
- if (pmd_trans_huge(vmf.orig_pmd)) {
- if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&vmf);
+ vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
+ if (pmd_none(vmf.orig_pmd))
+ goto fallback;
- if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
- !pmd_write(vmf.orig_pmd)) {
- ret = wp_huge_pmd(&vmf);
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
- } else {
- huge_pmd_set_accessed(&vmf);
- return 0;
- }
+ if (unlikely(!pmd_present(vmf.orig_pmd))) {
+ if (pmd_is_device_private_entry(vmf.orig_pmd))
+ return do_huge_pmd_device_private(&vmf);
+
+ if (pmd_is_migration_entry(vmf.orig_pmd))
+ pmd_migration_entry_wait(mm, vmf.pmd);
+ return 0;
+ }
+ if (pmd_trans_huge(vmf.orig_pmd)) {
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ return do_huge_pmd_numa_page(&vmf);
+
+ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+ !pmd_write(vmf.orig_pmd)) {
+ ret = wp_huge_pmd(&vmf);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ vmf.ptl = pmd_lock(mm, vmf.pmd);
+ if (!huge_pmd_set_accessed(&vmf))
+ fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
+ spin_unlock(vmf.ptl);
+ return 0;
}
}
+fallback:
return handle_pte_fault(&vmf);
}
@@ -6672,12 +6765,12 @@ retry:
goto out;
p4dp = p4d_offset(pgdp, address);
- p4d = READ_ONCE(*p4dp);
+ p4d = p4dp_get(p4dp);
if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
goto out;
pudp = pud_offset(p4dp, address);
- pud = READ_ONCE(*pudp);
+ pud = pudp_get(pudp);
if (pud_none(pud))
goto out;
if (pud_leaf(pud)) {
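
The last hunk swaps open-coded READ_ONCE() dereferences for the p4dp_get()/pudp_get() accessors. Where an architecture does not override them, the generic versions reduce to the same single lockless read, so the change is about using one consistent accessor rather than changing behaviour. A sketch of the assumed generic form (see include/linux/pgtable.h for the real definitions):

/* Generic accessors; architectures may provide their own definitions. */
static inline p4d_t p4dp_get(p4d_t *p4dp)
{
	return READ_ONCE(*p4dp);
}

static inline pud_t pudp_get(pud_t *pudp)
{
	return READ_ONCE(*pudp);
}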