summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/cma.c5
-rw-r--r--mm/damon/core.c15
-rw-r--r--mm/damon/stat.c53
-rw-r--r--mm/filemap.c15
-rw-r--r--mm/hmm.c4
-rw-r--r--mm/huge_memory.c16
-rw-r--r--mm/hugetlb.c4
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memfd_luo.c49
-rw-r--r--mm/memory.c3
-rw-r--r--mm/migrate.c8
-rw-r--r--mm/migrate_device.c2
-rw-r--r--mm/rmap.c28
-rw-r--r--mm/slub.c11
-rw-r--r--mm/zswap.c8
15 files changed, 179 insertions, 44 deletions
diff --git a/mm/cma.c b/mm/cma.c
index 94b5da468a7d..15cc0ae76c8e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -1013,6 +1013,7 @@ bool cma_release(struct cma *cma, const struct page *pages,
unsigned long count)
{
struct cma_memrange *cmr;
+ unsigned long ret = 0;
unsigned long i, pfn;
cmr = find_cma_memrange(cma, pages, count);
@@ -1021,7 +1022,9 @@ bool cma_release(struct cma *cma, const struct page *pages,
pfn = page_to_pfn(pages);
for (i = 0; i < count; i++, pfn++)
- VM_WARN_ON(!put_page_testzero(pfn_to_page(pfn)));
+ ret += !put_page_testzero(pfn_to_page(pfn));
+
+ WARN(ret, "%lu pages are still in use!\n", ret);
__cma_release_frozen(cma, cmr, pages, count);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index adfc52fee9dc..3e1890d64d06 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1252,6 +1252,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
{
int err;
+ dst->maybe_corrupted = true;
if (!is_power_of_2(src->min_region_sz))
return -EINVAL;
@@ -1277,6 +1278,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
dst->addr_unit = src->addr_unit;
dst->min_region_sz = src->min_region_sz;
+ dst->maybe_corrupted = false;
return 0;
}
@@ -1562,8 +1564,13 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
}
ctx->walk_control = control;
mutex_unlock(&ctx->walk_control_lock);
- if (!damon_is_running(ctx))
+ if (!damon_is_running(ctx)) {
+ mutex_lock(&ctx->walk_control_lock);
+ if (ctx->walk_control == control)
+ ctx->walk_control = NULL;
+ mutex_unlock(&ctx->walk_control_lock);
return -EINVAL;
+ }
wait_for_completion(&control->completion);
if (control->canceled)
return -ECANCELED;
@@ -2673,6 +2680,8 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
complete(&control->completion);
else if (control->canceled && control->dealloc_on_cancel)
kfree(control);
+ if (!cancel && ctx->maybe_corrupted)
+ break;
}
mutex_lock(&ctx->call_controls_lock);
@@ -2702,6 +2711,8 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
kdamond_usleep(min_wait_time);
kdamond_call(ctx, false);
+ if (ctx->maybe_corrupted)
+ return -EINVAL;
damos_walk_cancel(ctx);
}
return -EBUSY;
@@ -2785,6 +2796,8 @@ static int kdamond_fn(void *data)
* kdamond_merge_regions() if possible, to reduce overhead
*/
kdamond_call(ctx, false);
+ if (ctx->maybe_corrupted)
+ break;
if (!list_empty(&ctx->schemes))
kdamond_apply_schemes(ctx);
else
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index 25fb44ccf99d..cf2c5a541eee 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -145,12 +145,59 @@ static int damon_stat_damon_call_fn(void *data)
return 0;
}
+struct damon_stat_system_ram_range_walk_arg {
+ bool walked;
+ struct resource res;
+};
+
+static int damon_stat_system_ram_walk_fn(struct resource *res, void *arg)
+{
+ struct damon_stat_system_ram_range_walk_arg *a = arg;
+
+ if (!a->walked) {
+ a->walked = true;
+ a->res.start = res->start;
+ }
+ a->res.end = res->end;
+ return 0;
+}
+
+static unsigned long damon_stat_res_to_core_addr(resource_size_t ra,
+ unsigned long addr_unit)
+{
+ /*
+ * Use div_u64() for avoiding linking errors related with __udivdi3,
+ * __aeabi_uldivmod, or similar problems. This should also improve the
+ * performance optimization (read div_u64() comment for the detail).
+ */
+ if (sizeof(ra) == 8 && sizeof(addr_unit) == 4)
+ return div_u64(ra, addr_unit);
+ return ra / addr_unit;
+}
+
+static int damon_stat_set_monitoring_region(struct damon_target *t,
+ unsigned long addr_unit, unsigned long min_region_sz)
+{
+ struct damon_addr_range addr_range;
+ struct damon_stat_system_ram_range_walk_arg arg = {};
+
+ walk_system_ram_res(0, -1, &arg, damon_stat_system_ram_walk_fn);
+ if (!arg.walked)
+ return -EINVAL;
+ addr_range.start = damon_stat_res_to_core_addr(
+ arg.res.start, addr_unit);
+ addr_range.end = damon_stat_res_to_core_addr(
+ arg.res.end + 1, addr_unit);
+ if (addr_range.end <= addr_range.start)
+ return -EINVAL;
+ return damon_set_regions(t, &addr_range, 1, min_region_sz);
+}
+
static struct damon_ctx *damon_stat_build_ctx(void)
{
struct damon_ctx *ctx;
struct damon_attrs attrs;
struct damon_target *target;
- unsigned long start = 0, end = 0;
ctx = damon_new_ctx();
if (!ctx)
@@ -180,8 +227,8 @@ static struct damon_ctx *damon_stat_build_ctx(void)
if (!target)
goto free_out;
damon_add_target(ctx, target);
- if (damon_set_region_biggest_system_ram_default(target, &start, &end,
- ctx->min_region_sz))
+ if (damon_stat_set_monitoring_region(target, ctx->addr_unit,
+ ctx->min_region_sz))
goto free_out;
return ctx;
free_out:
diff --git a/mm/filemap.c b/mm/filemap.c
index 6cd7974d4ada..406cef06b684 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1379,14 +1379,16 @@ repeat:
#ifdef CONFIG_MIGRATION
/**
- * migration_entry_wait_on_locked - Wait for a migration entry to be removed
- * @entry: migration swap entry.
+ * softleaf_entry_wait_on_locked - Wait for a migration entry or
+ * device_private entry to be removed.
+ * @entry: migration or device_private swap entry.
* @ptl: already locked ptl. This function will drop the lock.
*
- * Wait for a migration entry referencing the given page to be removed. This is
+ * Wait for a migration entry referencing the given page, or device_private
+ * entry referencing a dvice_private page to be unlocked. This is
* equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
* this can be called without taking a reference on the page. Instead this
- * should be called while holding the ptl for the migration entry referencing
+ * should be called while holding the ptl for @entry referencing
* the page.
*
* Returns after unlocking the ptl.
@@ -1394,7 +1396,7 @@ repeat:
* This follows the same logic as folio_wait_bit_common() so see the comments
* there.
*/
-void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
+void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
__releases(ptl)
{
struct wait_page_queue wait_page;
@@ -1428,6 +1430,9 @@ void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
* If a migration entry exists for the page the migration path must hold
* a valid reference to the page, and it must take the ptl to remove the
* migration entry. So the page is valid until the ptl is dropped.
+ * Similarly any path attempting to drop the last reference to a
+ * device-private page needs to grab the ptl to remove the device-private
+ * entry.
*/
spin_unlock(ptl);
diff --git a/mm/hmm.c b/mm/hmm.c
index f6c4ddff4bd6..5955f2f0c83d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -778,7 +778,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
struct page *page = hmm_pfn_to_page(pfns[idx]);
phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
size_t offset = idx * map->dma_entry_size;
- unsigned long attrs = 0;
+ unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT;
dma_addr_t dma_addr;
int ret;
@@ -871,7 +871,7 @@ bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
struct dma_iova_state *state = &map->state;
dma_addr_t *dma_addrs = map->dma_list;
unsigned long *pfns = map->pfn_list;
- unsigned long attrs = 0;
+ unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT;
if ((pfns[idx] & valid_dma) != valid_dma)
return false;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8e2746ea74ad..b298cba853ab 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2797,7 +2797,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
} else {
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
+ _dst_pmd = move_soft_dirty_pmd(src_pmdval);
+ _dst_pmd = clear_uffd_wp_pmd(_dst_pmd);
}
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
@@ -3631,6 +3632,7 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
const bool is_anon = folio_test_anon(folio);
int old_order = folio_order(folio);
int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1;
+ struct folio *old_folio = folio;
int split_order;
/*
@@ -3651,12 +3653,16 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
* uniform split has xas_split_alloc() called before
* irq is disabled to allocate enough memory, whereas
* non-uniform split can handle ENOMEM.
+ * Use the to-be-split folio, so that a parallel
+ * folio_try_get() waits on it until xarray is updated
+ * with after-split folios and the original one is
+ * unfrozen.
*/
- if (split_type == SPLIT_TYPE_UNIFORM)
- xas_split(xas, folio, old_order);
- else {
+ if (split_type == SPLIT_TYPE_UNIFORM) {
+ xas_split(xas, old_folio, old_order);
+ } else {
xas_set_order(xas, folio->index, split_order);
- xas_try_split(xas, folio, old_order);
+ xas_try_split(xas, old_folio, old_order);
if (xas_error(xas))
return xas_error(xas);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0beb6e22bc26..327eaa4074d3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3101,7 +3101,7 @@ static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
* extract the actual node first.
*/
if (m)
- listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m)));
+ listnode = early_pfn_to_nid(PHYS_PFN(__pa(m)));
}
if (m) {
@@ -3160,7 +3160,7 @@ found:
* The head struct page is used to get folio information by the HugeTLB
* subsystem like zone id and node id.
*/
- memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
+ memblock_reserved_mark_noinit(__pa((void *)m + PAGE_SIZE),
huge_page_size(h) - PAGE_SIZE);
return 1;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a52da3a5e4fd..772bac21d155 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3086,7 +3086,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
if (!local_trylock(&obj_stock.lock)) {
if (pgdat)
- mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
+ mod_objcg_mlstate(objcg, pgdat, idx, nr_acct);
nr_pages = nr_bytes >> PAGE_SHIFT;
nr_bytes = nr_bytes & (PAGE_SIZE - 1);
atomic_add(nr_bytes, &objcg->nr_charged_bytes);
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
index e485b828d173..b8edb9f981d7 100644
--- a/mm/memfd_luo.c
+++ b/mm/memfd_luo.c
@@ -146,19 +146,56 @@ static int memfd_luo_preserve_folios(struct file *file,
for (i = 0; i < nr_folios; i++) {
struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
struct folio *folio = folios[i];
- unsigned int flags = 0;
err = kho_preserve_folio(folio);
if (err)
goto err_unpreserve;
- if (folio_test_dirty(folio))
- flags |= MEMFD_LUO_FOLIO_DIRTY;
- if (folio_test_uptodate(folio))
- flags |= MEMFD_LUO_FOLIO_UPTODATE;
+ folio_lock(folio);
+
+ /*
+ * A dirty folio is one which has been written to. A clean folio
+ * is its opposite. Since a clean folio does not carry user
+ * data, it can be freed by page reclaim under memory pressure.
+ *
+ * Saving the dirty flag at prepare() time doesn't work since it
+ * can change later. Saving it at freeze() also won't work
+ * because the dirty bit is normally synced at unmap and there
+ * might still be a mapping of the file at freeze().
+ *
+ * To see why this is a problem, say a folio is clean at
+ * preserve, but gets dirtied later. The pfolio flags will mark
+ * it as clean. After retrieve, the next kernel might try to
+ * reclaim this folio under memory pressure, losing user data.
+ *
+ * Unconditionally mark it dirty to avoid this problem. This
+ * comes at the cost of making clean folios un-reclaimable after
+ * live update.
+ */
+ folio_mark_dirty(folio);
+
+ /*
+ * If the folio is not uptodate, it was fallocated but never
+ * used. Saving this flag at prepare() doesn't work since it
+ * might change later when someone uses the folio.
+ *
+ * Since we have taken the performance penalty of allocating,
+ * zeroing, and pinning all the folios in the holes, take a bit
+ * more and zero all non-uptodate folios too.
+ *
+ * NOTE: For someone looking to improve preserve performance,
+ * this is a good place to look.
+ */
+ if (!folio_test_uptodate(folio)) {
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ }
+
+ folio_unlock(folio);
pfolio->pfn = folio_pfn(folio);
- pfolio->flags = flags;
+ pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
pfolio->index = folio->index;
}
diff --git a/mm/memory.c b/mm/memory.c
index 07778814b4a8..2f815a34d924 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4763,7 +4763,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
unlock_page(vmf->page);
put_page(vmf->page);
} else {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ pte_unmap(vmf->pte);
+ softleaf_entry_wait_on_locked(entry, vmf->ptl);
}
} else if (softleaf_is_hwpoison(entry)) {
ret = VM_FAULT_HWPOISON;
diff --git a/mm/migrate.c b/mm/migrate.c
index 1bf2cf8c44dd..2c3d489ecf51 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -500,7 +500,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
if (!softleaf_is_migration(entry))
goto out;
- migration_entry_wait_on_locked(entry, ptl);
+ softleaf_entry_wait_on_locked(entry, ptl);
return;
out:
spin_unlock(ptl);
@@ -532,10 +532,10 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p
* If migration entry existed, safe to release vma lock
* here because the pgtable page won't be freed without the
* pgtable lock released. See comment right above pgtable
- * lock release in migration_entry_wait_on_locked().
+ * lock release in softleaf_entry_wait_on_locked().
*/
hugetlb_vma_unlock_read(vma);
- migration_entry_wait_on_locked(entry, ptl);
+ softleaf_entry_wait_on_locked(entry, ptl);
return;
}
@@ -553,7 +553,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
ptl = pmd_lock(mm, pmd);
if (!pmd_is_migration_entry(*pmd))
goto unlock;
- migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
+ softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
return;
unlock:
spin_unlock(ptl);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 0a8b31939640..8079676c8f1f 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -176,7 +176,7 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
}
if (softleaf_is_migration(entry)) {
- migration_entry_wait_on_locked(entry, ptl);
+ softleaf_entry_wait_on_locked(entry, ptl);
spin_unlock(ptl);
return -EAGAIN;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f00570d1b9e..8f08090d7eb9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -457,6 +457,13 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma)
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
+
+ /*
+ * The anon_vma assigned to this VMA is no longer valid, as we were not
+ * able to correctly clone AVC state. Avoid inconsistent anon_vma tree
+ * state by resetting.
+ */
+ vma->anon_vma = NULL;
}
/**
@@ -1955,7 +1962,14 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
if (userfaultfd_wp(vma))
return 1;
- return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
+ /*
+ * If unmap fails, we need to restore the ptes. To avoid accidentally
+ * upgrading write permissions for ptes that were not originally
+ * writable, and to avoid losing the soft-dirty bit, use the
+ * appropriate FPB flags.
+ */
+ return folio_pte_batch_flags(folio, vma, pvmw->pte, &pte, max_nr,
+ FPB_RESPECT_WRITE | FPB_RESPECT_SOFT_DIRTY);
}
/*
@@ -2443,11 +2457,17 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
__maybe_unused pmd_t pmdval;
if (flags & TTU_SPLIT_HUGE_PMD) {
+ /*
+ * split_huge_pmd_locked() might leave the
+ * folio mapped through PTEs. Retry the walk
+ * so we can detect this scenario and properly
+ * abort the walk.
+ */
split_huge_pmd_locked(vma, pvmw.address,
pvmw.pmd, true);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ flags &= ~TTU_SPLIT_HUGE_PMD;
+ page_vma_mapped_walk_restart(&pvmw);
+ continue;
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
pmdval = pmdp_get(pvmw.pmd);
diff --git a/mm/slub.c b/mm/slub.c
index 20cb4f3b636d..2b2d33cc735c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2119,13 +2119,6 @@ static inline size_t obj_exts_alloc_size(struct kmem_cache *s,
size_t sz = sizeof(struct slabobj_ext) * slab->objects;
struct kmem_cache *obj_exts_cache;
- /*
- * slabobj_ext array for KMALLOC_CGROUP allocations
- * are served from KMALLOC_NORMAL caches.
- */
- if (!mem_alloc_profiling_enabled())
- return sz;
-
if (sz > KMALLOC_MAX_CACHE_SIZE)
return sz;
@@ -2797,6 +2790,7 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
if (s->flags & SLAB_KMALLOC)
mark_obj_codetag_empty(sheaf);
+ VM_WARN_ON_ONCE(sheaf->size > 0);
kfree(sheaf);
stat(s, SHEAF_FREE);
@@ -2828,6 +2822,7 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
return 0;
}
+static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf);
static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
{
@@ -2837,6 +2832,7 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
return NULL;
if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
+ sheaf_flush_unused(s, sheaf);
free_empty_sheaf(s, sheaf);
return NULL;
}
@@ -4623,6 +4619,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
* we must be very low on memory so don't bother
* with the barn
*/
+ sheaf_flush_unused(s, empty);
free_empty_sheaf(s, empty);
}
} else {
diff --git a/mm/zswap.c b/mm/zswap.c
index e6ec3295bdb0..16b2ef7223e1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -942,9 +942,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
/* zswap entries of length PAGE_SIZE are not compressed. */
if (entry->length == PAGE_SIZE) {
+ void *dst;
+
WARN_ON_ONCE(input->length != PAGE_SIZE);
- memcpy_from_sglist(kmap_local_folio(folio, 0), input, 0, PAGE_SIZE);
+
+ dst = kmap_local_folio(folio, 0);
+ memcpy_from_sglist(dst, input, 0, PAGE_SIZE);
dlen = PAGE_SIZE;
+ kunmap_local(dst);
+ flush_dcache_folio(folio);
} else {
sg_init_table(&output, 1);
sg_set_folio(&output, folio, PAGE_SIZE, 0);