Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 72
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/backing-dev.c | 7
-rw-r--r--  mm/cma.c | 41
-rw-r--r--  mm/compaction.c | 2
-rw-r--r--  mm/damon/Kconfig | 2
-rw-r--r--  mm/damon/core.c | 125
-rw-r--r--  mm/damon/lru_sort.c | 56
-rw-r--r--  mm/damon/ops-common.c | 11
-rw-r--r--  mm/damon/ops-common.h | 2
-rw-r--r--  mm/damon/paddr.c | 130
-rw-r--r--  mm/damon/reclaim.c | 54
-rw-r--r--  mm/damon/stat.c | 26
-rw-r--r--  mm/damon/sysfs.c | 85
-rw-r--r--  mm/damon/tests/core-kunit.h | 38
-rw-r--r--  mm/damon/tests/vaddr-kunit.h | 2
-rw-r--r--  mm/damon/vaddr.c | 105
-rw-r--r--  mm/debug.c | 4
-rw-r--r--  mm/execmem.c | 3
-rw-r--r--  mm/filemap.c | 89
-rw-r--r--  mm/gup.c | 140
-rw-r--r--  mm/highmem.c | 10
-rw-r--r--  mm/hmm.c | 70
-rw-r--r--  mm/huge_memory.c | 246
-rw-r--r--  mm/hugetlb.c | 193
-rw-r--r--  mm/hugetlb_cma.c | 3
-rw-r--r--  mm/hugetlb_cma.h | 6
-rw-r--r--  mm/hwpoison-inject.c | 91
-rw-r--r--  mm/internal.h | 26
-rw-r--r--  mm/kasan/common.c | 22
-rw-r--r--  mm/kasan/generic.c | 19
-rw-r--r--  mm/kasan/hw_tags.c | 54
-rw-r--r--  mm/kasan/init.c | 16
-rw-r--r--  mm/kasan/kasan.h | 15
-rw-r--r--  mm/kasan/kasan_test_c.c | 247
-rw-r--r--  mm/kasan/shadow.c | 65
-rw-r--r--  mm/kasan/sw_tags.c | 1
-rw-r--r--  mm/kasan/tags.c | 2
-rw-r--r--  mm/kfence/core.c | 12
-rw-r--r--  mm/khugepaged.c | 178
-rw-r--r--  mm/kmemleak.c | 27
-rw-r--r--  mm/kmsan/core.c | 10
-rw-r--r--  mm/kmsan/kmsan_test.c | 16
-rw-r--r--  mm/ksm.c | 67
-rw-r--r--  mm/memblock.c | 19
-rw-r--r--  mm/memcontrol-v1.c | 8
-rw-r--r--  mm/memcontrol.c | 48
-rw-r--r--  mm/memfd.c | 4
-rw-r--r--  mm/memory-failure.c | 142
-rw-r--r--  mm/memory-tiers.c | 12
-rw-r--r--  mm/memory.c | 391
-rw-r--r--  mm/memory_hotplug.c | 14
-rw-r--r--  mm/memremap.c | 25
-rw-r--r--  mm/migrate.c | 113
-rw-r--r--  mm/migrate_device.c | 2
-rw-r--r--  mm/mincore.c | 70
-rw-r--r--  mm/mlock.c | 6
-rw-r--r--  mm/mm_init.c | 15
-rw-r--r--  mm/mmap.c | 10
-rw-r--r--  mm/mmap_lock.c | 109
-rw-r--r--  mm/mmu_gather.c | 4
-rw-r--r--  mm/mmzone.c | 4
-rw-r--r--  mm/mremap.c | 9
-rw-r--r--  mm/nommu.c | 17
-rw-r--r--  mm/numa_emulation.c | 4
-rw-r--r--  mm/numa_memblks.c | 6
-rw-r--r--  mm/oom_kill.c | 52
-rw-r--r--  mm/page-writeback.c | 46
-rw-r--r--  mm/page_alloc.c | 212
-rw-r--r--  mm/page_io.c | 12
-rw-r--r--  mm/page_vma_mapped.c | 1
-rw-r--r--  mm/pagewalk.c | 58
-rw-r--r--  mm/percpu-km.c | 2
-rw-r--r--  mm/percpu.c | 26
-rw-r--r--  mm/readahead.c | 8
-rw-r--r--  mm/rmap.c | 235
-rw-r--r--  mm/shmem.c | 141
-rw-r--r--  mm/show_mem.c | 17
-rw-r--r--  mm/slab.h | 26
-rw-r--r--  mm/slab_common.c | 37
-rw-r--r--  mm/slub.c | 2491
-rw-r--r--  mm/sparse-vmemmap.c | 11
-rw-r--r--  mm/sparse.c | 21
-rw-r--r--  mm/swap.c | 63
-rw-r--r--  mm/swap.h | 321
-rw-r--r--  mm/swap_state.c | 488
-rw-r--r--  mm/swap_table.h | 130
-rw-r--r--  mm/swapfile.c | 566
-rw-r--r--  mm/userfaultfd.c | 236
-rw-r--r--  mm/util.c | 110
-rw-r--r--  mm/vma.c | 5
-rw-r--r--  mm/vma.h | 30
-rw-r--r--  mm/vma_init.c | 3
-rw-r--r--  mm/vmalloc.c | 47
-rw-r--r--  mm/vmscan.c | 67
-rw-r--r--  mm/vmstat.c | 4
-rw-r--r--  mm/workingset.c | 2
-rw-r--r--  mm/zpdesc.h | 14
-rw-r--r--  mm/zpool.c | 328
-rw-r--r--  mm/zsmalloc.c | 83
-rw-r--r--  mm/zswap.c | 269
101 files changed, 6511 insertions, 3074 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..0e26f4fc8717 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -9,9 +9,6 @@ menu "Memory Management options"
config ARCH_NO_SWAP
bool
-config ZPOOL
- bool
-
menuconfig SWAP
bool "Support for paging of anonymous memory (swap)"
depends on MMU && BLOCK && !ARCH_NO_SWAP
@@ -26,7 +23,7 @@ config ZSWAP
bool "Compressed cache for swap pages"
depends on SWAP
select CRYPTO
- select ZPOOL
+ select ZSMALLOC
help
A lightweight compressed cache for swap pages. It takes
pages that are in the process of being swapped out and attempts to
@@ -125,45 +122,18 @@ config ZSWAP_COMPRESSOR_DEFAULT
default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
default ""
-choice
- prompt "Default allocator"
- depends on ZSWAP
- default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU
- help
- Selects the default allocator for the compressed cache for
- swap pages.
- The default is 'zbud' for compatibility, however please do
- read the description of each of the allocators below before
- making a right choice.
+config ZSMALLOC
+ tristate
- The selection made here can be overridden by using the kernel
- command line 'zswap.zpool=' option.
+if ZSMALLOC
-config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
- bool "zsmalloc"
- select ZSMALLOC
- help
- Use the zsmalloc allocator as the default allocator.
-endchoice
+menu "Zsmalloc allocator options"
+ depends on ZSMALLOC
-config ZSWAP_ZPOOL_DEFAULT
- string
- depends on ZSWAP
- default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
- default ""
-
-config ZSMALLOC
- tristate
- prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
- depends on MMU
- help
- zsmalloc is a slab-based memory allocator designed to store
- pages of various compression levels efficiently. It achieves
- the highest storage density with the least amount of fragmentation.
+comment "Zsmalloc is a common backend allocator for zswap & zram"
config ZSMALLOC_STAT
bool "Export zsmalloc statistics"
- depends on ZSMALLOC
select DEBUG_FS
help
This option enables code in the zsmalloc to collect various
@@ -175,7 +145,6 @@ config ZSMALLOC_CHAIN_SIZE
int "Maximum number of physical pages per-zspage"
default 8
range 4 16
- depends on ZSMALLOC
help
This option sets the upper limit on the number of physical pages
that a zmalloc page (zspage) can consist of. The optimal zspage
@@ -190,10 +159,15 @@ config ZSMALLOC_CHAIN_SIZE
For more information, see zsmalloc documentation.
+endmenu
+
+endif
+
menu "Slab allocator options"
config SLUB
def_bool y
+ select IRQ_WORK
config KVFREE_RCU_BATCHED
def_bool y
@@ -439,9 +413,8 @@ config SPARSEMEM_VMEMMAP_ENABLE
bool
config SPARSEMEM_VMEMMAP
- bool "Sparse Memory virtual memmap"
+ def_bool y
depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
- default y
help
SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
pfn_to_page and page_to_pfn operations. This is the most
@@ -776,7 +749,6 @@ config MEMORY_FAILURE
depends on MMU
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
- select MEMORY_ISOLATION
select RAS
help
Enables code to recover from some memory failures on systems
@@ -823,6 +795,22 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config PERSISTENT_HUGE_ZERO_FOLIO
+ bool "Allocate a PMD sized folio for zeroing"
+ depends on TRANSPARENT_HUGEPAGE
+ help
+ Enable this option to reduce the runtime refcounting overhead
+ of the huge zero folio and expand the places in the kernel
+ that can use huge zero folios. For instance, block I/O benefits
+ from access to large folios for zeroing memory.
+
+ With this option enabled, the huge zero folio is allocated
+ once and never freed. One full huge page's worth of memory shall
+ be used.
+
+ Say Y if your system has lots of memory. Say N if you are
+ memory constrained.
+
config MM_ID
def_bool n
@@ -1381,6 +1369,8 @@ config PT_RECLAIM
Note: now only empty user PTE page table pages will be reclaimed.
+config FIND_NORMAL_PAGE
+ def_bool n
source "mm/damon/Kconfig"
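As a worked example of the memory cost mentioned in the PERSISTENT_HUGE_ZERO_FOLIO help text above (the numbers are assumptions for a typical x86-64 configuration, not taken from the patch): the pinned cost is one PMD-sized folio, i.e. 512 base pages of 4 KiB. A minimal user-space sketch of the arithmetic:

#include <stdio.h>

int main(void)
{
    unsigned long base_page = 4096;     /* assumed 4 KiB base page */
    unsigned long ptes_per_pmd = 512;   /* assumed x86-64 PMD fan-out */

    /* one huge zero folio, pinned for the lifetime of the system */
    printf("%lu MiB\n", base_page * ptes_per_pmd >> 20);   /* prints 2 MiB */
    return 0;
}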
diff --git a/mm/Makefile b/mm/Makefile
index ef54aa615d9d..21abb3353550 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -115,7 +115,6 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
-obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..128b525b8811 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work)
/*
* Initial write bandwidth: 100 MB/s
*/
-#define INIT_BW (100 << (20 - PAGE_SHIFT))
+#define INIT_BW MB_TO_PAGES(100)
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
gfp_t gfp)
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
wb_exit(wb);
bdi_put(bdi);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
+ WARN_ON_ONCE(work_pending(&wb->switch_work));
call_rcu(&wb->rcu, cgwb_free_rcu);
}
@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css;
INIT_LIST_HEAD(&wb->b_attached);
+ INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
+ init_llist_head(&wb->switch_wbs_ctxs);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
+ INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
+ init_llist_head(&bdi->wb.switch_wbs_ctxs);
}
return ret;
}
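The INIT_BW hunk above replaces the open-coded 100 << (20 - PAGE_SHIFT) with the MB_TO_PAGES() helper; both express 100 MB as a page count. A minimal user-space sketch of the arithmetic (the local macro here only mirrors the old open-coded form and assumes 4 KiB pages; it is not the kernel's definition):

#include <stdio.h>

#define PAGE_SHIFT 12                                   /* assumed 4 KiB pages */
#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT))     /* 1 MB = 2^20 bytes */

int main(void)
{
    /* INIT_BW: 100 MB/s expressed as pages per second */
    printf("%d pages/s\n", MB_TO_PAGES(100));           /* 25600 with 4 KiB pages */
    return 0;
}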
diff --git a/mm/cma.c b/mm/cma.c
index 2ffa4befb99a..813e6dc7b095 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -780,10 +780,8 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
unsigned long count, unsigned int align,
struct page **pagep, gfp_t gfp)
{
- unsigned long mask, offset;
- unsigned long pfn = -1;
- unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+ unsigned long start, pfn, mask, offset;
int ret = -EBUSY;
struct page *page = NULL;
@@ -795,7 +793,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
if (bitmap_count > bitmap_maxno)
goto out;
- for (;;) {
+ for (start = 0; ; start = bitmap_no + mask + 1) {
spin_lock_irq(&cma->lock);
/*
* If the request is larger than the available number
@@ -812,6 +810,22 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
spin_unlock_irq(&cma->lock);
break;
}
+
+ pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
+ page = pfn_to_page(pfn);
+
+ /*
+ * Do not hand out page ranges that are not contiguous, so
+ * callers can just iterate the pages without having to worry
+ * about these corner cases.
+ */
+ if (!page_range_contiguous(page, count)) {
+ spin_unlock_irq(&cma->lock);
+ pr_warn_ratelimited("%s: %s: skipping incompatible area [0x%lx-0x%lx]",
+ __func__, cma->name, pfn, pfn + count - 1);
+ continue;
+ }
+
bitmap_set(cmr->bitmap, bitmap_no, bitmap_count);
cma->available_count -= count;
/*
@@ -821,29 +835,24 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
*/
spin_unlock_irq(&cma->lock);
- pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma->alloc_mutex);
ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp);
mutex_unlock(&cma->alloc_mutex);
- if (ret == 0) {
- page = pfn_to_page(pfn);
+ if (!ret)
break;
- }
cma_clear_bitmap(cma, cmr, pfn, count);
if (ret != -EBUSY)
break;
pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n",
- __func__, pfn, pfn_to_page(pfn));
+ __func__, pfn, page);
- trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
- count, align);
- /* try again with a bit different memory target */
- start = bitmap_no + mask + 1;
+ trace_cma_alloc_busy_retry(cma->name, pfn, page, count, align);
}
out:
- *pagep = page;
+ if (!ret)
+ *pagep = page;
return ret;
}
@@ -864,7 +873,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
if (!count)
return page;
- trace_cma_alloc_start(name, count, align);
+ trace_cma_alloc_start(name, count, cma->available_count, cma->count, align);
for (r = 0; r < cma->nranges; r++) {
page = NULL;
@@ -882,7 +891,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
*/
if (page) {
for (i = 0; i < count; i++)
- page_kasan_tag_reset(nth_page(page, i));
+ page_kasan_tag_reset(page + i);
}
if (ret && !(gfp & __GFP_NOWARN)) {
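Two things in the cma.c hunks above are easy to miss: candidate ranges are now validated with page_range_contiguous() before the bitmap is committed (which appears to be why the later hunk can use page + i instead of nth_page()), and the retry loop restarts its bitmap search just past the failed candidate (start = bitmap_no + mask + 1) rather than rescanning from zero. A rough user-space sketch of that restart pattern, using simplified stand-ins rather than kernel APIs:

#include <stdbool.h>
#include <stdio.h>

#define NBITS 64

static bool bit_is_set(const unsigned long *map, unsigned i)
{
    return *map >> i & 1;
}

/* simplified stand-in for bitmap_find_next_zero_area(): first run of
 * 'count' clear bits at or after 'start', or -1 if none */
static int find_zero_area(const unsigned long *map, unsigned start, unsigned count)
{
    for (unsigned i = start; i + count <= NBITS; i++) {
        unsigned j;

        for (j = 0; j < count && !bit_is_set(map, i + j); j++)
            ;
        if (j == count)
            return i;
    }
    return -1;
}

/* stand-in for "contiguous and not -EBUSY": pretend the run at bit 8 is bad */
static bool range_usable(unsigned bit)
{
    return bit != 8;
}

int main(void)
{
    unsigned long map = 0xffUL;     /* bits 0-7 already allocated */
    unsigned count = 4;
    int pos;

    for (unsigned start = 0; ; start = pos + count) {   /* retry past the bad spot */
        pos = find_zero_area(&map, start, count);
        if (pos < 0) {
            puts("no space");
            return 1;
        }
        if (range_usable(pos))
            break;
    }
    printf("allocated at bit %d\n", pos);               /* prints 12 */
    return 0;
}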
diff --git a/mm/compaction.c b/mm/compaction.c
index bf021b31c7ec..1e8f8eca318c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -989,7 +989,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
- low_pfn += folio_nr_pages(folio) - 1;
+ low_pfn += folio_nr_pages(folio) - folio_page_idx(folio, page) - 1;
goto isolate_success_no_list;
}
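The one-line compaction fix above matters when the page that triggered hugepage isolation is not the head page of its folio: advancing by folio_nr_pages() - 1 from the middle of the folio would overshoot into the next one. A small user-space check with made-up numbers:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    unsigned long nr_pages = 512;       /* folio_nr_pages(folio) */
    unsigned long page_idx = 100;       /* folio_page_idx(folio, page) */
    unsigned long low_pfn = 1000;       /* pfn currently being scanned */
    unsigned long folio_start_pfn = low_pfn - page_idx;

    /* new advance: skip the remainder of the folio, minus one for the
     * scan loop's own low_pfn++ */
    low_pfn += nr_pages - page_idx - 1;
    assert(low_pfn + 1 == folio_start_pfn + nr_pages);
    printf("next pfn scanned: %lu\n", low_pfn + 1);     /* prints 1412 */
    return 0;
}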
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index b3171f9406c1..8c868f7035fc 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -104,7 +104,7 @@ config DAMON_STAT
config DAMON_STAT_ENABLED_DEFAULT
bool "Enable DAMON_STAT by default"
- depends on DAMON_PADDR
+ depends on DAMON_STAT
default DAMON_STAT
help
Whether to enable DAMON_STAT by default. Users can disable it in
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 70eff5cbe6ee..93848b4c6944 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* @t: the given target.
* @ranges: array of new monitoring target ranges.
* @nr_ranges: length of @ranges.
+ * @min_sz_region: minimum region size.
*
* This function adds new regions to, or modify existing regions of a
* monitoring target to fit in specific ranges.
@@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* Return: 0 if success, or negative error code otherwise.
*/
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges)
+ unsigned int nr_ranges, unsigned long min_sz_region)
{
struct damon_region *r, *next;
unsigned int i;
@@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
/* no region intersects with this range */
newr = damon_new_region(
ALIGN_DOWN(range->start,
- DAMON_MIN_REGION),
- ALIGN(range->end, DAMON_MIN_REGION));
+ min_sz_region),
+ ALIGN(range->end, min_sz_region));
if (!newr)
return -ENOMEM;
damon_insert_region(newr, damon_prev_region(r), r, t);
} else {
/* resize intersecting regions to fit in this range */
first->ar.start = ALIGN_DOWN(range->start,
- DAMON_MIN_REGION);
- last->ar.end = ALIGN(range->end, DAMON_MIN_REGION);
+ min_sz_region);
+ last->ar.end = ALIGN(range->end, min_sz_region);
/* fill possible holes in the range */
err = damon_fill_regions_holes(first, last, t);
@@ -544,6 +545,9 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.min_nr_regions = 10;
ctx->attrs.max_nr_regions = 1000;
+ ctx->addr_unit = 1;
+ ctx->min_sz_region = DAMON_MIN_REGION;
+
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
@@ -570,6 +574,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
kfree(ctx);
}
+static bool damon_attrs_equals(const struct damon_attrs *attrs1,
+ const struct damon_attrs *attrs2)
+{
+ const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal;
+ const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal;
+
+ return attrs1->sample_interval == attrs2->sample_interval &&
+ attrs1->aggr_interval == attrs2->aggr_interval &&
+ attrs1->ops_update_interval == attrs2->ops_update_interval &&
+ attrs1->min_nr_regions == attrs2->min_nr_regions &&
+ attrs1->max_nr_regions == attrs2->max_nr_regions &&
+ ig1->access_bp == ig2->access_bp &&
+ ig1->aggrs == ig2->aggrs &&
+ ig1->min_sample_us == ig2->min_sample_us &&
+ ig1->max_sample_us == ig2->max_sample_us;
+}
+
static unsigned int damon_age_for_new_attrs(unsigned int age,
struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
{
@@ -1108,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx)
*
* If @src has no region, @dst keeps current regions.
*/
-static int damon_commit_target_regions(
- struct damon_target *dst, struct damon_target *src)
+static int damon_commit_target_regions(struct damon_target *dst,
+ struct damon_target *src, unsigned long src_min_sz_region)
{
struct damon_region *src_region;
struct damon_addr_range *ranges;
@@ -1126,18 +1147,19 @@ static int damon_commit_target_regions(
i = 0;
damon_for_each_region(src_region, src)
ranges[i++] = src_region->ar;
- err = damon_set_regions(dst, ranges, i);
+ err = damon_set_regions(dst, ranges, i, src_min_sz_region);
kfree(ranges);
return err;
}
static int damon_commit_target(
struct damon_target *dst, bool dst_has_pid,
- struct damon_target *src, bool src_has_pid)
+ struct damon_target *src, bool src_has_pid,
+ unsigned long src_min_sz_region)
{
int err;
- err = damon_commit_target_regions(dst, src);
+ err = damon_commit_target_regions(dst, src, src_min_sz_region);
if (err)
return err;
if (dst_has_pid)
@@ -1159,7 +1181,8 @@ static int damon_commit_targets(
if (src_target) {
err = damon_commit_target(
dst_target, damon_target_has_pid(dst),
- src_target, damon_target_has_pid(src));
+ src_target, damon_target_has_pid(src),
+ src->min_sz_region);
if (err)
return err;
} else {
@@ -1182,7 +1205,8 @@ static int damon_commit_targets(
if (!new_target)
return -ENOMEM;
err = damon_commit_target(new_target, false,
- src_target, damon_target_has_pid(src));
+ src_target, damon_target_has_pid(src),
+ src->min_sz_region);
if (err) {
damon_destroy_target(new_target, NULL);
return err;
@@ -1222,10 +1246,14 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
* 2. ops update should be done after pid handling is done (target
* committing require putting pids).
*/
- err = damon_set_attrs(dst, &src->attrs);
- if (err)
- return err;
+ if (!damon_attrs_equals(&dst->attrs, &src->attrs)) {
+ err = damon_set_attrs(dst, &src->attrs);
+ if (err)
+ return err;
+ }
dst->ops = src->ops;
+ dst->addr_unit = src->addr_unit;
+ dst->min_sz_region = src->min_sz_region;
return 0;
}
@@ -1258,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
if (ctx->attrs.min_nr_regions)
sz /= ctx->attrs.min_nr_regions;
- if (sz < DAMON_MIN_REGION)
- sz = DAMON_MIN_REGION;
+ if (sz < ctx->min_sz_region)
+ sz = ctx->min_sz_region;
return sz;
}
@@ -1603,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* @t: The target of the region.
* @rp: The pointer to the region.
* @s: The scheme to be applied.
+ * @min_sz_region: minimum region size.
*
* If a quota of a scheme has exceeded in a quota charge window, the scheme's
* action would applied to only a part of the target access pattern fulfilling
@@ -1620,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* Return: true if the region should be entirely skipped, false otherwise.
*/
static bool damos_skip_charged_region(struct damon_target *t,
- struct damon_region **rp, struct damos *s)
+ struct damon_region **rp, struct damos *s, unsigned long min_sz_region)
{
struct damon_region *r = *rp;
struct damos_quota *quota = &s->quota;
@@ -1642,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t,
if (quota->charge_addr_from && r->ar.start <
quota->charge_addr_from) {
sz_to_skip = ALIGN_DOWN(quota->charge_addr_from -
- r->ar.start, DAMON_MIN_REGION);
+ r->ar.start, min_sz_region);
if (!sz_to_skip) {
- if (damon_sz_region(r) <= DAMON_MIN_REGION)
+ if (damon_sz_region(r) <= min_sz_region)
return true;
- sz_to_skip = DAMON_MIN_REGION;
+ sz_to_skip = min_sz_region;
}
damon_split_region_at(t, r, sz_to_skip);
r = damon_next_region(r);
@@ -1671,7 +1700,8 @@ static void damos_update_stat(struct damos *s,
}
static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
- struct damon_region *r, struct damos_filter *filter)
+ struct damon_region *r, struct damos_filter *filter,
+ unsigned long min_sz_region)
{
bool matched = false;
struct damon_target *ti;
@@ -1688,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
matched = target_idx == filter->target_idx;
break;
case DAMOS_FILTER_TYPE_ADDR:
- start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION);
- end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION);
+ start = ALIGN_DOWN(filter->addr_range.start, min_sz_region);
+ end = ALIGN_DOWN(filter->addr_range.end, min_sz_region);
/* inside the range */
if (start <= r->ar.start && r->ar.end <= end) {
@@ -1725,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (damos_filter_match(ctx, t, r, filter)) {
+ if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) {
if (filter->allow)
s->core_filters_allowed = true;
return !filter->allow;
@@ -1860,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (c->ops.apply_scheme) {
if (quota->esz && quota->charged_sz + sz > quota->esz) {
sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
- DAMON_MIN_REGION);
+ c->min_sz_region);
if (!sz)
goto update_stat;
damon_split_region_at(t, r, sz);
@@ -1908,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
if (quota->esz && quota->charged_sz >= quota->esz)
continue;
- if (damos_skip_charged_region(t, &r, s))
+ if (damos_skip_charged_region(t, &r, s, c->min_sz_region))
continue;
if (!damos_valid_target(c, t, r, s))
@@ -2073,8 +2103,8 @@ static void damos_set_effective_quota(struct damos_quota *quota)
if (quota->ms) {
if (quota->total_charged_ns)
- throughput = quota->total_charged_sz * 1000000 /
- quota->total_charged_ns;
+ throughput = mult_frac(quota->total_charged_sz, 1000000,
+ quota->total_charged_ns);
else
throughput = PAGE_SIZE * 1024;
esz = min(throughput * quota->ms, esz);
@@ -2111,6 +2141,12 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
if (!quota->ms && !quota->sz && list_empty(&quota->goals))
return;
+ /* First charge window */
+ if (!quota->total_charged_sz && !quota->charged_from) {
+ quota->charged_from = jiffies;
+ damos_set_effective_quota(quota);
+ }
+
/* New charge window starts */
if (time_after_eq(jiffies, quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
@@ -2227,6 +2263,8 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
damon_for_each_region_safe(r, next, t) {
if (abs(r->nr_accesses - r->last_nr_accesses) > thres)
r->age = 0;
+ else if ((r->nr_accesses == 0) != (r->last_nr_accesses == 0))
+ r->age = 0;
else
r->age++;
@@ -2302,7 +2340,8 @@ static void damon_split_region_at(struct damon_target *t,
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_target *t, int nr_subs)
+static void damon_split_regions_of(struct damon_target *t, int nr_subs,
+ unsigned long min_sz_region)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
@@ -2312,13 +2351,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs)
sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
- sz_region > 2 * DAMON_MIN_REGION; i++) {
+ sz_region > 2 * min_sz_region; i++) {
/*
* Randomly select size of left sub-region to be at
* least 10 percent and at most 90% of original region
*/
sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
- sz_region / 10, DAMON_MIN_REGION);
+ sz_region / 10, min_sz_region);
/* Do not allow blank region */
if (sz_sub == 0 || sz_sub >= sz_region)
continue;
@@ -2358,7 +2397,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(t, nr_subregions);
+ damon_split_regions_of(t, nr_subregions, ctx->min_sz_region);
last_nr_regions = nr_regions;
}
@@ -2475,10 +2514,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
mutex_lock(&ctx->call_controls_lock);
list_del(&control->list);
mutex_unlock(&ctx->call_controls_lock);
- if (!control->repeat)
+ if (!control->repeat) {
complete(&control->completion);
- else
+ } else if (control->canceled && control->dealloc_on_cancel) {
+ kfree(control);
+ continue;
+ } else {
list_add(&control->list, &repeat_controls);
+ }
}
control = list_first_entry_or_null(&repeat_controls,
struct damon_call_control, list);
@@ -2747,7 +2790,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
addr_range.start = *start;
addr_range.end = *end;
- return damon_set_regions(t, &addr_range, 1);
+ return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION);
}
/*
@@ -2820,6 +2863,16 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed,
r->nr_accesses++;
}
+/**
+ * damon_initialized() - Return if DAMON is ready to be used.
+ *
+ * Return: true if DAMON is ready to be used, false otherwise.
+ */
+bool damon_initialized(void)
+{
+ return damon_region_cache != NULL;
+}
+
static int __init damon_init(void)
{
damon_region_cache = KMEM_CACHE(damon_region, 0);
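The damos_set_effective_quota() hunk above switches the per-millisecond throughput estimate to mult_frac() so that total_charged_sz * 1000000 cannot overflow before the division. A user-space sketch of the same trick; the helper imitates the kernel macro and the figures are made up:

#include <stdio.h>

/* imitation of mult_frac(): x * n / d without letting x * n overflow,
 * by splitting x into (x / d) * d + x % d */
static unsigned long long mult_frac_sketch(unsigned long long x,
                                           unsigned long long n,
                                           unsigned long long d)
{
    unsigned long long q = x / d, r = x % d;

    return q * n + r * n / d;
}

int main(void)
{
    unsigned long long charged_sz = 32ULL << 40;    /* 32 TiB charged (made up) */
    unsigned long long charged_ns = 2000000000ULL;  /* charged over ~2 seconds */

    /* naive charged_sz * 1000000 would exceed 64 bits before the divide */
    printf("%llu bytes per ms\n",
           mult_frac_sketch(charged_sz, 1000000, charged_ns));
    return 0;
}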
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 151a9de5ad8b..42b9a656f9de 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -112,6 +112,13 @@ static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
/*
+ * Scale factor for DAMON_LRU_SORT to ops address conversion.
+ *
+ * This parameter must not be set to 0.
+ */
+static unsigned long addr_unit __read_mostly = 1;
+
+/*
* PID of the DAMON thread
*
* If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread.
@@ -198,7 +205,21 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
return err;
- err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs);
+ /*
+ * If monitor_region_start/end are unset, always silently
+ * reset addr_unit to 1.
+ */
+ if (!monitor_region_start && !monitor_region_end)
+ addr_unit = 1;
+ param_ctx->addr_unit = addr_unit;
+ param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1);
+
+ if (!damon_lru_sort_mon_attrs.sample_interval) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = damon_set_attrs(param_ctx, &damon_lru_sort_mon_attrs);
if (err)
goto out;
@@ -285,6 +306,30 @@ static int damon_lru_sort_turn(bool on)
return damon_call(ctx, &call_control);
}
+static int damon_lru_sort_addr_unit_store(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long input_addr_unit;
+ int err = kstrtoul(val, 0, &input_addr_unit);
+
+ if (err)
+ return err;
+ if (!input_addr_unit)
+ return -EINVAL;
+
+ addr_unit = input_addr_unit;
+ return 0;
+}
+
+static const struct kernel_param_ops addr_unit_param_ops = {
+ .set = damon_lru_sort_addr_unit_store,
+ .get = param_get_ulong,
+};
+
+module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600);
+MODULE_PARM_DESC(addr_unit,
+ "Scale factor for DAMON_LRU_SORT to ops address conversion (default: 1)");
+
static int damon_lru_sort_enabled_store(const char *val,
const struct kernel_param *kp)
{
@@ -300,7 +345,7 @@ static int damon_lru_sort_enabled_store(const char *val,
return 0;
/* Called before init function. The function will handle this. */
- if (!ctx)
+ if (!damon_initialized())
goto set_param_out;
err = damon_lru_sort_turn(enable);
@@ -323,8 +368,13 @@ MODULE_PARM_DESC(enabled,
static int __init damon_lru_sort_init(void)
{
- int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
+ int err;
+ if (!damon_initialized()) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = damon_modules_new_paddr_ctx_target(&ctx, &target);
if (err)
goto out;
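damon_lru_sort_apply_parameters() above derives the minimum region size from the new addr_unit scale factor as max(DAMON_MIN_REGION / addr_unit, 1). A user-space sketch of that clamp, assuming DAMON_MIN_REGION is one 4 KiB page (its usual definition):

#include <stdio.h>

static unsigned long min_sz_region(unsigned long damon_min_region,
                                   unsigned long addr_unit)
{
    unsigned long sz = damon_min_region / addr_unit;

    return sz ? sz : 1;     /* max(DAMON_MIN_REGION / addr_unit, 1) */
}

int main(void)
{
    printf("%lu\n", min_sz_region(4096, 1));        /* 4096: plain byte addressing */
    printf("%lu\n", min_sz_region(4096, 4096));     /* 1: one unit == one page */
    printf("%lu\n", min_sz_region(4096, 16384));    /* 1: clamped, never 0 */
    return 0;
}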
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 99321ff5cb92..998c5180a603 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -303,7 +303,7 @@ static unsigned int __damon_migrate_folio_list(
* instead of migrated.
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
- __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
};
@@ -412,3 +412,12 @@ unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid)
return nr_migrated;
}
+
+bool damos_ops_has_filter(struct damos *s)
+{
+ struct damos_filter *f;
+
+ damos_for_each_ops_filter(f, s)
+ return true;
+ return false;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 61ad54aaf256..5efa5b5970de 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -21,3 +21,5 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio);
unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid);
+
+bool damos_ops_has_filter(struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 53a55c5114fb..07a8aead439e 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -18,7 +18,26 @@
#include "../internal.h"
#include "ops-common.h"
-static void damon_pa_mkold(unsigned long paddr)
+static phys_addr_t damon_pa_phys_addr(
+ unsigned long addr, unsigned long addr_unit)
+{
+ return (phys_addr_t)addr * addr_unit;
+}
+
+static unsigned long damon_pa_core_addr(
+ phys_addr_t pa, unsigned long addr_unit)
+{
+ /*
+ * Use div_u64() for avoiding linking errors related with __udivdi3,
+ * __aeabi_uldivmod, or similar problems. This should also improve the
+ * performance optimization (read div_u64() comment for the detail).
+ */
+ if (sizeof(pa) == 8 && sizeof(addr_unit) == 4)
+ return div_u64(pa, addr_unit);
+ return pa / addr_unit;
+}
+
+static void damon_pa_mkold(phys_addr_t paddr)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
@@ -29,11 +48,12 @@ static void damon_pa_mkold(unsigned long paddr)
folio_put(folio);
}
-static void __damon_pa_prepare_access_check(struct damon_region *r)
+static void __damon_pa_prepare_access_check(struct damon_region *r,
+ unsigned long addr_unit)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
- damon_pa_mkold(r->sampling_addr);
+ damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit));
}
static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
@@ -43,11 +63,11 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- __damon_pa_prepare_access_check(r);
+ __damon_pa_prepare_access_check(r, ctx->addr_unit);
}
}
-static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
+static bool damon_pa_young(phys_addr_t paddr, unsigned long *folio_sz)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
bool accessed;
@@ -62,23 +82,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
}
static void __damon_pa_check_access(struct damon_region *r,
- struct damon_attrs *attrs)
+ struct damon_attrs *attrs, unsigned long addr_unit)
{
- static unsigned long last_addr;
+ static phys_addr_t last_addr;
static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
+ phys_addr_t sampling_addr = damon_pa_phys_addr(
+ r->sampling_addr, addr_unit);
/* If the region is in the last checked page, reuse the result */
if (ALIGN_DOWN(last_addr, last_folio_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
+ ALIGN_DOWN(sampling_addr, last_folio_sz)) {
damon_update_region_access_rate(r, last_accessed, attrs);
return;
}
- last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
+ last_accessed = damon_pa_young(sampling_addr, &last_folio_sz);
damon_update_region_access_rate(r, last_accessed, attrs);
- last_addr = r->sampling_addr;
+ last_addr = sampling_addr;
}
static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
@@ -89,7 +111,8 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t) {
- __damon_pa_check_access(r, &ctx->attrs);
+ __damon_pa_check_access(
+ r, &ctx->attrs, ctx->addr_unit);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
}
@@ -125,10 +148,11 @@ static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s)
return false;
}
-static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_pageout(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
+ phys_addr_t addr, applied;
LIST_HEAD(folio_list);
bool install_young_filter = true;
struct damos_filter *filter;
@@ -149,8 +173,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
damos_add_filter(s, filter);
}
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -160,7 +184,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
folio_clear_referenced(folio);
folio_test_clear_young(folio);
@@ -179,18 +203,19 @@ put_folio:
applied = reclaim_pages(&folio_list);
cond_resched();
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, struct damos *s, bool mark_accessed,
+ struct damon_region *r, unsigned long addr_unit,
+ struct damos *s, bool mark_accessed,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied = 0;
+ phys_addr_t addr, applied = 0;
struct folio *folio;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -200,7 +225,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
if (mark_accessed)
folio_mark_accessed(folio);
@@ -212,32 +237,35 @@ put_folio:
folio_put(folio);
}
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
static unsigned long damon_pa_mark_accessed(struct damon_region *r,
- struct damos *s, unsigned long *sz_filter_passed)
+ unsigned long addr_unit, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, true,
+ return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true,
sz_filter_passed);
}
static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
- struct damos *s, unsigned long *sz_filter_passed)
+ unsigned long addr_unit, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, false,
+ return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false,
sz_filter_passed);
}
-static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_migrate(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
+ phys_addr_t addr, applied;
LIST_HEAD(folio_list);
struct folio *folio;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -247,7 +275,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
if (!folio_isolate_lru(folio))
goto put_folio;
@@ -259,29 +287,21 @@ put_folio:
applied = damon_migrate_pages(&folio_list, s->target_nid);
cond_resched();
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
-static bool damon_pa_scheme_has_filter(struct damos *s)
-{
- struct damos_filter *f;
-
- damos_for_each_ops_filter(f, s)
- return true;
- return false;
-}
-
-static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_stat(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr;
+ phys_addr_t addr;
struct folio *folio;
- if (!damon_pa_scheme_has_filter(s))
+ if (!damos_ops_has_filter(s))
return 0;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -289,7 +309,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
}
if (!damos_pa_filter_out(s, folio))
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
addr += folio_size(folio);
folio_put(folio);
}
@@ -301,18 +321,22 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
{
+ unsigned long aunit = ctx->addr_unit;
+
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r, scheme, sz_filter_passed);
+ return damon_pa_pageout(r, aunit, scheme, sz_filter_passed);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r, scheme, sz_filter_passed);
+ return damon_pa_mark_accessed(r, aunit, scheme,
+ sz_filter_passed);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r, scheme, sz_filter_passed);
+ return damon_pa_deactivate_pages(r, aunit, scheme,
+ sz_filter_passed);
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
- return damon_pa_migrate(r, scheme, sz_filter_passed);
+ return damon_pa_migrate(r, aunit, scheme, sz_filter_passed);
case DAMOS_STAT:
- return damon_pa_stat(r, scheme, sz_filter_passed);
+ return damon_pa_stat(r, aunit, scheme, sz_filter_passed);
default:
/* DAMOS actions that not yet supported by 'paddr'. */
break;
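The paddr conversions above scale DAMON core addresses by addr_unit, so that, for example, a 32-bit unsigned long region address can describe physical memory beyond 4 GiB; the div_u64() comment is about the 64-by-32 division that results on 32-bit builds. A user-space sketch of the two conversions with an assumed addr_unit of 4096:

#include <inttypes.h>
#include <stdio.h>

static uint64_t to_phys(unsigned long core_addr, unsigned long addr_unit)
{
    return (uint64_t)core_addr * addr_unit;
}

static unsigned long to_core(uint64_t phys, unsigned long addr_unit)
{
    return phys / addr_unit;    /* the kernel uses div_u64() here */
}

int main(void)
{
    unsigned long addr_unit = 4096;
    unsigned long core = 0x200000;  /* fits comfortably in 32 bits */

    printf("phys 0x%" PRIx64 "\n", to_phys(core, addr_unit));   /* 0x200000000: 8 GiB */
    printf("core 0x%lx\n", to_core(to_phys(core, addr_unit), addr_unit));
    return 0;
}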
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 3c71b4596676..7ba3d0f9a19a 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -129,6 +129,13 @@ static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
/*
+ * Scale factor for DAMON_RECLAIM to ops address conversion.
+ *
+ * This parameter must not be set to 0.
+ */
+static unsigned long addr_unit __read_mostly = 1;
+
+/*
* Skip anonymous pages reclamation.
*
* If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
@@ -194,6 +201,20 @@ static int damon_reclaim_apply_parameters(void)
if (err)
return err;
+ /*
+ * If monitor_region_start/end are unset, always silently
+ * reset addr_unit to 1.
+ */
+ if (!monitor_region_start && !monitor_region_end)
+ addr_unit = 1;
+ param_ctx->addr_unit = addr_unit;
+ param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1);
+
+ if (!damon_reclaim_mon_attrs.aggr_interval) {
+ err = -EINVAL;
+ goto out;
+ }
+
err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs);
if (err)
goto out;
@@ -289,6 +310,30 @@ static int damon_reclaim_turn(bool on)
return damon_call(ctx, &call_control);
}
+static int damon_reclaim_addr_unit_store(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long input_addr_unit;
+ int err = kstrtoul(val, 0, &input_addr_unit);
+
+ if (err)
+ return err;
+ if (!input_addr_unit)
+ return -EINVAL;
+
+ addr_unit = input_addr_unit;
+ return 0;
+}
+
+static const struct kernel_param_ops addr_unit_param_ops = {
+ .set = damon_reclaim_addr_unit_store,
+ .get = param_get_ulong,
+};
+
+module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600);
+MODULE_PARM_DESC(addr_unit,
+ "Scale factor for DAMON_RECLAIM to ops address conversion (default: 1)");
+
static int damon_reclaim_enabled_store(const char *val,
const struct kernel_param *kp)
{
@@ -304,7 +349,7 @@ static int damon_reclaim_enabled_store(const char *val,
return 0;
/* Called before init function. The function will handle this. */
- if (!ctx)
+ if (!damon_initialized())
goto set_param_out;
err = damon_reclaim_turn(enable);
@@ -327,8 +372,13 @@ MODULE_PARM_DESC(enabled,
static int __init damon_reclaim_init(void)
{
- int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
+ int err;
+ if (!damon_initialized()) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = damon_modules_new_paddr_ctx_target(&ctx, &target);
if (err)
goto out;
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index 87bcd8866d4b..d8010968bbed 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -34,11 +34,16 @@ module_param(estimated_memory_bandwidth, ulong, 0400);
MODULE_PARM_DESC(estimated_memory_bandwidth,
"Estimated memory bandwidth usage in bytes per second");
-static unsigned long memory_idle_ms_percentiles[101] __read_mostly = {0,};
-module_param_array(memory_idle_ms_percentiles, ulong, NULL, 0400);
+static long memory_idle_ms_percentiles[101] __read_mostly = {0,};
+module_param_array(memory_idle_ms_percentiles, long, NULL, 0400);
MODULE_PARM_DESC(memory_idle_ms_percentiles,
"Memory idle time percentiles in milliseconds");
+static unsigned long aggr_interval_us;
+module_param(aggr_interval_us, ulong, 0400);
+MODULE_PARM_DESC(aggr_interval_us,
+ "Current tuned aggregation interval in microseconds");
+
static struct damon_ctx *damon_stat_context;
static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c)
@@ -56,10 +61,10 @@ static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c)
MSEC_PER_SEC / c->attrs.aggr_interval;
}
-static unsigned int damon_stat_idletime(const struct damon_region *r)
+static int damon_stat_idletime(const struct damon_region *r)
{
if (r->nr_accesses)
- return 0;
+ return -1 * (r->age + 1);
return r->age + 1;
}
@@ -117,7 +122,7 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c)
while (next_percentile <= accounted_bytes * 100 / total_sz)
memory_idle_ms_percentiles[next_percentile++] =
damon_stat_idletime(region) *
- c->attrs.aggr_interval / USEC_PER_MSEC;
+ (long)c->attrs.aggr_interval / USEC_PER_MSEC;
}
kfree(sorted_regions);
}
@@ -133,6 +138,7 @@ static int damon_stat_damon_call_fn(void *data)
return 0;
last_refresh_jiffies = jiffies;
+ aggr_interval_us = c->attrs.aggr_interval;
damon_stat_set_estimated_memory_bandwidth(c);
damon_stat_set_idletime_percentiles(c);
return 0;
@@ -214,8 +220,6 @@ static void damon_stat_stop(void)
damon_destroy_ctx(damon_stat_context);
}
-static bool damon_stat_init_called;
-
static int damon_stat_enabled_store(
const char *val, const struct kernel_param *kp)
{
@@ -229,7 +233,7 @@ static int damon_stat_enabled_store(
if (is_enabled == enabled)
return 0;
- if (!damon_stat_init_called)
+ if (!damon_initialized())
/*
* probably called from command line parsing (parse_args()).
* Cannot call damon_new_ctx(). Let damon_stat_init() handle.
@@ -250,12 +254,16 @@ static int __init damon_stat_init(void)
{
int err = 0;
- damon_stat_init_called = true;
+ if (!damon_initialized()) {
+ err = -ENOMEM;
+ goto out;
+ }
/* probably set via command line */
if (enabled)
err = damon_stat_start();
+out:
if (err && enabled)
enabled = false;
return err;
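The sign change in damon_stat_idletime() above makes the exported percentiles signed: a positive value still means the region has been idle for that many aggregation intervals, while a negative value now means it has been actively accessed for that long. A small user-space sketch of the conversion, assuming a 100 ms aggregation interval:

#include <stdio.h>

static int idletime(unsigned int nr_accesses, unsigned int age)
{
    return nr_accesses ? -1 * (int)(age + 1) : (int)(age + 1);
}

int main(void)
{
    long aggr_interval_us = 100000;     /* assumed 100 ms aggregation interval */

    /* idle region, age 3  ->  +400 ms idle */
    printf("%ld ms\n", idletime(0, 3) * aggr_interval_us / 1000);
    /* hot region, age 3   ->  -400 ms, i.e. accessed for ~400 ms */
    printf("%ld ms\n", idletime(5, 3) * aggr_interval_us / 1000);
    return 0;
}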
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 6d2b0dab50cb..2fc722f998f8 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -834,6 +834,7 @@ static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = {
struct damon_sysfs_context {
struct kobject kobj;
enum damon_ops_id ops_id;
+ unsigned long addr_unit;
struct damon_sysfs_attrs *attrs;
struct damon_sysfs_targets *targets;
struct damon_sysfs_schemes *schemes;
@@ -849,6 +850,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc(
return NULL;
context->kobj = (struct kobject){};
context->ops_id = ops_id;
+ context->addr_unit = 1;
return context;
}
@@ -997,6 +999,32 @@ static ssize_t operations_store(struct kobject *kobj,
return -EINVAL;
}
+static ssize_t addr_unit_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%lu\n", context->addr_unit);
+}
+
+static ssize_t addr_unit_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ unsigned long input_addr_unit;
+ int err = kstrtoul(buf, 0, &input_addr_unit);
+
+ if (err)
+ return err;
+ if (!input_addr_unit)
+ return -EINVAL;
+
+ context->addr_unit = input_addr_unit;
+ return count;
+}
+
static void damon_sysfs_context_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_context, kobj));
@@ -1008,9 +1036,13 @@ static struct kobj_attribute damon_sysfs_context_avail_operations_attr =
static struct kobj_attribute damon_sysfs_context_operations_attr =
__ATTR_RW_MODE(operations, 0600);
+static struct kobj_attribute damon_sysfs_context_addr_unit_attr =
+ __ATTR_RW_MODE(addr_unit, 0600);
+
static struct attribute *damon_sysfs_context_attrs[] = {
&damon_sysfs_context_avail_operations_attr.attr,
&damon_sysfs_context_operations_attr.attr,
+ &damon_sysfs_context_addr_unit_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_context);
@@ -1260,14 +1292,18 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
struct damon_sysfs_kdamond, kobj);
- struct damon_ctx *ctx = kdamond->damon_ctx;
- bool running;
+ struct damon_ctx *ctx;
+ bool running = false;
- if (!ctx)
- running = false;
- else
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+
+ ctx = kdamond->damon_ctx;
+ if (ctx)
running = damon_is_running(ctx);
+ mutex_unlock(&damon_sysfs_lock);
+
return sysfs_emit(buf, "%s\n", running ?
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] :
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]);
@@ -1297,7 +1333,8 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
}
static int damon_sysfs_set_regions(struct damon_target *t,
- struct damon_sysfs_regions *sysfs_regions)
+ struct damon_sysfs_regions *sysfs_regions,
+ unsigned long min_sz_region)
{
struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr,
sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN);
@@ -1319,7 +1356,7 @@ static int damon_sysfs_set_regions(struct damon_target *t,
if (ranges[i - 1].end > ranges[i].start)
goto out;
}
- err = damon_set_regions(t, ranges, sysfs_regions->nr);
+ err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region);
out:
kfree(ranges);
return err;
@@ -1340,7 +1377,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
/* caller will destroy targets */
return -EINVAL;
}
- return damon_sysfs_set_regions(t, sys_target->regions);
+ return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region);
}
static int damon_sysfs_add_targets(struct damon_ctx *ctx,
@@ -1397,6 +1434,11 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
err = damon_select_ops(ctx, sys_ctx->ops_id);
if (err)
return err;
+ ctx->addr_unit = sys_ctx->addr_unit;
+ /* addr_unit is respected by only DAMON_OPS_PADDR */
+ if (sys_ctx->ops_id == DAMON_OPS_PADDR)
+ ctx->min_sz_region = max(
+ DAMON_MIN_REGION / sys_ctx->addr_unit, 1);
err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
if (err)
return err;
@@ -1530,14 +1572,10 @@ static int damon_sysfs_repeat_call_fn(void *data)
return 0;
}
-static struct damon_call_control damon_sysfs_repeat_call_control = {
- .fn = damon_sysfs_repeat_call_fn,
- .repeat = true,
-};
-
static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
{
struct damon_ctx *ctx;
+ struct damon_call_control *repeat_call_control;
int err;
if (damon_sysfs_kdamond_running(kdamond))
@@ -1550,18 +1588,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
damon_destroy_ctx(kdamond->damon_ctx);
kdamond->damon_ctx = NULL;
+ repeat_call_control = kmalloc(sizeof(*repeat_call_control),
+ GFP_KERNEL);
+ if (!repeat_call_control)
+ return -ENOMEM;
+
ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
- if (IS_ERR(ctx))
+ if (IS_ERR(ctx)) {
+ kfree(repeat_call_control);
return PTR_ERR(ctx);
+ }
err = damon_start(&ctx, 1, false);
if (err) {
+ kfree(repeat_call_control);
damon_destroy_ctx(ctx);
return err;
}
kdamond->damon_ctx = ctx;
- damon_sysfs_repeat_call_control.data = kdamond;
- damon_call(ctx, &damon_sysfs_repeat_call_control);
+ repeat_call_control->fn = damon_sysfs_repeat_call_fn;
+ repeat_call_control->data = kdamond;
+ repeat_call_control->repeat = true;
+ repeat_call_control->dealloc_on_cancel = true;
+ damon_call(ctx, repeat_call_control);
return err;
}
@@ -1581,12 +1630,14 @@ static int damon_sysfs_damon_call(int (*fn)(void *data),
struct damon_sysfs_kdamond *kdamond)
{
struct damon_call_control call_control = {};
+ int err;
if (!kdamond->damon_ctx)
return -EINVAL;
call_control.fn = fn;
call_control.data = kdamond;
- return damon_call(kdamond->damon_ctx, &call_control);
+ err = damon_call(kdamond->damon_ctx, &call_control);
+ return err ? err : call_control.return_code;
}
struct damon_sysfs_schemes_walk_data {
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index dfedfff19940..51369e35298b 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
- damon_split_regions_of(t, 2);
+ damon_split_regions_of(t, 2, DAMON_MIN_REGION);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
- damon_split_regions_of(t, 4);
+ damon_split_regions_of(t, 4, DAMON_MIN_REGION);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
damon_destroy_ctx(c);
@@ -303,7 +303,7 @@ static void damon_test_set_regions(struct kunit *test)
damon_add_region(r1, t);
damon_add_region(r2, t);
- damon_set_regions(t, &range, 1);
+ damon_set_regions(t, &range, 1, DAMON_MIN_REGION);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
damon_for_each_region(r, t) {
@@ -419,6 +419,22 @@ static void damos_test_new_filter(struct kunit *test)
damos_destroy_filter(filter);
}
+static void damos_test_commit_filter(struct kunit *test)
+{
+ struct damos_filter *src_filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_ANON, true, true);
+ struct damos_filter *dst_filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_ACTIVE, false, false);
+
+ damos_commit_filter(dst_filter, src_filter);
+ KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type);
+ KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching);
+ KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow);
+
+ damos_destroy_filter(src_filter);
+ damos_destroy_filter(dst_filter);
+}
+
static void damos_test_filter_out(struct kunit *test)
{
struct damon_target *t;
@@ -434,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test)
damon_add_region(r, t);
/* region in the range */
- KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 2;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region after the range */
r->ar.start = DAMON_MIN_REGION * 6;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region started before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 4;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
@@ -465,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test)
/* region started in the range */
r->ar.start = DAMON_MIN_REGION * 2;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
@@ -594,6 +615,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_attrs),
KUNIT_CASE(damon_test_moving_sum),
KUNIT_CASE(damos_test_new_filter),
+ KUNIT_CASE(damos_test_commit_filter),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
KUNIT_CASE(damon_test_set_filters_default_reject),
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index d2b37ccf2cc0..fce38dd53cf8 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
damon_add_region(r, t);
}
- damon_set_regions(t, three_regions, 3);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
for (i = 0; i < nr_expected / 2; i++) {
r = __nth_region_of(t, i);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 87e825349bdf..8c048f9b129e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
if (damon_va_three_regions(t, three_regions))
continue;
- damon_set_regions(t, three_regions, 3);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
}
}
@@ -890,6 +890,107 @@ free_lists:
return applied * PAGE_SIZE;
}
+struct damos_va_stat_private {
+ struct damos *scheme;
+ unsigned long *sz_filter_passed;
+};
+
+static inline bool damos_va_invalid_folio(struct folio *folio,
+ struct damos *s)
+{
+ return !folio || folio == s->last_applied;
+}
+
+static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_stat_private *priv = walk->private;
+ struct damos *s = priv->scheme;
+ unsigned long *sz_filter_passed = priv->sz_filter_passed;
+ struct vm_area_struct *vma = walk->vma;
+ struct folio *folio;
+ spinlock_t *ptl;
+ pte_t *start_pte, *pte, ptent;
+ int nr;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t pmde;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+ pmde = pmdp_get(pmd);
+ if (!pmd_present(pmde))
+ goto huge_unlock;
+
+ folio = vm_normal_folio_pmd(vma, addr, pmde);
+
+ if (damos_va_invalid_folio(folio, s))
+ goto huge_unlock;
+
+ if (!damos_va_filter_out(s, folio, vma, addr, NULL, pmd))
+ *sz_filter_passed += folio_size(folio);
+ s->last_applied = folio;
+
+huge_unlock:
+ spin_unlock(ptl);
+ return 0;
+ }
+#endif
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
+
+ for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
+ ptent = ptep_get(pte);
+
+ if (pte_none(ptent) || !pte_present(ptent))
+ continue;
+
+ folio = vm_normal_folio(vma, addr, ptent);
+
+ if (damos_va_invalid_folio(folio, s))
+ continue;
+
+ if (!damos_va_filter_out(s, folio, vma, addr, pte, NULL))
+ *sz_filter_passed += folio_size(folio);
+ nr = folio_nr_pages(folio);
+ s->last_applied = folio;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+ return 0;
+}
+
+static unsigned long damos_va_stat(struct damon_target *target,
+ struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ struct damos_va_stat_private priv;
+ struct mm_struct *mm;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = damos_va_stat_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+ };
+
+ priv.scheme = s;
+ priv.sz_filter_passed = sz_filter_passed;
+
+ if (!damos_ops_has_filter(s))
+ return 0;
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ return 0;
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return 0;
+}
+
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
@@ -916,7 +1017,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_MIGRATE_COLD:
return damos_va_migrate(t, r, scheme, sz_filter_passed);
case DAMOS_STAT:
- return 0;
+ return damos_va_stat(t, r, scheme, sz_filter_passed);
default:
/*
* DAMOS actions that are not yet supported by 'vaddr'.
diff --git a/mm/debug.c b/mm/debug.c
index b4388f4dcd4d..64ddb0c4b4be 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm)
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
- "binfmt %px flags %lx\n"
+ "binfmt %px flags %*pb\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
@@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm)
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
- mm->binfmt, mm->flags,
+ mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm),
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
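The dump_mm() hunk switches the flags field from a bare %lx to the %*pb bitmap format, where the field width argument carries the number of valid bits. A minimal kernel-style sketch (not part of the patch; everything except the format specifier is illustrative):

#include <linux/printk.h>
#include <linux/bitmap.h>

static void print_flags_example(void)
{
        DECLARE_BITMAP(flags, 64);

        bitmap_zero(flags, 64);
        __set_bit(3, flags);
        __set_bit(17, flags);

        /* prints the raw bitmap in hex; %*pbl would print it as a range list */
        pr_info("flags %*pb\n", 64, flags);
}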
diff --git a/mm/execmem.c b/mm/execmem.c
index 0822305413ec..810a4ba9c924 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -38,9 +38,6 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size,
if (kasan)
vm_flags |= VM_DEFER_KMEMLEAK;
- if (vm_flags & VM_ALLOW_HUGE_VMAP)
- align = PMD_SIZE;
-
p = __vmalloc_node_range(size, align, start, end, gfp_flags,
pgprot, vm_flags, NUMA_NO_NODE,
__builtin_return_address(0));
diff --git a/mm/filemap.c b/mm/filemap.c
index 751838ef05e5..a52dd38d2b4a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping,
__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
+ if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
+ mod_node_page_state(folio_pgdat(folio),
+ NR_KERNEL_FILE_PAGES, -nr);
/*
* At this point folio must be either written or cleaned by
@@ -960,8 +963,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
{
void *shadow = NULL;
int ret;
+ struct mem_cgroup *tmp;
+ bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);
+ if (kernel_file)
+ tmp = set_active_memcg(root_mem_cgroup);
ret = mem_cgroup_charge(folio, NULL, gfp);
+ if (kernel_file)
+ set_active_memcg(tmp);
if (ret)
return ret;
@@ -983,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
if (!(gfp & __GFP_WRITE) && shadow)
workingset_refault(folio, shadow);
folio_add_lru(folio);
+ if (kernel_file)
+ mod_node_page_state(folio_pgdat(folio),
+ NR_KERNEL_FILE_PAGES,
+ folio_nr_pages(folio));
}
return ret;
}
@@ -1140,10 +1153,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
- if (test_bit(key->bit_nr, &key->folio->flags))
+ if (test_bit(key->bit_nr, &key->folio->flags.f))
return -1;
if (flags & WQ_FLAG_CUSTOM) {
- if (test_and_set_bit(key->bit_nr, &key->folio->flags))
+ if (test_and_set_bit(key->bit_nr, &key->folio->flags.f))
return -1;
flags |= WQ_FLAG_DONE;
}
@@ -1226,9 +1239,9 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
struct wait_queue_entry *wait)
{
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(bit_nr, &folio->flags))
+ if (test_and_set_bit(bit_nr, &folio->flags.f))
return false;
- } else if (test_bit(bit_nr, &folio->flags))
+ } else if (test_bit(bit_nr, &folio->flags.f))
return false;
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1961,7 +1974,7 @@ no_page:
gfp &= ~__GFP_FS;
if (fgp_flags & FGP_NOWAIT) {
gfp &= ~GFP_KERNEL;
- gfp |= GFP_NOWAIT | __GFP_NOWARN;
+ gfp |= GFP_NOWAIT;
}
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
@@ -2447,6 +2460,9 @@ static bool filemap_range_uptodate(struct address_space *mapping,
pos -= folio_pos(folio);
}
+ if (pos == 0 && count >= folio_size(folio))
+ return false;
+
return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}
@@ -2584,8 +2600,9 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count,
unsigned int flags;
int err = 0;
- /* "last_index" is the index of the page beyond the end of the read */
- last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
+ /* "last_index" is the index of the folio beyond the end of the read */
+ last_index = round_up(iocb->ki_pos + count,
+ mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT;
retry:
if (fatal_signal_pending(current))
return -EINTR;
@@ -2619,9 +2636,10 @@ retry:
goto err;
}
if (!folio_test_uptodate(folio)) {
- if ((iocb->ki_flags & IOCB_WAITQ) &&
- folio_batch_count(fbatch) > 1)
- iocb->ki_flags |= IOCB_NOWAIT;
+ if (folio_batch_count(fbatch) > 1) {
+ err = -EAGAIN;
+ goto err;
+ }
err = filemap_update_page(iocb, mapping, count, folio,
need_uptodate);
if (err)
@@ -3323,9 +3341,17 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
- mmap_miss = READ_ONCE(ra->mmap_miss);
- if (mmap_miss)
- WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ /*
+ * If the folio is locked, we're likely racing against another fault.
+ * Don't touch the mmap_miss counter to avoid decreasing it multiple
+ * times for a single folio and breaking the balance with the mmap_miss
+ * increase in do_sync_mmap_readahead().
+ */
+ if (likely(!folio_test_locked(folio))) {
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ }
if (folio_test_readahead(folio)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
@@ -3639,10 +3665,26 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
unsigned long addr, unsigned int nr_pages,
unsigned long *rss, unsigned short *mmap_miss)
{
+ unsigned int ref_from_caller = 1;
vm_fault_t ret = 0;
struct page *page = folio_page(folio, start);
unsigned int count = 0;
pte_t *old_ptep = vmf->pte;
+ unsigned long addr0;
+
+ /*
+ * Map the large folio fully where possible.
+ *
+ * The folio must not cross VMA or page table boundary.
+ */
+ addr0 = addr - start * PAGE_SIZE;
+ if (folio_within_vma(folio, vmf->vma) &&
+ (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
+ vmf->pte -= start;
+ page -= start;
+ addr = addr0;
+ nr_pages = folio_nr_pages(folio);
+ }
do {
if (PageHWPoison(page + count))
@@ -3672,7 +3714,8 @@ skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
*rss += count;
- folio_ref_add(folio, count);
+ folio_ref_add(folio, count - ref_from_caller);
+ ref_from_caller = 0;
if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
}
@@ -3687,12 +3730,16 @@ skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
*rss += count;
- folio_ref_add(folio, count);
+ folio_ref_add(folio, count - ref_from_caller);
+ ref_from_caller = 0;
if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
}
vmf->pte = old_ptep;
+ if (ref_from_caller)
+ /* Locked folios cannot get truncated. */
+ folio_ref_dec(folio);
return ret;
}
@@ -3705,7 +3752,7 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct page *page = &folio->page;
if (PageHWPoison(page))
- return ret;
+ goto out;
/* See comment of filemap_map_folio_range() */
if (!folio_test_workingset(folio))
@@ -3717,15 +3764,18 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
* the fault-around logic.
*/
if (!pte_none(ptep_get(vmf->pte)))
- return ret;
+ goto out;
if (vmf->address == addr)
ret = VM_FAULT_NOPAGE;
set_pte_range(vmf, folio, page, 1, addr);
(*rss)++;
- folio_ref_inc(folio);
+ return ret;
+out:
+ /* Locked folios cannot get truncated. */
+ folio_ref_dec(folio);
return ret;
}
@@ -3785,7 +3835,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
nr_pages, &rss, &mmap_miss);
folio_unlock(folio);
- folio_put(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
add_mm_counter(vma->vm_mm, folio_type, rss);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4491,7 +4540,7 @@ static void filemap_cachestat(struct address_space *mapping,
* invalidation, so there might not be
* a shadow in the swapcache (yet).
*/
- shadow = get_shadow_from_swap_cache(swp);
+ shadow = swap_cache_get_shadow(swp);
if (!shadow)
goto resched;
}
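The new last_index computation rounds the end of the read up to the mapping's minimum folio size before shifting down to a page index. A standalone sketch of just that arithmetic, under assumed values (4 KiB pages, 16 KiB minimum folios):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

/* align must be a power of two */
static unsigned long round_up_to(unsigned long x, unsigned long align)
{
        return (x + align - 1) & ~(align - 1);
}

int main(void)
{
        unsigned long ki_pos = 5000;                     /* read starts mid-page */
        unsigned long count = 3000;                      /* read length in bytes */
        unsigned long min_folio_nrbytes = 4 * PAGE_SIZE; /* 16 KiB minimum folio */
        unsigned long last_index;

        last_index = round_up_to(ki_pos + count, min_folio_nrbytes) >> PAGE_SHIFT;
        printf("last_index = %lu\n", last_index);        /* 8000 -> 16384 -> 4 */
        return 0;
}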
diff --git a/mm/gup.c b/mm/gup.c
index adffe663594d..a8ba5112e4d0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -28,11 +28,6 @@
#include "internal.h"
#include "swap.h"
-struct follow_page_context {
- struct dev_pagemap *pgmap;
- unsigned int page_mask;
-};
-
static inline void sanity_check_pinned_pages(struct page **pages,
unsigned long npages)
{
@@ -148,7 +143,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
- if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page)))
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
return -EREMOTEIO;
if (flags & FOLL_GET)
@@ -237,7 +232,7 @@ void folio_add_pin(struct folio *folio)
static inline struct folio *gup_folio_range_next(struct page *start,
unsigned long npages, unsigned long i, unsigned int *ntails)
{
- struct page *next = nth_page(start, i);
+ struct page *next = start + i;
struct folio *folio = page_folio(next);
unsigned int nr = 1;
@@ -342,6 +337,10 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
* "gup-pinned page range" refers to a range of pages that has had one of the
* pin_user_pages() variants called on that page.
*
+ * The page range must be truly physically contiguous: the page range
+ * corresponds to a contiguous PFN range and all pages can be iterated
+ * naturally.
+ *
* For the page ranges defined by [page .. page+npages], make that range (or
* its head pages, if a compound page) dirty, if @make_dirty is true, and if the
* page range was previously listed as clean.
@@ -359,6 +358,8 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
struct folio *folio;
unsigned int nr;
+ VM_WARN_ON_ONCE(!page_range_contiguous(page, npages));
+
for (i = 0; i < npages; i += nr) {
folio = gup_folio_range_next(page, npages, i, &nr);
if (make_dirty && !folio_test_dirty(folio)) {
@@ -475,29 +476,15 @@ EXPORT_SYMBOL_GPL(unpin_folios);
* lifecycle. Avoid setting the bit unless necessary, or it might cause write
* cache bouncing on large SMP machines for concurrent pinned gups.
*/
-static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
+static inline void mm_set_has_pinned_flag(struct mm_struct *mm)
{
- if (!test_bit(MMF_HAS_PINNED, mm_flags))
- set_bit(MMF_HAS_PINNED, mm_flags);
+ if (!mm_flags_test(MMF_HAS_PINNED, mm))
+ mm_flags_set(MMF_HAS_PINNED, mm);
}
#ifdef CONFIG_MMU
#ifdef CONFIG_HAVE_GUP_FAST
-static int record_subpages(struct page *page, unsigned long sz,
- unsigned long addr, unsigned long end,
- struct page **pages)
-{
- struct page *start_page;
- int nr;
-
- start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
- for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
- pages[nr] = nth_page(start_page, nr);
-
- return nr;
-}
-
/**
* try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
* @page: pointer to page to be grabbed
@@ -661,7 +648,7 @@ static inline bool can_follow_write_pud(pud_t pud, struct page *page,
static struct page *follow_huge_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp,
- int flags, struct follow_page_context *ctx)
+ int flags, unsigned long *page_mask)
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
@@ -688,7 +675,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
if (ret)
page = ERR_PTR(ret);
else
- ctx->page_mask = HPAGE_PUD_NR - 1;
+ *page_mask = HPAGE_PUD_NR - 1;
return page;
}
@@ -714,7 +701,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t pmdval = *pmd;
@@ -751,7 +738,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
- ctx->page_mask = HPAGE_PMD_NR - 1;
+ *page_mask = HPAGE_PMD_NR - 1;
return page;
}
@@ -759,7 +746,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
#else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp,
- int flags, struct follow_page_context *ctx)
+ int flags, unsigned long *page_mask)
{
return NULL;
}
@@ -767,7 +754,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
return NULL;
}
@@ -813,8 +800,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, unsigned int flags,
- struct dev_pagemap **pgmap)
+ unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
struct folio *folio;
@@ -912,7 +898,7 @@ no_page:
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
pmd_t *pmd, pmdval;
spinlock_t *ptl;
@@ -926,7 +912,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
if (!pmd_present(pmdval))
return no_page_table(vma, flags, address);
if (likely(!pmd_leaf(pmdval)))
- return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ return follow_page_pte(vma, address, pmd, flags);
if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
return no_page_table(vma, flags, address);
@@ -939,16 +925,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
}
if (unlikely(!pmd_leaf(pmdval))) {
spin_unlock(ptl);
- return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ return follow_page_pte(vma, address, pmd, flags);
}
if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
/* If pmd was left empty, stuff a page table in there quickly */
return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
- follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ follow_page_pte(vma, address, pmd, flags);
}
- page = follow_huge_pmd(vma, address, pmd, flags, ctx);
+ page = follow_huge_pmd(vma, address, pmd, flags, page_mask);
spin_unlock(ptl);
return page;
}
@@ -956,7 +942,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
static struct page *follow_pud_mask(struct vm_area_struct *vma,
unsigned long address, p4d_t *p4dp,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
pud_t *pudp, pud;
spinlock_t *ptl;
@@ -969,7 +955,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
if (pud_leaf(pud)) {
ptl = pud_lock(mm, pudp);
- page = follow_huge_pud(vma, address, pudp, flags, ctx);
+ page = follow_huge_pud(vma, address, pudp, flags, page_mask);
spin_unlock(ptl);
if (page)
return page;
@@ -978,13 +964,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
if (unlikely(pud_bad(pud)))
return no_page_table(vma, flags, address);
- return follow_pmd_mask(vma, address, pudp, flags, ctx);
+ return follow_pmd_mask(vma, address, pudp, flags, page_mask);
}
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
unsigned long address, pgd_t *pgdp,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
p4d_t *p4dp, p4d;
@@ -995,7 +981,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
if (!p4d_present(p4d) || p4d_bad(p4d))
return no_page_table(vma, flags, address);
- return follow_pud_mask(vma, address, p4dp, flags, ctx);
+ return follow_pud_mask(vma, address, p4dp, flags, page_mask);
}
/**
@@ -1003,20 +989,16 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
- * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
- * pointer to output page_mask
+ * @page_mask: a pointer to output page_mask
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
- * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
- * the device's dev_pagemap metadata to avoid repeating expensive lookups.
- *
* When getting an anonymous page and the caller has to trigger unsharing
* of a shared anonymous page first, -EMLINK is returned. The caller should
* trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
* relevant with FOLL_PIN and !FOLL_WRITE.
*
- * On output, the @ctx->page_mask is set according to the size of the page.
+ * On output, @page_mask is set according to the size of the page.
*
* Return: the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
@@ -1024,7 +1006,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
*/
static struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
pgd_t *pgd;
struct mm_struct *mm = vma->vm_mm;
@@ -1032,13 +1014,13 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
vma_pgtable_walk_begin(vma);
- ctx->page_mask = 0;
+ *page_mask = 0;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
page = no_page_table(vma, flags, address);
else
- page = follow_p4d_mask(vma, address, pgd, flags, ctx);
+ page = follow_p4d_mask(vma, address, pgd, flags, page_mask);
vma_pgtable_walk_end(vma);
@@ -1376,7 +1358,7 @@ static long __get_user_pages(struct mm_struct *mm,
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
- struct follow_page_context ctx = { NULL };
+ unsigned long page_mask = 0;
if (!nr_pages)
return 0;
@@ -1418,7 +1400,7 @@ static long __get_user_pages(struct mm_struct *mm,
pages ? &page : NULL);
if (ret)
goto out;
- ctx.page_mask = 0;
+ page_mask = 0;
goto next_page;
}
@@ -1441,7 +1423,7 @@ retry:
}
cond_resched();
- page = follow_page_mask(vma, start, gup_flags, &ctx);
+ page = follow_page_mask(vma, start, gup_flags, &page_mask);
if (!page || PTR_ERR(page) == -EMLINK) {
ret = faultin_page(vma, start, gup_flags,
PTR_ERR(page) == -EMLINK, locked);
@@ -1474,7 +1456,7 @@ retry:
goto out;
}
next_page:
- page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
@@ -1512,7 +1494,7 @@ next_page:
}
for (j = 0; j < page_increm; j++) {
- subpage = nth_page(page, j);
+ subpage = page + j;
pages[i + j] = subpage;
flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
flush_dcache_page(subpage);
@@ -1524,8 +1506,6 @@ next_page:
nr_pages -= page_increm;
} while (nr_pages);
out:
- if (ctx.pgmap)
- put_dev_pagemap(ctx.pgmap);
return i ? i : ret;
}
@@ -1693,7 +1673,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
mmap_assert_locked(mm);
if (flags & FOLL_PIN)
- mm_set_has_pinned_flag(&mm->flags);
+ mm_set_has_pinned_flag(mm);
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
@@ -2287,8 +2267,8 @@ static unsigned long collect_longterm_unpinnable_folios(
struct pages_or_folios *pofs)
{
unsigned long collected = 0;
- bool drain_allow = true;
struct folio *folio;
+ int drained = 0;
long i = 0;
for (folio = pofs_get_folio(pofs, i); folio;
@@ -2307,9 +2287,17 @@ static unsigned long collect_longterm_unpinnable_folios(
continue;
}
- if (!folio_test_lru(folio) && drain_allow) {
+ if (drained == 0 && folio_may_be_lru_cached(folio) &&
+ folio_ref_count(folio) !=
+ folio_expected_ref_count(folio) + 1) {
+ lru_add_drain();
+ drained = 1;
+ }
+ if (drained == 1 && folio_may_be_lru_cached(folio) &&
+ folio_ref_count(folio) !=
+ folio_expected_ref_count(folio) + 1) {
lru_add_drain_all();
- drain_allow = false;
+ drained = 2;
}
if (!folio_isolate_lru(folio))
@@ -2853,7 +2841,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
{
- struct dev_pagemap *pgmap = NULL;
int ret = 0;
pte_t *ptep, *ptem;
@@ -2911,12 +2898,9 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
* see Documentation/core-api/pin_user_pages.rst for
* details.
*/
- if (flags & FOLL_PIN) {
- ret = arch_make_folio_accessible(folio);
- if (ret) {
- gup_put_folio(folio, 1, flags);
- goto pte_unmap;
- }
+ if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
}
folio_set_referenced(folio);
pages[*nr] = page;
@@ -2926,8 +2910,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
ret = 1;
pte_unmap:
- if (pgmap)
- put_dev_pagemap(pgmap);
pte_unmap(ptem);
return ret;
}
@@ -2964,8 +2946,8 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (pmd_special(orig))
return 0;
- page = pmd_page(orig);
- refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
+ refs = (end - addr) >> PAGE_SHIFT;
+ page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
@@ -2985,7 +2967,10 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ pages += *nr;
*nr += refs;
+ for (; refs; refs--)
+ *(pages++) = page++;
folio_set_referenced(folio);
return 1;
}
@@ -3004,8 +2989,8 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
if (pud_special(orig))
return 0;
- page = pud_page(orig);
- refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
+ refs = (end - addr) >> PAGE_SHIFT;
+ page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
@@ -3026,7 +3011,10 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ pages += *nr;
*nr += refs;
+ for (; refs; refs--)
+ *(pages++) = page++;
folio_set_referenced(folio);
return 1;
}
@@ -3210,7 +3198,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
return -EINVAL;
if (gup_flags & FOLL_PIN)
- mm_set_has_pinned_flag(&current->mm->flags);
+ mm_set_has_pinned_flag(current->mm);
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(&current->mm->mmap_lock);
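Two of the gup.c changes replace helpers with plain pointer arithmetic: page_increm counts the pages left in the current (possibly huge) mapping, and refs replaces record_subpages() with a shift. A standalone sketch of both computations, assuming 4 KiB pages and 2 MiB PMD mappings:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define HPAGE_PMD_NR    512UL   /* 2 MiB huge page / 4 KiB base pages */

int main(void)
{
        unsigned long page_mask = HPAGE_PMD_NR - 1;
        unsigned long start = 0x200000UL + 5 * PAGE_SIZE;  /* 5 pages into a PMD */
        unsigned long end = 0x400000UL;                    /* end of that PMD */

        /* pages left until the end of the current huge mapping */
        unsigned long page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);

        /* pages covered by [start, end), as computed instead of record_subpages() */
        unsigned long refs = (end - start) >> PAGE_SHIFT;

        printf("page_increm = %lu, refs = %lu\n", page_increm, refs);  /* 507, 507 */
        return 0;
}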
diff --git a/mm/highmem.c b/mm/highmem.c
index ef3189b36cad..b5c8e4c2d5d4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -61,7 +61,7 @@ static inline int kmap_local_calc_idx(int idx)
/*
* Determine color of virtual address where the page should be mapped.
*/
-static inline unsigned int get_pkmap_color(struct page *page)
+static inline unsigned int get_pkmap_color(const struct page *page)
{
return 0;
}
@@ -334,7 +334,7 @@ EXPORT_SYMBOL(kmap_high);
*
* This can be called from any context.
*/
-void *kmap_high_get(struct page *page)
+void *kmap_high_get(const struct page *page)
{
unsigned long vaddr, flags;
@@ -356,7 +356,7 @@ void *kmap_high_get(struct page *page)
* If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
* only from user context.
*/
-void kunmap_high(struct page *page)
+void kunmap_high(const struct page *page)
{
unsigned long vaddr;
unsigned long nr;
@@ -508,7 +508,7 @@ static inline void kmap_local_idx_pop(void)
#endif
#ifndef arch_kmap_local_high_get
-static inline void *arch_kmap_local_high_get(struct page *page)
+static inline void *arch_kmap_local_high_get(const struct page *page)
{
return NULL;
}
@@ -572,7 +572,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
}
EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot);
-void *__kmap_local_page_prot(struct page *page, pgprot_t prot)
+void *__kmap_local_page_prot(const struct page *page, pgprot_t prot)
{
void *kmap;
diff --git a/mm/hmm.c b/mm/hmm.c
index d545e2494994..3e00f08722d5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -326,6 +326,68 @@ fault:
return hmm_vma_fault(addr, end, required_fault, walk);
}
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
+ unsigned long end, unsigned long *hmm_pfns,
+ pmd_t pmd)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long npages = (end - start) >> PAGE_SHIFT;
+ unsigned long addr = start;
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+ unsigned int required_fault;
+
+ if (is_device_private_entry(entry) &&
+ pfn_swap_entry_folio(entry)->pgmap->owner ==
+ range->dev_private_owner) {
+ unsigned long cpu_flags = HMM_PFN_VALID |
+ hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
+ unsigned long pfn = swp_offset_pfn(entry);
+ unsigned long i;
+
+ if (is_writable_device_private_entry(entry))
+ cpu_flags |= HMM_PFN_WRITE;
+
+ /*
+ * Fully populate the PFN list though subsequent PFNs could be
+ * inferred, because drivers which are not yet aware of large
+ * folios probably do not support sparsely populated PFN lists.
+ */
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
+
+ return 0;
+ }
+
+ required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
+ npages, 0);
+ if (required_fault) {
+ if (is_device_private_entry(entry))
+ return hmm_vma_fault(addr, end, required_fault, walk);
+ else
+ return -EFAULT;
+ }
+
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+}
+#else
+static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
+ unsigned long end, unsigned long *hmm_pfns,
+ pmd_t pmd)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long npages = (end - start) >> PAGE_SHIFT;
+
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
+ return -EFAULT;
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+}
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
static int hmm_vma_walk_pmd(pmd_t *pmdp,
unsigned long start,
unsigned long end,
@@ -354,11 +416,9 @@ again:
return hmm_pfns_fill(start, end, range, 0);
}
- if (!pmd_present(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
- return -EFAULT;
- return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
- }
+ if (!pmd_present(pmd))
+ return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
+ pmd);
if (pmd_trans_huge(pmd)) {
/*
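The device-private branch above fills every hmm_pfns[] slot even though later PFNs follow from the first one. A standalone sketch of that fill loop; the flag bits here are placeholders, not the real HMM_PFN_* values:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PFN_VALID       (1UL << 63)     /* placeholder, not the real flag */
#define PFN_WRITE       (1UL << 62)     /* placeholder, not the real flag */

int main(void)
{
        unsigned long pfns[8];          /* pretend the PMD covers 8 base pages */
        unsigned long pfn = 0x1000;     /* first PFN of the huge mapping */
        unsigned long flags = PFN_VALID | PFN_WRITE;
        unsigned long addr, end = 8 * PAGE_SIZE, i;

        /* fill every slot, one entry per base page */
        for (i = 0, addr = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
                pfns[i] = pfn | flags;

        for (i = 0; i < 8; i++)
                printf("pfns[%lu] = %#lx\n", i, pfns[i]);
        return 0;
}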
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f09..5acca24bbabb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
- bool smaps = tva_flags & TVA_SMAPS;
- bool in_pf = tva_flags & TVA_IN_PF;
- bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+ const bool smaps = type == TVA_SMAPS;
+ const bool in_pf = type == TVA_PAGEFAULT;
+ const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
unsigned long supported_orders;
/* Check the intersection of requested and supported orders. */
@@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
@@ -167,14 +167,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!in_pf && shmem_file(vma->vm_file))
return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
- !enforce_sysfs);
+ forced_collapse);
if (!vma_is_anonymous(vma)) {
/*
- * Enforce sysfs THP requirements as necessary. Anonymous vmas
+ * Enforce THP collapse requirements as necessary. Anonymous vmas
* were already handled in thp_vma_allowable_orders().
*/
- if (enforce_sysfs &&
+ if (!forced_collapse &&
(!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
!hugepage_global_always())))
return 0;
@@ -207,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
-static bool get_huge_zero_page(void)
+static bool get_huge_zero_folio(void)
{
struct folio *zero_folio;
retry:
@@ -237,7 +237,7 @@ retry:
return true;
}
-static void put_huge_zero_page(void)
+static void put_huge_zero_folio(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -248,33 +248,39 @@ static void put_huge_zero_page(void)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return huge_zero_folio;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
return READ_ONCE(huge_zero_folio);
- if (!get_huge_zero_page())
+ if (!get_huge_zero_folio())
return NULL;
- if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
return READ_ONCE(huge_zero_folio);
}
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
}
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
/* we can free zero page only if last reference remains */
return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
@@ -287,7 +293,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker *huge_zero_page_shrinker;
+static struct shrinker *huge_zero_folio_shrinker;
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -849,33 +855,47 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
static int __init thp_shrinker_init(void)
{
- huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
- return -ENOMEM;
-
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE |
SHRINKER_NONSLAB,
"thp-deferred_split");
- if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_page_shrinker);
+ if (!deferred_split_shrinker)
return -ENOMEM;
- }
-
- huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
- huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
- shrinker_register(huge_zero_page_shrinker);
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
shrinker_register(deferred_split_shrinker);
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
+ /*
+ * Bump the reference of the huge_zero_folio and do not
+ * initialize the shrinker.
+ *
+ * huge_zero_folio will always be NULL on failure. We assume
+ * that get_huge_zero_folio() will most likely not fail as
+ * thp_shrinker_init() is invoked early on during boot.
+ */
+ if (!get_huge_zero_folio())
+ pr_warn("Allocating persistent huge zero folio failed\n");
+ return 0;
+ }
+
+ huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_folio_shrinker) {
+ shrinker_free(deferred_split_shrinker);
+ return -ENOMEM;
+ }
+
+ huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
+ huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
+ shrinker_register(huge_zero_folio_shrinker);
+
return 0;
}
static void __init thp_shrinker_exit(void)
{
- shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(huge_zero_folio_shrinker);
shrinker_free(deferred_split_shrinker);
}
@@ -911,7 +931,7 @@ static int __init hugepage_init(void)
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
+ if (totalram_pages() < MB_TO_PAGES(512)) {
transparent_hugepage_flags = 0;
return 0;
}
@@ -1125,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
off_sub = (off - ret) & (size - 1);
- if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
+ if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub)
return ret + size;
ret += off_sub;
@@ -1309,6 +1329,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
{
pmd_t entry;
entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
+ entry = pmd_mkspecial(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1379,15 +1400,25 @@ struct folio_or_pfn {
bool is_folio;
};
-static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
- bool write, pgtable_t pgtable)
+ bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
pmd_t entry;
- lockdep_assert_held(pmd_lockptr(mm, pmd));
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+ ptl = pmd_lock(mm, pmd);
if (!pmd_none(*pmd)) {
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
fop.pfn;
@@ -1395,23 +1426,26 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
if (write) {
if (pmd_pfn(*pmd) != pfn) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- return -EEXIST;
+ goto out_unlock;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
-
- return -EEXIST;
+ goto out_unlock;
}
if (fop.is_folio) {
entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
- folio_get(fop.folio);
- folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
- add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ if (is_huge_zero_folio(fop.folio)) {
+ entry = pmd_mkspecial(entry);
+ } else {
+ folio_get(fop.folio);
+ folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ }
} else {
entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
entry = pmd_mkspecial(entry);
@@ -1424,11 +1458,17 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
+ pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
- return 0;
+
+out_unlock:
+ spin_unlock(ptl);
+ if (pgtable)
+ pte_free(mm, pgtable);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1450,9 +1490,6 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
struct folio_or_pfn fop = {
.pfn = pfn,
};
- pgtable_t pgtable = NULL;
- spinlock_t *ptl;
- int error;
/*
* If we had pmd_special, we could avoid all these restrictions,
@@ -1464,25 +1501,9 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
- pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(vma->vm_mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -1491,35 +1512,15 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PMD_MASK;
- struct mm_struct *mm = vma->vm_mm;
struct folio_or_pfn fop = {
.folio = folio,
.is_folio = true,
};
- spinlock_t *ptl;
- pgtable_t pgtable = NULL;
- int error;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
return VM_FAULT_SIGBUS;
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(mm, vmf->pmd);
- error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot,
- write, pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
@@ -1531,25 +1532,30 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
return pud;
}
-static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
pud_t entry;
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
fop.pfn;
if (write) {
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
- return;
+ goto out_unlock;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- return;
+ goto out_unlock;
}
if (fop.is_folio) {
@@ -1568,6 +1574,9 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
+out_unlock:
+ spin_unlock(ptl);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1589,7 +1598,6 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
struct folio_or_pfn fop = {
.pfn = pfn,
};
- spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1601,16 +1609,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -1627,25 +1628,15 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PUD_MASK;
- pud_t *pud = vmf->pud;
- struct mm_struct *mm = vma->vm_mm;
struct folio_or_pfn fop = {
.folio = folio,
.is_folio = true,
};
- spinlock_t *ptl;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
return VM_FAULT_SIGBUS;
- ptl = pud_lock(mm, pud);
- insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -1675,7 +1666,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
pmd = pmdp_get_lockless(src_pmd);
- if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
+ if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
+ !is_huge_zero_pmd(pmd))) {
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -3310,8 +3302,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* unreferenced sub-pages of an anonymous THP: we can simply drop
* PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
- new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- new_folio->flags |= (folio->flags &
+ new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags.f |= (folio->flags.f &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_swapcache) |
@@ -3728,7 +3720,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
- struct address_space *swap_cache = NULL;
+ struct swap_cluster_info *ci = NULL;
struct lruvec *lruvec;
int expected_refs;
@@ -3772,8 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
goto fail;
}
- swap_cache = swap_address_space(folio->swap);
- xa_lock(&swap_cache->i_pages);
+ ci = swap_cluster_get_and_lock(folio);
}
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
@@ -3805,10 +3796,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
* Anonymous folio with swap cache.
* NOTE: shmem in swap cache is not supported yet.
*/
- if (swap_cache) {
- __xa_store(&swap_cache->i_pages,
- swap_cache_index(new_folio->swap),
- new_folio, 0);
+ if (ci) {
+ __swap_cache_replace_folio(ci, folio, new_folio);
continue;
}
@@ -3843,8 +3832,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
unlock_page_lruvec(lruvec);
- if (swap_cache)
- xa_unlock(&swap_cache->i_pages);
+ if (ci)
+ swap_cluster_unlock(ci);
} else {
spin_unlock(&ds_queue->split_queue_lock);
ret = -EAGAIN;
@@ -4186,6 +4175,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
bool underused = false;
if (!folio_test_partially_mapped(folio)) {
+ /*
+ * See try_to_map_unused_to_zeropage(): we cannot
+ * optimize zero-filled pages after splitting an
+ * mlocked folio.
+ */
+ if (folio_test_mlocked(folio))
+ goto next;
underused = thp_underused(folio);
if (!underused)
goto next;
@@ -4327,8 +4323,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
goto out;
}
- pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
- pid, vaddr_start, vaddr_end);
+ pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
mmap_read_lock(mm);
/*
@@ -4438,8 +4434,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (IS_ERR(candidate))
goto out;
- pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
- file_path, off_start, off_end);
+ pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ file_path, off_start, off_end, new_order, in_folio_offset);
mapping = candidate->f_mapping;
min_order = mapping_min_folio_order(mapping);
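mm_get_huge_zero_folio() keeps at most one global reference per mm by pairing a refcount get with an atomic test-and-set of a per-mm flag, dropping the extra reference when the flag was already set. A standalone C11 sketch of that pattern with invented names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int zero_refcount = 1;            /* 1 == exists, unreferenced */
static atomic_flag mm_has_ref = ATOMIC_FLAG_INIT;

static void get_ref(void) { atomic_fetch_add(&zero_refcount, 1); }
static void put_ref(void) { atomic_fetch_sub(&zero_refcount, 1); }

static void mm_get_zero(void)
{
        get_ref();
        /* if this mm already held a reference, undo the extra one we took */
        if (atomic_flag_test_and_set(&mm_has_ref))
                put_ref();
}

int main(void)
{
        mm_get_zero();
        mm_get_zero();          /* second call in the same mm adds nothing */
        printf("refcount = %d\n", atomic_load(&zero_refcount));  /* 2 */
        return 0;
}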
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 753f99b4c718..6cac826cb61f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1473,17 +1473,14 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#ifdef CONFIG_CONTIG_ALLOC
-static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
struct folio *folio;
- int order = huge_page_order(h);
bool retried = false;
- if (nid == NUMA_NO_NODE)
- nid = numa_mem_id();
retry:
- folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask);
+ folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask);
if (!folio) {
if (hugetlb_cma_exclusive_alloc())
return NULL;
@@ -1506,16 +1503,16 @@ retry:
}
#else /* !CONFIG_CONTIG_ALLOC */
-static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask)
+static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
+ nodemask_t *nodemask)
{
return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask)
+static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
+ nodemask_t *nodemask)
{
return NULL;
}
@@ -1890,14 +1887,14 @@ void free_huge_folio(struct folio *folio)
/*
* Must be called with the hugetlb lock held
*/
-static void __prep_account_new_huge_page(struct hstate *h, int nid)
+static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
lockdep_assert_held(&hugetlb_lock);
h->nr_huge_pages++;
- h->nr_huge_pages_node[nid]++;
+ h->nr_huge_pages_node[folio_nid(folio)]++;
}
-static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
+static void init_new_hugetlb_folio(struct folio *folio)
{
__folio_set_hugetlb(folio);
INIT_LIST_HEAD(&folio->lru);
@@ -1906,20 +1903,6 @@ static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
set_hugetlb_cgroup_rsvd(folio, NULL);
}
-static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
-{
- init_new_hugetlb_folio(h, folio);
- hugetlb_vmemmap_optimize_folio(h, folio);
-}
-
-static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
-{
- __prep_new_hugetlb_folio(h, folio);
- spin_lock_irq(&hugetlb_lock);
- __prep_account_new_huge_page(h, nid);
- spin_unlock_irq(&hugetlb_lock);
-}
-
/*
* Find and lock address space (mapping) in write mode.
*
@@ -1940,11 +1923,9 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
return NULL;
}
-static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask,
- nodemask_t *node_alloc_noretry)
+static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry)
{
- int order = huge_page_order(h);
struct folio *folio;
bool alloc_try_hard = true;
@@ -1959,8 +1940,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
alloc_try_hard = false;
if (alloc_try_hard)
gfp_mask |= __GFP_RETRY_MAYFAIL;
- if (nid == NUMA_NO_NODE)
- nid = numa_mem_id();
folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
@@ -1994,36 +1973,36 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
nodemask_t *node_alloc_noretry)
{
struct folio *folio;
+ int order = huge_page_order(h);
- if (hstate_is_gigantic(h))
- folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+
+ if (order_is_gigantic(order))
+ folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask);
else
- folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry);
+ folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask,
+ node_alloc_noretry);
if (folio)
- init_new_hugetlb_folio(h, folio);
+ init_new_hugetlb_folio(folio);
return folio;
}
/*
- * Common helper to allocate a fresh hugetlb page. All specific allocators
- * should use this function to get new hugetlb pages
+ * Common helper to allocate a fresh hugetlb folio. All specific allocators
+ * should use this function to get new hugetlb folios
*
- * Note that returned page is 'frozen': ref count of head page and all tail
- * pages is zero.
+ * Note that returned folio is 'frozen': ref count of head page and all tail
+ * pages is zero, and the accounting must be done in the caller.
*/
static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
struct folio *folio;
- if (hstate_is_gigantic(h))
- folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
- else
- folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
- if (!folio)
- return NULL;
-
- prep_new_hugetlb_folio(h, folio, folio_nid(folio));
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (folio)
+ hugetlb_vmemmap_optimize_folio(h, folio);
return folio;
}
@@ -2039,7 +2018,7 @@ static void prep_and_add_allocated_folios(struct hstate *h,
/* Add all new pool pages to free lists in one lock cycle */
spin_lock_irqsave(&hugetlb_lock, flags);
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
- __prep_account_new_huge_page(h, folio_nid(folio));
+ account_new_hugetlb_folio(h, folio);
enqueue_hugetlb_folio(h, folio);
}
spin_unlock_irqrestore(&hugetlb_lock, flags);
@@ -2241,19 +2220,17 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
- folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
if (!folio)
return NULL;
- hugetlb_vmemmap_optimize_folio(h, folio);
-
spin_lock_irq(&hugetlb_lock);
/*
* nr_huge_pages needs to be adjusted within the same lock cycle
* as surplus_pages, otherwise it might confuse
* persistent_huge_pages() momentarily.
*/
- __prep_account_new_huge_page(h, folio_nid(folio));
+ account_new_hugetlb_folio(h, folio);
/*
* We could have raced with the pool size change.
@@ -2290,6 +2267,10 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas
if (!folio)
return NULL;
+ spin_lock_irq(&hugetlb_lock);
+ account_new_hugetlb_folio(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+
/* fresh huge pages are frozen */
folio_ref_unfreeze(folio, 1);
/*
@@ -2836,18 +2817,17 @@ retry:
if (!new_folio) {
spin_unlock_irq(&hugetlb_lock);
gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
- NULL, NULL);
+ new_folio = alloc_fresh_hugetlb_folio(h, gfp_mask,
+ nid, NULL);
if (!new_folio)
return -ENOMEM;
- __prep_new_hugetlb_folio(h, new_folio);
goto retry;
}
/*
* Ok, old_folio is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
- * incremented again when calling __prep_account_new_huge_page()
+ * incremented again when calling account_new_hugetlb_folio()
* and enqueue_hugetlb_folio() for new_folio. The counters will
* remain stable since this happens under the lock.
*/
@@ -2857,7 +2837,7 @@ retry:
* Ref count on new_folio is already zero as it was dropped
* earlier. It can be directly added to the pool free list.
*/
- __prep_account_new_huge_page(h, nid);
+ account_new_hugetlb_folio(h, new_folio);
enqueue_hugetlb_folio(h, new_folio);
/*
@@ -2890,7 +2870,7 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
* alloc_contig_range and them. Return -ENOMEM as this has the effect
* of bailing out right away without further retrying.
*/
- if (folio_order(folio) > MAX_PAGE_ORDER)
+ if (order_is_gigantic(folio_order(folio)))
return -ENOMEM;
if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
@@ -3237,17 +3217,18 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
{
enum zone_type zone = zone_idx(folio_zone(folio));
int nid = folio_nid(folio);
+ struct page *page = folio_page(folio, start_page_number);
unsigned long head_pfn = folio_pfn(folio);
unsigned long pfn, end_pfn = head_pfn + end_page_number;
- int ret;
-
- for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
- struct page *page = pfn_to_page(pfn);
+ /*
+ * As we marked all tail pages with memblock_reserved_mark_noinit(),
+ * we must initialize them ourselves here.
+ */
+ for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) {
__init_single_page(page, pfn, zone, nid);
prep_compound_tail((struct page *)folio, pfn - head_pfn);
- ret = page_ref_freeze(page, 1);
- VM_BUG_ON(!ret);
+ set_page_count(page, 0);
}
}
@@ -3257,12 +3238,15 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
{
int ret;
- /* Prepare folio head */
+ /*
+ * This is an open-coded prep_compound_page() whereby we avoid
+ * walking pages twice by initializing/preparing+freezing them in the
+ * same go.
+ */
__folio_clear_reserved(folio);
__folio_set_head(folio);
ret = folio_ref_freeze(folio, 1);
VM_BUG_ON(!ret);
- /* Initialize the necessary tail struct pages */
hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
prep_compound_head((struct page *)folio, huge_page_order(h));
}
@@ -3327,7 +3311,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
hugetlb_bootmem_init_migratetype(folio, h);
/* Subdivide locks to achieve better parallel performance */
spin_lock_irqsave(&hugetlb_lock, flags);
- __prep_account_new_huge_page(h, folio_nid(folio));
+ account_new_hugetlb_folio(h, folio);
enqueue_hugetlb_folio(h, folio);
spin_unlock_irqrestore(&hugetlb_lock, flags);
}
@@ -3423,7 +3407,7 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
hugetlb_folio_init_vmemmap(folio, h,
HUGETLB_VMEMMAP_RESERVE_PAGES);
- init_new_hugetlb_folio(h, folio);
+ init_new_hugetlb_folio(folio);
if (hugetlb_bootmem_page_prehvo(m))
/*
@@ -3554,7 +3538,14 @@ static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned l
nodes_clear(node_alloc_noretry);
for (i = 0; i < num; ++i) {
- struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+ struct folio *folio;
+
+ if (hugetlb_vmemmap_optimizable_size(h) &&
+ (si_mem_available() == 0) && !list_empty(&folio_list)) {
+ prep_and_add_allocated_folios(h, &folio_list);
+ INIT_LIST_HEAD(&folio_list);
+ }
+ folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
&node_alloc_noretry, &next_node);
if (!folio)
break;
@@ -3589,10 +3580,9 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
unsigned long jiffies_start;
unsigned long jiffies_end;
+ unsigned long remaining;
job.thread_fn = hugetlb_pages_alloc_boot_node;
- job.start = 0;
- job.size = h->max_huge_pages;
/*
* job.max_threads is 25% of the available cpu threads by default.
@@ -3616,10 +3606,29 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
}
job.max_threads = hugepage_allocation_threads;
- job.min_chunk = h->max_huge_pages / hugepage_allocation_threads;
jiffies_start = jiffies;
- padata_do_multithreaded(&job);
+ do {
+ remaining = h->max_huge_pages - h->nr_huge_pages;
+
+ job.start = h->nr_huge_pages;
+ job.size = remaining;
+ job.min_chunk = remaining / hugepage_allocation_threads;
+ padata_do_multithreaded(&job);
+
+ if (h->nr_huge_pages == h->max_huge_pages)
+ break;
+
+ /*
+ * Retry only if the vmemmap optimization might have been able to free
+ * some memory back to the system.
+ */
+ if (!hugetlb_vmemmap_optimizable(h))
+ break;
+
+ /* Continue if progress was made in last iteration */
+ } while (remaining != (h->max_huge_pages - h->nr_huge_pages));
+
jiffies_end = jiffies;
pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
@@ -3654,6 +3663,9 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
return;
}
+ if (!h->max_huge_pages)
+ return;
+
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
return;
@@ -4035,7 +4047,7 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
prep_compound_page(page, dst->order);
new_folio->mapping = NULL;
- init_new_hugetlb_folio(dst, new_folio);
+ init_new_hugetlb_folio(new_folio);
/* Copy the CMA flag so that it is freed correctly */
if (cma)
folio_set_hugetlb_cma(new_folio);
@@ -4654,6 +4666,7 @@ static int __init hugetlb_init(void)
BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
__NR_HPAGEFLAGS);
+ BUILD_BUG_ON_INVALID(HUGETLB_PAGE_ORDER > MAX_FOLIO_ORDER);
if (!hugepages_supported()) {
if (hugetlb_max_hstate || default_hstate_max_huge_pages)
@@ -4737,6 +4750,7 @@ void __init hugetlb_add_hstate(unsigned int order)
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
+ WARN_ON(order > MAX_FOLIO_ORDER);
h = &hstates[hugetlb_max_hstate++];
__mutex_init(&h->resize_lock, "resize mutex", &h->resize_key);
h->order = order;
@@ -5594,18 +5608,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}
- /*
- * If the pagetables are shared don't copy or take references.
- *
- * dst_pte == src_pte is the common case of src/dest sharing.
- * However, src could have 'unshared' and dst shares with
- * another vma. So page_count of ptep page is checked instead
- * to reliably determine whether pte is shared.
- */
- if (page_count(virt_to_page(dst_pte)) > 1) {
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+ /* If the pagetables are shared, there is nothing to do */
+ if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) {
addr |= last_addr_mask;
continue;
}
+#endif
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -5851,7 +5860,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
spinlock_t *ptl;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- bool adjust_reservation = false;
+ bool adjust_reservation;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5944,6 +5953,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
hugetlb_remove_rmap(folio);
+ spin_unlock(ptl);
/*
* Restore the reservation for anonymous page, otherwise the
@@ -5951,14 +5961,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
	 * If we are freeing a surplus, do not set the restore
* reservation bit.
*/
+ adjust_reservation = false;
+
+ spin_lock_irq(&hugetlb_lock);
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
folio_test_anon(folio)) {
folio_set_hugetlb_restore_reserve(folio);
/* Reservation to be adjusted after the spin lock */
adjust_reservation = true;
}
-
- spin_unlock(ptl);
+ spin_unlock_irq(&hugetlb_lock);
/*
* Adjust the reservation for the region that will have the
@@ -6929,6 +6941,11 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
+ pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE);
+ if (actual_pte) {
+ ret = -EEXIST;
+ goto out;
+ }
ret = -ENOMEM;
goto out;
}
@@ -7599,7 +7616,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
hugetlb_vma_assert_locked(vma);
if (sz != PMD_SIZE)
return 0;
- if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
+ if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
index f58ef4969e7a..e8e4dc7182d5 100644
--- a/mm/hugetlb_cma.c
+++ b/mm/hugetlb_cma.c
@@ -26,11 +26,10 @@ void hugetlb_cma_free_folio(struct folio *folio)
}
-struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
int node;
- int order = huge_page_order(h);
struct folio *folio = NULL;
if (hugetlb_cma[nid])
diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h
index f7d7fb9880a2..2c2ec8a7e134 100644
--- a/mm/hugetlb_cma.h
+++ b/mm/hugetlb_cma.h
@@ -4,7 +4,7 @@
#ifdef CONFIG_CMA
void hugetlb_cma_free_folio(struct folio *folio);
-struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nodemask);
struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
bool node_exact);
@@ -18,8 +18,8 @@ static inline void hugetlb_cma_free_folio(struct folio *folio)
{
}
-static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nodemask)
+static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
{
return NULL;
}
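With the new prototype above, the hstate is no longer passed down and a caller is expected to derive the order itself. A minimal hypothetical call site (the demo_* wrapper is not from this patch; huge_page_order() is the existing helper the old code used internally):

static struct folio *demo_alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
					       int nid, nodemask_t *nodemask)
{
	/* The order is now computed at the call site instead of inside the CMA helper. */
	return hugetlb_cma_alloc_folio(huge_page_order(h), gfp_mask, nid, nodemask);
}

Taking a plain order presumably also makes the helper usable for sizes that do not have a registered hstate.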
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 7ecaa1900137..a11222572f97 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -7,8 +7,96 @@
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
+#include <linux/page-flags.h>
+#include <linux/memcontrol.h>
#include "internal.h"
+static u32 hwpoison_filter_enable;
+static u32 hwpoison_filter_dev_major = ~0U;
+static u32 hwpoison_filter_dev_minor = ~0U;
+static u64 hwpoison_filter_flags_mask;
+static u64 hwpoison_filter_flags_value;
+
+static int hwpoison_filter_dev(struct page *p)
+{
+ struct folio *folio = page_folio(p);
+ struct address_space *mapping;
+ dev_t dev;
+
+ if (hwpoison_filter_dev_major == ~0U &&
+ hwpoison_filter_dev_minor == ~0U)
+ return 0;
+
+ mapping = folio_mapping(folio);
+ if (mapping == NULL || mapping->host == NULL)
+ return -EINVAL;
+
+ dev = mapping->host->i_sb->s_dev;
+ if (hwpoison_filter_dev_major != ~0U &&
+ hwpoison_filter_dev_major != MAJOR(dev))
+ return -EINVAL;
+ if (hwpoison_filter_dev_minor != ~0U &&
+ hwpoison_filter_dev_minor != MINOR(dev))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int hwpoison_filter_flags(struct page *p)
+{
+ if (!hwpoison_filter_flags_mask)
+ return 0;
+
+ if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
+ hwpoison_filter_flags_value)
+ return 0;
+ else
+ return -EINVAL;
+}
+
+/*
+ * This allows stress tests to limit test scope to a collection of tasks
+ * by putting them under some memcg. This prevents killing unrelated/important
+ * processes such as /sbin/init. Note that the target task may share clean
+ * pages with init (e.g. libc text), which is harmless. If the target task
+ * shares _dirty_ pages with another task B, the test scheme must make sure B
+ * is also included in the memcg. Lastly, due to race conditions this filter
+ * can only guarantee that the page either belongs to the memcg tasks, or is
+ * a freed page.
+ */
+#ifdef CONFIG_MEMCG
+static u64 hwpoison_filter_memcg;
+static int hwpoison_filter_task(struct page *p)
+{
+ if (!hwpoison_filter_memcg)
+ return 0;
+
+ if (page_cgroup_ino(p) != hwpoison_filter_memcg)
+ return -EINVAL;
+
+ return 0;
+}
+#else
+static int hwpoison_filter_task(struct page *p) { return 0; }
+#endif
+
+static int hwpoison_filter(struct page *p)
+{
+ if (!hwpoison_filter_enable)
+ return 0;
+
+ if (hwpoison_filter_dev(p))
+ return -EINVAL;
+
+ if (hwpoison_filter_flags(p))
+ return -EINVAL;
+
+ if (hwpoison_filter_task(p))
+ return -EINVAL;
+
+ return 0;
+}
+
static struct dentry *hwpoison_dir;
static int hwpoison_inject(void *data, u64 val)
@@ -67,6 +155,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
static void __exit pfn_inject_exit(void)
{
hwpoison_filter_enable = 0;
+ hwpoison_filter_unregister();
debugfs_remove_recursive(hwpoison_dir);
}
@@ -105,6 +194,8 @@ static int __init pfn_inject_init(void)
&hwpoison_filter_memcg);
#endif
+ hwpoison_filter_register(hwpoison_filter);
+
return 0;
}
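hwpoison_filter() and its tunables now live entirely in this injector module and are wired into memory-failure handling via the registration hooks declared in mm/internal.h further down. A minimal sketch of another mm-internal client of that interface; the PFN-window policy and the demo_* names are purely illustrative, and the return convention (0 = handle the event, -EINVAL = ignore it) mirrors hwpoison_filter() above:

#include <linux/mm.h>
#include "internal.h"	/* hwpoison_filter_register()/unregister() */

static unsigned long demo_pfn_start, demo_pfn_end = ~0UL;

/* Return 0 to let the poison event be handled, -EINVAL to have it ignored. */
static int demo_hwpoison_filter(struct page *p)
{
	unsigned long pfn = page_to_pfn(p);

	return (pfn >= demo_pfn_start && pfn < demo_pfn_end) ? 0 : -EINVAL;
}

static int __init demo_filter_init(void)
{
	hwpoison_filter_register(demo_hwpoison_filter);
	return 0;
}

static void __exit demo_filter_exit(void)
{
	/* Unregister before the filter function can go away. */
	hwpoison_filter_unregister();
}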
diff --git a/mm/internal.h b/mm/internal.h
index 45b725c3dc03..1561fc2ff5b8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -755,6 +755,7 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
{
if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
return;
+ VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER);
folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
#ifdef NR_PAGES_IN_LARGE_FOLIO
@@ -842,6 +843,10 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord
#define alloc_frozen_pages(...) \
alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))
+struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
+#define alloc_frozen_pages_nolock(...) \
+ alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
+
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
@@ -961,8 +966,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool write, int *locked);
-extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
- unsigned long bytes);
+bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
+ unsigned long bytes);
/*
* NOTE: This function can't tell whether the folio is "fully mapped" in the
@@ -1227,14 +1232,10 @@ static inline bool node_reclaim_enabled(void)
#ifdef CONFIG_MEMORY_FAILURE
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
-extern int hwpoison_filter(struct page *p);
-
-extern u32 hwpoison_filter_dev_major;
-extern u32 hwpoison_filter_dev_minor;
-extern u64 hwpoison_filter_flags_mask;
-extern u64 hwpoison_filter_flags_value;
-extern u64 hwpoison_filter_memcg;
-extern u32 hwpoison_filter_enable;
+typedef int hwpoison_filter_func_t(struct page *p);
+void hwpoison_filter_register(hwpoison_filter_func_t *filter);
+void hwpoison_filter_unregister(void);
+
#define MAGIC_HWPOISON 0x48575053U /* HWPS */
void SetPageHWPoisonTakenOff(struct page *page);
void ClearPageHWPoisonTakenOff(struct page *page);
@@ -1333,11 +1334,6 @@ extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
-static inline bool is_migrate_highatomic(enum migratetype migratetype)
-{
- return migratetype == MIGRATE_HIGHATOMIC;
-}
-
void setup_zone_pageset(struct zone *zone);
struct migration_target_control {
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 9142964ab9c9..d4c14359feaf 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -32,6 +32,15 @@
#include "kasan.h"
#include "../slab.h"
+#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS)
+/*
+ * Definition of the unified static key declared in kasan-enabled.h.
+ * This provides consistent runtime enable/disable across KASAN modes.
+ */
+DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
+EXPORT_SYMBOL_GPL(kasan_flag_enabled);
+#endif
+
struct slab *kasan_addr_to_slab(const void *addr)
{
if (virt_addr_valid(addr))
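The unified key defined above replaces the per-mode definition previously in hw_tags.c (removed further down) and the kasan_arch_is_ready() checks throughout this diff. The accessors themselves live in include/linux/kasan-enabled.h, which is not part of this section; the following is only a plausible sketch of what the rest of the diff assumes they look like:

#include <linux/jump_label.h>

#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS)
DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled);

static __always_inline bool kasan_enabled(void)
{
	return static_branch_likely(&kasan_flag_enabled);
}

static inline void kasan_enable(void)
{
	static_branch_enable(&kasan_flag_enabled);
}
#else
/* When initialization cannot be deferred, KASAN is on whenever it is built in. */
static inline bool kasan_enabled(void)
{
	return IS_ENABLED(CONFIG_KASAN);
}

static inline void kasan_enable(void) { }
#endif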
@@ -246,15 +255,15 @@ static inline void poison_slab_object(struct kmem_cache *cache, void *object,
bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object,
unsigned long ip)
{
- if (!kasan_arch_is_ready() || is_kfence_address(object))
+ if (is_kfence_address(object))
return false;
return check_slab_allocation(cache, object, ip);
}
bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
- bool still_accessible)
+ bool still_accessible, bool no_quarantine)
{
- if (!kasan_arch_is_ready() || is_kfence_address(object))
+ if (is_kfence_address(object))
return false;
/*
@@ -274,6 +283,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
poison_slab_object(cache, object, init);
+ if (no_quarantine)
+ return false;
+
/*
* If the object is put into quarantine, do not let slab put the object
* onto the freelist for now. The object's metadata is kept until the
@@ -293,7 +305,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
static inline bool check_page_allocation(void *ptr, unsigned long ip)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return false;
if (ptr != page_address(virt_to_head_page(ptr))) {
@@ -522,7 +534,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
return true;
}
- if (is_kfence_address(ptr) || !kasan_arch_is_ready())
+ if (is_kfence_address(ptr))
return true;
slab = folio_slab(folio);
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index d54e89f8c3e7..b413c46b3e04 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -37,6 +37,17 @@
#include "../slab.h"
/*
+ * Initialize Generic KASAN and enable runtime checks.
+ * This should be called from arch kasan_init() once shadow memory is ready.
+ */
+void __init kasan_init_generic(void)
+{
+ kasan_enable();
+
+ pr_info("KernelAddressSanitizer initialized (generic)\n");
+}
+
+/*
 * All functions below are always inlined so the compiler can
 * perform better optimizations in each of __asan_loadX/__asan_storeX
* depending on memory access size X.
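Per the new comment, kasan_init_generic() is intended to be the last step of an architecture's KASAN bring-up. A rough arch-side sketch, where demo_map_kasan_shadow() stands in for whatever arch-specific work maps the shadow region:

void __init kasan_init(void)
{
	/* Arch-specific: allocate and map the real shadow memory (placeholder name). */
	demo_map_kasan_shadow();

	/* Shadow is usable now: enable runtime checks and print the banner. */
	kasan_init_generic();
}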
@@ -165,7 +176,7 @@ static __always_inline bool check_region_inline(const void *addr,
size_t size, bool write,
unsigned long ret_ip)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return true;
if (unlikely(size == 0))
@@ -193,7 +204,7 @@ bool kasan_byte_accessible(const void *addr)
{
s8 shadow_byte;
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return true;
shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
@@ -495,7 +506,7 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta)
static void release_free_meta(const void *object, struct kasan_free_meta *meta)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
/* Check if free meta is valid. */
@@ -562,7 +573,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
kasan_save_track(&alloc_meta->alloc_track, flags);
}
-void kasan_save_free_info(struct kmem_cache *cache, void *object)
+void __kasan_save_free_info(struct kmem_cache *cache, void *object)
{
struct kasan_free_meta *free_meta;
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9a6927394b54..1c373cc4b3fa 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -46,13 +46,6 @@ static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
/*
- * Whether KASAN is enabled at all.
- * The value remains false until KASAN is initialized by kasan_init_hw_tags().
- */
-DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
-EXPORT_SYMBOL(kasan_flag_enabled);
-
-/*
* Whether the selected mode is synchronous, asynchronous, or asymmetric.
* Defaults to KASAN_MODE_SYNC.
*/
@@ -67,6 +60,9 @@ DEFINE_STATIC_KEY_FALSE(kasan_flag_vmalloc);
#endif
EXPORT_SYMBOL_GPL(kasan_flag_vmalloc);
+/* Whether to check write accesses only. */
+static bool kasan_flag_write_only = false;
+
#define PAGE_ALLOC_SAMPLE_DEFAULT 1
#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3
@@ -141,6 +137,23 @@ static int __init early_kasan_flag_vmalloc(char *arg)
}
early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
+/* kasan.write_only=off/on */
+static int __init early_kasan_flag_write_only(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_flag_write_only = false;
+ else if (!strcmp(arg, "on"))
+ kasan_flag_write_only = true;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.write_only", early_kasan_flag_write_only);
+
static inline const char *kasan_mode_info(void)
{
if (kasan_mode == KASAN_MODE_ASYNC)
@@ -260,12 +273,13 @@ void __init kasan_init_hw_tags(void)
kasan_init_tags();
/* KASAN is now initialized, enable it. */
- static_branch_enable(&kasan_flag_enabled);
+ kasan_enable();
- pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
+ pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s, write_only=%s)\n",
kasan_mode_info(),
str_on_off(kasan_vmalloc_enabled()),
- str_on_off(kasan_stack_collection_enabled()));
+ str_on_off(kasan_stack_collection_enabled()),
+ str_on_off(kasan_flag_write_only));
}
#ifdef CONFIG_KASAN_VMALLOC
@@ -392,6 +406,20 @@ void kasan_enable_hw_tags(void)
hw_enable_tag_checks_asymm();
else
hw_enable_tag_checks_sync();
+
+ /*
+ * CPUs can only be in one of two states:
+ * - All CPUs support the write_only feature
+ * - No CPUs support the write_only feature
+ *
+ * If the first CPU attempts hw_enable_tag_checks_write_only() and
+ * finds the feature unsupported, kasan_flag_write_only is set to OFF
+ * to avoid further unnecessary calls on other CPUs.
+ */
+ if (kasan_flag_write_only && hw_enable_tag_checks_write_only()) {
+ kasan_flag_write_only = false;
+ pr_err_once("write-only mode is not supported and thus not enabled\n");
+ }
}
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
@@ -404,4 +432,10 @@ VISIBLE_IF_KUNIT void kasan_force_async_fault(void)
}
EXPORT_SYMBOL_IF_KUNIT(kasan_force_async_fault);
+VISIBLE_IF_KUNIT bool kasan_write_only_enabled(void)
+{
+ return kasan_flag_write_only;
+}
+EXPORT_SYMBOL_IF_KUNIT(kasan_write_only_enabled);
+
#endif
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ced6b29fcf76..f084e7a5df1e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -13,9 +13,9 @@
#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/slab.h>
+#include <linux/pgalloc.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include "kasan.h"
@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
pud_t *pud;
pmd_t *pmd;
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
} else {
p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
}
zero_pud_populate(p4d, addr, next);
@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
 * puds, pmds, so pgd_populate(), pud_populate()
 * are no-ops.
*/
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
lm_alias(kasan_early_shadow_p4d));
p4d = p4d_offset(pgd, addr);
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -266,14 +266,12 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
}
if (pgd_none(*pgd)) {
- p4d_t *p;
if (slab_is_available()) {
- p = p4d_alloc(&init_mm, pgd, addr);
- if (!p)
+ if (!p4d_alloc(&init_mm, pgd, addr))
return -ENOMEM;
} else {
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
early_alloc(PAGE_SIZE, NUMA_NO_NODE));
}
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 129178be5e64..07fa7375a848 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -398,7 +398,13 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags);
void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack);
void kasan_save_track(struct kasan_track *track, gfp_t flags);
void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags);
-void kasan_save_free_info(struct kmem_cache *cache, void *object);
+
+void __kasan_save_free_info(struct kmem_cache *cache, void *object);
+static inline void kasan_save_free_info(struct kmem_cache *cache, void *object)
+{
+ if (kasan_enabled())
+ __kasan_save_free_info(cache, object);
+}
#ifdef CONFIG_KASAN_GENERIC
bool kasan_quarantine_put(struct kmem_cache *cache, void *object);
@@ -431,6 +437,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_suppress_tag_checks_start() arch_suppress_tag_checks_start()
#define hw_suppress_tag_checks_stop() arch_suppress_tag_checks_stop()
#define hw_force_async_tag_fault() arch_force_async_tag_fault()
+#define hw_enable_tag_checks_write_only() arch_enable_tag_checks_write_only()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
#define hw_set_mem_tag_range(addr, size, tag, init) \
@@ -451,11 +458,17 @@ void __init kasan_init_tags(void);
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_force_async_fault(void);
+bool kasan_write_only_enabled(void);
#else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
static inline void kasan_force_async_fault(void) { }
+static inline bool kasan_write_only_enabled(void)
+{
+ return false;
+}
+
#endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
#ifdef CONFIG_KASAN_SW_TAGS
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index e0968acc03aa..2cafca31b092 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -94,11 +94,14 @@ static void kasan_test_exit(struct kunit *test)
}
/**
- * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a
- * KASAN report; causes a KUnit test failure otherwise.
+ * KUNIT_EXPECT_KASAN_RESULT - checks whether the executed expression
+ * produces a KASAN report; causes a KUnit test failure when the result
+ * is different from @fail.
*
* @test: Currently executing KUnit test.
- * @expression: Expression that must produce a KASAN report.
+ * @expr: Expression to be tested.
+ * @expr_str: Expression to be tested encoded as a string.
+ * @fail: Whether expression should produce a KASAN report.
*
* For hardware tag-based KASAN, when a synchronous tag fault happens, tag
* checking is auto-disabled. When this happens, this test handler reenables
@@ -110,25 +113,29 @@ static void kasan_test_exit(struct kunit *test)
* Use READ/WRITE_ONCE() for the accesses and compiler barriers around the
* expression to prevent that.
*
- * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept
+ * In between KUNIT_EXPECT_KASAN_RESULT checks, test_status.report_found is kept
* as false. This allows detecting KASAN reports that happen outside of the
* checks by asserting !test_status.report_found at the start of
- * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit.
+ * KUNIT_EXPECT_KASAN_RESULT and in kasan_test_exit.
*/
-#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \
+#define KUNIT_EXPECT_KASAN_RESULT(test, expr, expr_str, fail) \
+do { \
if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
kasan_sync_fault_possible()) \
migrate_disable(); \
KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \
barrier(); \
- expression; \
+ expr; \
barrier(); \
if (kasan_async_fault_possible()) \
kasan_force_async_fault(); \
- if (!READ_ONCE(test_status.report_found)) { \
- KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \
- "expected in \"" #expression \
- "\", but none occurred"); \
+ if (READ_ONCE(test_status.report_found) != fail) { \
+ KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure" \
+ "%sexpected in \"" expr_str \
+ "\", but %soccurred", \
+ (fail ? " " : " not "), \
+ (test_status.report_found ? \
+ "" : "none ")); \
} \
if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
kasan_sync_fault_possible()) { \
@@ -141,6 +148,34 @@ static void kasan_test_exit(struct kunit *test)
WRITE_ONCE(test_status.async_fault, false); \
} while (0)
+/*
+ * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a
+ * KASAN report; causes a KUnit test failure otherwise.
+ *
+ * @test: Currently executing KUnit test.
+ * @expr: Expression that must produce a KASAN report.
+ */
+#define KUNIT_EXPECT_KASAN_FAIL(test, expr) \
+ KUNIT_EXPECT_KASAN_RESULT(test, expr, #expr, true)
+
+/*
+ * KUNIT_EXPECT_KASAN_FAIL_READ - check that the executed expression
+ * produces a KASAN report when the write-only mode is not enabled;
+ * causes a KUnit test failure otherwise.
+ *
+ * Note: At the moment, this macro does not check whether the produced
+ * KASAN report is a report about a bad read access. It is only intended
+ * for checking the write-only KASAN mode functionality without failing
+ * KASAN tests.
+ *
+ * @test: Currently executing KUnit test.
+ * @expr: Expression that must only produce a KASAN report
+ * when the write-only mode is not enabled.
+ */
+#define KUNIT_EXPECT_KASAN_FAIL_READ(test, expr) \
+ KUNIT_EXPECT_KASAN_RESULT(test, expr, #expr, \
+ !kasan_write_only_enabled()) \
+
#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
if (!IS_ENABLED(config)) \
kunit_skip((test), "Test requires " #config "=y"); \
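To make the intended split concrete before the bulk of the conversions below: writes must be reported in every mode, reads only when write-only mode is off. A hypothetical test (not one added by this patch) would pair the two macros like this:

static void demo_oob_read_vs_write(struct kunit *test)
{
	size_t size = 128;
	char *ptr;

	ptr = kmalloc(size, GFP_KERNEL);
	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
	OPTIMIZER_HIDE_VAR(ptr);

	/* An out-of-bounds write must produce a report in every mode. */
	KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x');

	/* An out-of-bounds read is reported only when kasan.write_only is off. */
	KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size]);

	kfree(ptr);
}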
@@ -183,8 +218,8 @@ static void kmalloc_oob_right(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y');
/* Out-of-bounds access past the aligned kmalloc object. */
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] =
- ptr[size + KASAN_GRANULE_SIZE + 5]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] =
+ ptr[size + KASAN_GRANULE_SIZE + 5]);
kfree(ptr);
}
@@ -198,7 +233,7 @@ static void kmalloc_oob_left(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
OPTIMIZER_HIDE_VAR(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, *ptr = *(ptr - 1));
kfree(ptr);
}
@@ -211,7 +246,7 @@ static void kmalloc_node_oob_right(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
OPTIMIZER_HIDE_VAR(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = ptr[size]);
kfree(ptr);
}
@@ -291,7 +326,7 @@ static void kmalloc_large_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]);
}
static void kmalloc_large_invalid_free(struct kunit *test)
@@ -323,7 +358,7 @@ static void page_alloc_oob_right(struct kunit *test)
ptr = page_address(pages);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = ptr[size]);
free_pages((unsigned long)ptr, order);
}
@@ -338,7 +373,7 @@ static void page_alloc_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
free_pages((unsigned long)ptr, order);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]);
}
static void krealloc_more_oob_helper(struct kunit *test,
@@ -458,7 +493,7 @@ static void krealloc_uaf(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL));
KUNIT_ASSERT_NULL(test, ptr2);
- KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, *(volatile char *)ptr1);
}
static void kmalloc_oob_16(struct kunit *test)
@@ -501,7 +536,7 @@ static void kmalloc_uaf_16(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
kfree(ptr2);
- KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, *ptr1 = *ptr2);
kfree(ptr1);
}
@@ -640,8 +675,8 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
memset((char *)ptr, 0, 64);
OPTIMIZER_HIDE_VAR(ptr);
OPTIMIZER_HIDE_VAR(invalid_size);
- KUNIT_EXPECT_KASAN_FAIL(test,
- memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
kfree(ptr);
}
@@ -654,7 +689,7 @@ static void kmalloc_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[8]);
}
static void kmalloc_uaf_memset(struct kunit *test)
@@ -701,7 +736,7 @@ again:
goto again;
}
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr1)[40]);
KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
kfree(ptr2);
@@ -727,19 +762,19 @@ static void kmalloc_uaf3(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
kfree(ptr2);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr1)[8]);
}
static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
{
int *i_unsafe = unsafe;
- KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, READ_ONCE(*i_unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, smp_load_acquire(i_unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, atomic_read(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe));
@@ -752,18 +787,31 @@ static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42));
+ /*
+ * The result of the test below may vary due to garbage values of
+ * unsafe in write-only mode.
+ * Therefore, skip this test when KASAN is configured in write-only mode.
+ */
+ if (!kasan_write_only_enabled())
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe));
+ /*
+	 * The results of the tests below may vary due to garbage values of
+	 * unsafe in write-only mode.
+	 * Therefore, skip these tests when KASAN is configured in write-only mode.
+ */
+ if (!kasan_write_only_enabled()) {
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe));
+ }
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, atomic_long_read(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe));
@@ -776,16 +824,29 @@ static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42));
+ /*
+ * The result of the test below may vary due to garbage values of
+ * unsafe in write-only mode.
+ * Therefore, skip this test when KASAN is configured in write-only mode.
+ */
+ if (!kasan_write_only_enabled())
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe));
+ /*
+	 * The results of the tests below may vary due to garbage values of
+	 * unsafe in write-only mode.
+	 * Therefore, skip these tests when KASAN is configured in write-only mode.
+ */
+ if (!kasan_write_only_enabled()) {
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe));
+ }
}
static void kasan_atomics(struct kunit *test)
@@ -842,8 +903,8 @@ static void ksize_unpoisons_memory(struct kunit *test)
/* These must trigger a KASAN report. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size + 5]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[real_size - 1]);
kfree(ptr);
}
@@ -863,8 +924,8 @@ static void ksize_uaf(struct kunit *test)
OPTIMIZER_HIDE_VAR(ptr);
KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size]);
}
/*
@@ -899,9 +960,9 @@ static void rcu_uaf(struct kunit *test)
global_rcu_ptr = rcu_dereference_protected(
(struct kasan_rcu_info __rcu *)ptr, NULL);
- KUNIT_EXPECT_KASAN_FAIL(test,
- call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
- rcu_barrier());
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
+ call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
+ rcu_barrier());
}
static void workqueue_uaf_work(struct work_struct *work)
@@ -924,8 +985,8 @@ static void workqueue_uaf(struct kunit *test)
queue_work(workqueue, work);
destroy_workqueue(workqueue);
- KUNIT_EXPECT_KASAN_FAIL(test,
- ((volatile struct work_struct *)work)->data);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
+ ((volatile struct work_struct *)work)->data);
}
static void kfree_via_page(struct kunit *test)
@@ -972,7 +1033,7 @@ static void kmem_cache_oob(struct kunit *test)
return;
}
- KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, *p = p[size + OOB_TAG_OFF]);
kmem_cache_free(cache, p);
kmem_cache_destroy(cache);
@@ -1068,11 +1129,50 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
*/
rcu_barrier();
- KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, READ_ONCE(*p));
kmem_cache_destroy(cache);
}
+/*
+ * Check that SLAB_TYPESAFE_BY_RCU objects are immediately reused when
+ * CONFIG_SLUB_RCU_DEBUG is off, and stay at the same address.
+ * Without this, KASAN builds would be unable to trigger bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users handling recycled objects improperly.
+ */
+static void kmem_cache_rcu_reuse(struct kunit *test)
+{
+ char *p, *p2;
+ struct kmem_cache *cache;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_SLUB_RCU_DEBUG);
+
+ cache = kmem_cache_create("test_cache", 16, 0, SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ migrate_disable();
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ goto out;
+ }
+
+ kmem_cache_free(cache, p);
+ p2 = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p2) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ goto out;
+ }
+ KUNIT_EXPECT_PTR_EQ(test, p, p2);
+
+ kmem_cache_free(cache, p2);
+
+out:
+ migrate_enable();
+ kmem_cache_destroy(cache);
+}
+
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
@@ -1207,7 +1307,7 @@ static void mempool_oob_right_helper(struct kunit *test, mempool_t *pool, size_t
KUNIT_EXPECT_KASAN_FAIL(test,
((volatile char *)&elem[size])[0]);
else
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
((volatile char *)&elem[round_up(size, KASAN_GRANULE_SIZE)])[0]);
mempool_free(elem, pool);
@@ -1273,7 +1373,7 @@ static void mempool_uaf_helper(struct kunit *test, mempool_t *pool, bool page)
mempool_free(elem, pool);
ptr = page ? page_address((struct page *)elem) : elem;
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]);
}
static void mempool_kmalloc_uaf(struct kunit *test)
@@ -1532,7 +1632,7 @@ static void kasan_memchr(struct kunit *test)
OPTIMIZER_HIDE_VAR(ptr);
OPTIMIZER_HIDE_VAR(size);
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
kasan_ptr_result = memchr(ptr, '1', size + 1));
kfree(ptr);
@@ -1559,7 +1659,7 @@ static void kasan_memcmp(struct kunit *test)
OPTIMIZER_HIDE_VAR(ptr);
OPTIMIZER_HIDE_VAR(size);
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
kasan_int_result = memcmp(ptr, arr, size+1));
kfree(ptr);
}
@@ -1578,9 +1678,11 @@ static void kasan_strings(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ OPTIMIZER_HIDE_VAR(ptr);
src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO);
strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE);
+ OPTIMIZER_HIDE_VAR(src);
/*
* Make sure that strscpy() does not trigger KASAN if it overreads into
@@ -1594,7 +1696,7 @@ static void kasan_strings(struct kunit *test)
strscpy(ptr, src + 1, KASAN_GRANULE_SIZE));
/* strscpy should fail if the first byte is unreadable. */
- KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
KASAN_GRANULE_SIZE));
kfree(src);
@@ -1607,17 +1709,17 @@ static void kasan_strings(struct kunit *test)
* will likely point to zeroed byte.
*/
ptr += 16;
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1'));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = strchr(ptr, '1'));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1'));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = strrchr(ptr, '1'));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2"));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strcmp(ptr, "2"));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strncmp(ptr, "2", 1));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strlen(ptr));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strnlen(ptr, 1));
}
static void kasan_bitops_modify(struct kunit *test, int nr, void *addr)
@@ -1636,12 +1738,18 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr)
{
KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr));
- KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr));
+ /*
+ * When KASAN is running in write-only mode,
+ * a fault won't occur when the bit is set.
+ * Therefore, skip the test_and_set_bit_lock test in write-only mode.
+ */
+ if (!kasan_write_only_enabled())
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = test_bit(nr, addr));
if (nr < 7)
KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
xor_unlock_is_negative_byte(1 << nr, addr));
@@ -1765,7 +1873,7 @@ static void vmalloc_oob(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
/* An aligned access into the first out-of-bounds granule. */
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size + 5]);
/* Check that in-bounds accesses to the physical page are valid. */
page = vmalloc_to_page(v_ptr);
@@ -2042,15 +2150,15 @@ static void copy_user_test_oob(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test,
unused = copy_from_user(kmem, usermem, size + 1));
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
unused = copy_to_user(usermem, kmem, size + 1));
KUNIT_EXPECT_KASAN_FAIL(test,
unused = __copy_from_user(kmem, usermem, size + 1));
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
unused = __copy_to_user(usermem, kmem, size + 1));
KUNIT_EXPECT_KASAN_FAIL(test,
unused = __copy_from_user_inatomic(kmem, usermem, size + 1));
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
unused = __copy_to_user_inatomic(usermem, kmem, size + 1));
/*
@@ -2104,6 +2212,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmem_cache_double_free),
KUNIT_CASE(kmem_cache_invalid_free),
KUNIT_CASE(kmem_cache_rcu_uaf),
+ KUNIT_CASE(kmem_cache_rcu_reuse),
KUNIT_CASE(kmem_cache_double_destroy),
KUNIT_CASE(kmem_cache_accounted),
KUNIT_CASE(kmem_cache_bulk),
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d2c70cd2afb1..5d2a876035d6 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -125,7 +125,7 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
void *shadow_start, *shadow_end;
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
/*
@@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(kasan_poison);
#ifdef CONFIG_KASAN_GENERIC
void kasan_poison_last_granule(const void *addr, size_t size)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
if (size & KASAN_GRANULE_MASK) {
@@ -305,8 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
pte_t pte;
int index;
- if (likely(!pte_none(ptep_get(ptep))))
- return 0;
+ arch_leave_lazy_mmu_mode();
index = PFN_DOWN(addr - data->start);
page = data->pages[index];
@@ -320,6 +319,8 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
}
spin_unlock(&init_mm.page_table_lock);
+ arch_enter_lazy_mmu_mode();
+
return 0;
}
@@ -335,13 +336,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages)
}
}
-static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask)
{
unsigned long nr_populated, nr_total = nr_pages;
struct page **page_array = pages;
while (nr_pages) {
- nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+ nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages);
if (!nr_populated) {
___free_pages_bulk(page_array, nr_total - nr_pages);
return -ENOMEM;
@@ -353,25 +354,42 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
return 0;
}
-static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask)
{
unsigned long nr_pages, nr_total = PFN_UP(end - start);
struct vmalloc_populate_data data;
+ unsigned int flags;
int ret = 0;
- data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO);
if (!data.pages)
return -ENOMEM;
while (nr_total) {
nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
- ret = ___alloc_pages_bulk(data.pages, nr_pages);
+ ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask);
if (ret)
break;
data.start = start;
+
+ /*
+	 * Page table allocations ignore the external gfp mask, so enforce it
+	 * via the scope API.
+ */
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
kasan_populate_vmalloc_pte, &data);
+
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ memalloc_nofs_restore(flags);
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ memalloc_noio_restore(flags);
+
___free_pages_bulk(data.pages, nr_pages);
if (ret)
break;
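The memalloc scope API used above is the standard way to make callees that hard-code GFP_KERNEL (here the page-table allocations under apply_to_page_range()) respect the caller's reclaim constraints. A generic sketch of the same pattern, with a hypothetical callee:

#include <linux/sched/mm.h>

static int demo_call_under_gfp_scope(gfp_t gfp_mask, int (*callee)(void))
{
	unsigned int flags = 0;
	int ret;

	/* Narrow the scope only when the caller forbids FS and/or IO reclaim. */
	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
		flags = memalloc_nofs_save();
	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
		flags = memalloc_noio_save();

	ret = callee();		/* may allocate with GFP_KERNEL internally */

	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
		memalloc_nofs_restore(flags);
	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
		memalloc_noio_restore(flags);

	return ret;
}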
@@ -385,12 +403,12 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
return ret;
}
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
{
unsigned long shadow_start, shadow_end;
int ret;
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return 0;
if (!is_vmalloc_or_module_addr((void *)addr))
@@ -414,7 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask);
if (ret)
return ret;
@@ -461,18 +479,23 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{
- unsigned long page;
+ pte_t pte;
+ int none;
- page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
+ arch_leave_lazy_mmu_mode();
spin_lock(&init_mm.page_table_lock);
-
- if (likely(!pte_none(ptep_get(ptep)))) {
+ pte = ptep_get(ptep);
+ none = pte_none(pte);
+ if (likely(!none))
pte_clear(&init_mm, addr, ptep);
- free_page(page);
- }
spin_unlock(&init_mm.page_table_lock);
+ if (likely(!none))
+ __free_page(pfn_to_page(pte_pfn(pte)));
+
+ arch_enter_lazy_mmu_mode();
+
return 0;
}
@@ -560,7 +583,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long region_start, region_end;
unsigned long size;
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
@@ -611,7 +634,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
* with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
*/
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return (void *)start;
if (!is_vmalloc_or_module_addr(start))
@@ -636,7 +659,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
*/
void __kasan_poison_vmalloc(const void *start, unsigned long size)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
if (!is_vmalloc_or_module_addr(start))
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index b9382b5b6a37..c75741a74602 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -44,6 +44,7 @@ void __init kasan_init_sw_tags(void)
per_cpu(prng_state, cpu) = (u32)get_cycles();
kasan_init_tags();
+ kasan_enable();
pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n",
str_on_off(kasan_stack_collection_enabled()));
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index d65d48b85f90..b9f31293622b 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
save_stack_info(cache, object, flags, false);
}
-void kasan_save_free_info(struct kmem_cache *cache, void *object)
+void __kasan_save_free_info(struct kmem_cache *cache, void *object)
{
save_stack_info(cache, object, 0, true);
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 0ed3be100963..727c20c94ac5 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -594,15 +594,14 @@ static void rcu_guarded_free(struct rcu_head *h)
*/
static unsigned long kfence_init_pool(void)
{
- unsigned long addr;
- struct page *pages;
+ unsigned long addr, start_pfn;
int i;
if (!arch_kfence_init_pool())
return (unsigned long)__kfence_pool;
addr = (unsigned long)__kfence_pool;
- pages = virt_to_page(__kfence_pool);
+ start_pfn = PHYS_PFN(virt_to_phys(__kfence_pool));
/*
* Set up object pages: they must have PGTY_slab set to avoid freeing
@@ -613,11 +612,12 @@ static unsigned long kfence_init_pool(void)
* enters __slab_free() slow-path.
*/
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
- struct slab *slab = page_slab(nth_page(pages, i));
+ struct slab *slab;
if (!i || (i % 2))
continue;
+ slab = page_slab(pfn_to_page(start_pfn + i));
__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
@@ -665,10 +665,12 @@ static unsigned long kfence_init_pool(void)
reset_slab:
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
- struct slab *slab = page_slab(nth_page(pages, i));
+ struct slab *slab;
if (!i || (i % 2))
continue;
+
+ slab = page_slab(pfn_to_page(start_pfn + i));
#ifdef CONFIG_MEMCG
slab->obj_exts = 0;
#endif
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6b40bdfd224c..7ab2d1a42df3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -39,7 +39,6 @@ enum scan_result {
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
SCAN_PTE_MAPPED_HUGEPAGE,
- SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
SCAN_SCAN_ABORT,
@@ -105,14 +104,6 @@ struct collapse_control {
};
/**
- * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
- * @slot: hash lookup from mm to mm_slot
- */
-struct khugepaged_mm_slot {
- struct mm_slot slot;
-};
-
-/**
* struct khugepaged_scan - cursor for scanning
* @mm_head: the head of the mm list to scan
* @mm_slot: the current mm_slot we are scanning
@@ -122,7 +113,7 @@ struct khugepaged_mm_slot {
*/
struct khugepaged_scan {
struct list_head mm_head;
- struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *mm_slot;
unsigned long address;
};
@@ -385,7 +376,10 @@ int hugepage_madvise(struct vm_area_struct *vma,
int __init khugepaged_init(void)
{
- mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
+ mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+ sizeof(struct mm_slot),
+ __alignof__(struct mm_slot),
+ 0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
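KMEM_CACHE() derives the cache name, size and alignment from a struct type, so with the khugepaged_mm_slot wrapper gone the cache has to be created explicitly; spelling the name out presumably also keeps the existing "khugepaged_mm_slot" slab name stable. Side by side (old line from the removal above, new line from the addition):

/* Old: wrapper struct existed, everything was derived from the type. */
mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);

/* New: plain struct mm_slot, cache name kept by hand. */
mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
				  sizeof(struct mm_slot),
				  __alignof__(struct mm_slot), 0, NULL);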
@@ -410,7 +404,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
return hpage_collapse_test_exit(mm) ||
- test_bit(MMF_DISABLE_THP, &mm->flags);
+ mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
static bool hugepage_pmd_enabled(void)
@@ -439,21 +433,18 @@ static bool hugepage_pmd_enabled(void)
void __khugepaged_enter(struct mm_struct *mm)
{
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int wakeup;
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
- mm_slot = mm_slot_alloc(mm_slot_cache);
- if (!mm_slot)
+ slot = mm_slot_alloc(mm_slot_cache);
+ if (!slot)
return;
- slot = &mm_slot->slot;
-
spin_lock(&khugepaged_mm_lock);
mm_slot_insert(mm_slots_hash, mm, slot);
/*
@@ -472,24 +463,21 @@ void __khugepaged_enter(struct mm_struct *mm)
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
+ if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
- PMD_ORDER))
+ if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
}
void __khugepaged_exit(struct mm_struct *mm)
{
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
slot = mm_slot_lookup(mm_slots_hash, mm);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
- if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ if (slot && khugepaged_scan.mm_slot != slot) {
hash_del(&slot->hash);
list_del(&slot->mm_node);
free = 1;
@@ -497,10 +485,10 @@ void __khugepaged_exit(struct mm_struct *mm)
spin_unlock(&khugepaged_mm_lock);
if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- mm_slot_free(mm_slot_cache, mm_slot);
+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
+ mm_slot_free(mm_slot_cache, slot);
mmdrop(mm);
- } else if (mm_slot) {
+ } else if (slot) {
/*
* This is required to serialize against
* hpage_collapse_test_exit() (which is guaranteed to run
@@ -549,19 +537,19 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
- unsigned long address,
+ unsigned long start_addr,
pte_t *pte,
struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
struct folio *folio = NULL;
+ unsigned long addr = start_addr;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
- bool writable = false;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ _pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
@@ -584,7 +572,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_UFFD_WP;
goto out;
}
- page = vm_normal_page(vma, address, pteval);
+ page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
@@ -669,28 +657,23 @@ next:
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr)))
referenced++;
-
- if (pte_write(pteval))
- writable = true;
}
- if (unlikely(!writable)) {
- result = SCAN_PAGE_RO;
- } else if (unlikely(cc->is_khugepaged && !referenced)) {
+ if (unlikely(cc->is_khugepaged && !referenced)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, writable, result);
+ referenced, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, writable, result);
+ referenced, result);
return result;
}
@@ -921,7 +904,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct collapse_control *cc)
{
struct vm_area_struct *vma;
- unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+ enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
+ TVA_FORCED_COLLAPSE;
if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
@@ -932,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -1003,21 +987,21 @@ static int check_pmd_still_valid(struct mm_struct *mm,
*/
static int __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
+ unsigned long start_addr, pmd_t *pmd,
int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+ unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
int result;
pte_t *pte = NULL;
spinlock_t *ptl;
- for (address = haddr; address < end; address += PAGE_SIZE) {
+ for (addr = start_addr; addr < end; addr += PAGE_SIZE) {
struct vm_fault vmf = {
.vma = vma,
- .address = address,
- .pgoff = linear_page_index(vma, address),
+ .address = addr,
+ .pgoff = linear_page_index(vma, addr),
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
};
@@ -1027,7 +1011,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
* Here the ptl is only used to check pte_same() in
* do_swap_page(), so readonly version is enough.
*/
- pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
if (!pte) {
mmap_read_unlock(mm);
result = SCAN_PMD_NULL;
@@ -1270,7 +1254,7 @@ out_nolock:
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long address, bool *mmap_locked,
+ unsigned long start_addr, bool *mmap_locked,
struct collapse_control *cc)
{
pmd_t *pmd;
@@ -1279,27 +1263,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
struct folio *folio = NULL;
- unsigned long _address;
+ unsigned long addr;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
- bool writable = false;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
if (result != SCAN_SUCCEED)
goto out;
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
result = SCAN_PMD_NULL;
goto out;
}
- for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, _address += PAGE_SIZE) {
+ for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
++unmapped;
@@ -1346,10 +1329,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
- if (pte_write(pteval))
- writable = true;
- page = vm_normal_page(vma, _address, pteval);
+ page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
@@ -1417,13 +1398,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr)))
referenced++;
}
- if (!writable) {
- result = SCAN_PAGE_RO;
- } else if (cc->is_khugepaged &&
+ if (cc->is_khugepaged &&
(!referenced ||
(unmapped && referenced < HPAGE_PMD_NR / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
@@ -1433,20 +1412,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
- result = collapse_huge_page(mm, address, referenced,
+ result = collapse_huge_page(mm, start_addr, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
none_or_zero, result, unmapped);
return result;
}
-static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
+static void collect_mm_slot(struct mm_slot *slot)
{
- struct mm_slot *slot = &mm_slot->slot;
struct mm_struct *mm = slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
@@ -1459,11 +1437,11 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
/*
* Not strictly needed because the mm exited already.
*
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
*/
/* khugepaged_mm_lock actually not necessary for the below */
- mm_slot_free(mm_slot_cache, mm_slot);
+ mm_slot_free(mm_slot_cache, slot);
mmdrop(mm);
}
}
@@ -1472,15 +1450,32 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct folio *folio, struct page *page)
{
+ struct mm_struct *mm = vma->vm_mm;
struct vm_fault vmf = {
.vma = vma,
.address = addr,
.flags = 0,
- .pmd = pmdp,
};
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
mmap_assert_locked(vma->vm_mm);
+ if (!pmdp) {
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ return SCAN_FAIL;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ return SCAN_FAIL;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ return SCAN_FAIL;
+ }
+
+ vmf.pmd = pmdp;
if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
@@ -1533,9 +1528,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* in the page cache with a single hugepage. If a mm were to fault-in
* this memory (mapped by a suitably aligned VMA), we'd get the hugepage
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
- * analogously elide sysfs THP settings here.
+ * analogously elide sysfs THP settings here and force collapse.
*/
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -1556,6 +1551,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
switch (result) {
case SCAN_SUCCEED:
break;
+ case SCAN_PMD_NULL:
case SCAN_PMD_NONE:
/*
* All pte entries have been removed and pmd cleared.
@@ -2388,7 +2384,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
__acquires(&khugepaged_mm_lock)
{
struct vma_iterator vmi;
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
@@ -2399,14 +2394,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
*result = SCAN_FAIL;
if (khugepaged_scan.mm_slot) {
- mm_slot = khugepaged_scan.mm_slot;
- slot = &mm_slot->slot;
+ slot = khugepaged_scan.mm_slot;
} else {
- slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = list_first_entry(&khugepaged_scan.mm_head,
struct mm_slot, mm_node);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
- khugepaged_scan.mm_slot = mm_slot;
+ khugepaged_scan.mm_slot = slot;
}
spin_unlock(&khugepaged_mm_lock);
@@ -2432,8 +2425,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags,
- TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
skip:
progress++;
continue;
@@ -2505,7 +2497,7 @@ breakouterloop:
breakouterloop_mmap_lock:
spin_lock(&khugepaged_mm_lock);
- VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ VM_BUG_ON(khugepaged_scan.mm_slot != slot);
/*
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
@@ -2516,18 +2508,15 @@ breakouterloop_mmap_lock:
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (slot->mm_node.next != &khugepaged_scan.mm_head) {
- slot = list_entry(slot->mm_node.next,
- struct mm_slot, mm_node);
- khugepaged_scan.mm_slot =
- mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
+ khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
khugepaged_scan.address = 0;
} else {
khugepaged_scan.mm_slot = NULL;
khugepaged_full_scans++;
}
- collect_mm_slot(mm_slot);
+ collect_mm_slot(slot);
}
return progress;
@@ -2614,7 +2603,7 @@ static void khugepaged_wait_work(void)
static int khugepaged(void *none)
{
- struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
set_freezable();
set_user_nice(current, MAX_NICE);
@@ -2625,10 +2614,10 @@ static int khugepaged(void *none)
}
spin_lock(&khugepaged_mm_lock);
- mm_slot = khugepaged_scan.mm_slot;
+ slot = khugepaged_scan.mm_slot;
khugepaged_scan.mm_slot = NULL;
- if (mm_slot)
- collect_mm_slot(mm_slot);
+ if (slot)
+ collect_mm_slot(slot);
spin_unlock(&khugepaged_mm_lock);
return 0;
}
@@ -2767,7 +2756,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
@@ -2832,7 +2821,6 @@ handle_result:
case SCAN_PMD_NULL:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
- case SCAN_PAGE_RO:
case SCAN_LACK_REFERENCED_PAGE:
case SCAN_PAGE_NULL:
case SCAN_PAGE_COUNT:
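
The set_huge_pmd() change above makes the function allocate any missing intermediate page-table levels itself when called with a NULL pmdp. A minimal sketch of that allocation walk, under the assumption that the caller holds mmap_lock (the helper name is made up, not a kernel API):

/*
 * Sketch: walk down to (allocating, if needed) the PMD for @addr.
 * Mirrors the pgd -> p4d -> pud -> pmd pattern used in set_huge_pmd().
 */
static pmd_t *example_walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgdp = pgd_offset(mm, addr);
	p4d_t *p4dp = p4d_alloc(mm, pgdp, addr);
	pud_t *pudp;

	if (!p4dp)
		return NULL;
	pudp = pud_alloc(mm, p4dp, addr);
	if (!pudp)
		return NULL;
	return pmd_alloc(mm, pudp, addr);	/* NULL on allocation failure */
}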
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 84265983f239..1ac56ceb29b6 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -437,9 +437,15 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
else if (untagged_objp == untagged_ptr || alias)
return object;
else {
+ /*
+ * Defer printk() while kmemleak_lock is held to
+ * avoid a deadlock.
+ */
+ printk_deferred_enter();
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
+ printk_deferred_exit();
break;
}
}
@@ -736,6 +742,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
+ /*
+ * Defer printk() while kmemleak_lock is held to
+ * avoid a deadlock.
+ */
+ printk_deferred_enter();
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
@@ -743,6 +754,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
* be freed while the kmemleak_lock is held.
*/
dump_object_info(parent);
+ printk_deferred_exit();
return -EEXIST;
}
}
@@ -856,13 +868,8 @@ static void delete_object_part(unsigned long ptr, size_t size,
raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = __find_and_remove_object(ptr, 1, objflags);
- if (!object) {
-#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
- ptr, size);
-#endif
+ if (!object)
goto unlock;
- }
/*
* Create one or two objects that may result from the memory block
@@ -882,8 +889,14 @@ static void delete_object_part(unsigned long ptr, size_t size,
unlock:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- if (object)
+ if (object) {
__delete_object(object);
+ } else {
+#ifdef DEBUG
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
+#endif
+ }
out:
if (object_l)
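
The two hunks above wrap warnings emitted under kmemleak_lock in printk_deferred_enter()/printk_deferred_exit(). The same pattern applies to any report generated while a raw spinlock is held; a minimal sketch (the lock and message are illustrative, not kmemleak code):

/*
 * Sketch: report a problem found under a raw spinlock without risking a
 * recursive printk/console path taking locks we already hold.
 */
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_report_under_lock(unsigned long ptr)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	printk_deferred_enter();	/* buffer the message, flush later */
	pr_warn("example: suspicious pointer 0x%08lx\n", ptr);
	printk_deferred_exit();
	raw_spin_unlock_irqrestore(&example_lock, flags);
}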
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 1ea711786c52..8bca7fece47f 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -195,7 +195,8 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
u32 origin, bool checked)
{
u64 address = (u64)addr;
- u32 *shadow_start, *origin_start;
+ void *shadow_start;
+ u32 *aligned_shadow, *origin_start;
size_t pad = 0;
KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
@@ -214,9 +215,12 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
}
__memset(shadow_start, b, size);
- if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ if (IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ aligned_shadow = shadow_start;
+ } else {
pad = address % KMSAN_ORIGIN_SIZE;
address -= pad;
+ aligned_shadow = shadow_start - pad;
size += pad;
}
size = ALIGN(size, KMSAN_ORIGIN_SIZE);
@@ -230,7 +234,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
* corresponding shadow slot is zero.
*/
for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) {
- if (origin || !shadow_start[i])
+ if (origin || !aligned_shadow[i])
origin_start[i] = origin;
}
}
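
The fix above keeps a separate KMSAN_ORIGIN_SIZE-aligned shadow pointer so the origin loop indexes the same range that was just memset(). A worked sketch of the arithmetic, assuming KMSAN_ORIGIN_SIZE == 4 (the values are illustrative):

/* Sketch only: shows how the pad/align step widens the origin range. */
static void example_origin_alignment(void)
{
	u64 address = 0x1006;		/* unaligned start of the poisoned range */
	size_t size = 3;
	size_t pad = address % 4;	/* pad == 2 */

	address -= pad;			/* 0x1004: origin-aligned start */
	size = ALIGN(size + pad, 4);	/* 3 + 2 = 5, rounded up to 8 */
	/* 8 / 4 == 2 origin slots are updated, matching aligned_shadow[0..1]. */
}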
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index c6c5b2bbede0..902ec48b1e3e 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -556,6 +556,21 @@ DEFINE_TEST_MEMSETXX(16)
DEFINE_TEST_MEMSETXX(32)
DEFINE_TEST_MEMSETXX(64)
+/* Test case: ensure that KMSAN does not access shadow memory out of bounds. */
+static void test_memset_on_guarded_buffer(struct kunit *test)
+{
+ void *buf = vmalloc(PAGE_SIZE);
+
+ kunit_info(test,
+ "memset() on ends of guarded buffer should not crash\n");
+
+ for (size_t size = 0; size <= 128; size++) {
+ memset(buf, 0xff, size);
+ memset(buf + PAGE_SIZE - size, 0xff, size);
+ }
+ vfree(buf);
+}
+
static noinline void fibonacci(int *array, int size, int start)
{
if (start < 2 || (start == size))
@@ -677,6 +692,7 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_memset16),
KUNIT_CASE(test_memset32),
KUNIT_CASE(test_memset64),
+ KUNIT_CASE(test_memset_on_guarded_buffer),
KUNIT_CASE(test_long_origin_chain),
KUNIT_CASE(test_stackdepot_roundtrip),
KUNIT_CASE(test_unpoison_memory),
diff --git a/mm/ksm.c b/mm/ksm.c
index 160787bb121c..04019a15b25d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1061,11 +1061,6 @@ struct ksm_stable_node *folio_stable_node(const struct folio *folio)
return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}
-static inline struct ksm_stable_node *page_stable_node(struct page *page)
-{
- return folio_stable_node(page_folio(page));
-}
-
static inline void folio_set_stable_node(struct folio *folio,
struct ksm_stable_node *stable_node)
{
@@ -1217,8 +1212,8 @@ mm_exiting:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmdrop(mm);
} else
spin_unlock(&ksm_mmlist_lock);
@@ -2225,6 +2220,7 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
*/
static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
+ struct folio *folio = page_folio(page);
struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct ksm_stable_node *stable_node;
@@ -2233,7 +2229,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
int err;
bool max_page_sharing_bypass = false;
- stable_node = page_stable_node(page);
+ stable_node = folio_stable_node(folio);
if (stable_node) {
if (stable_node->head != &migrate_nodes &&
get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
@@ -2272,7 +2268,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
/* Start by searching for the folio in the stable tree */
kfolio = stable_tree_search(page);
- if (&kfolio->page == page && rmap_item->head == stable_node) {
+ if (kfolio == folio && rmap_item->head == stable_node) {
folio_put(kfolio);
return;
}
@@ -2353,10 +2349,11 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
* the page is locked, it is better to skip it and
* perhaps try again later.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
return;
split_huge_page(page);
- unlock_page(page);
+ folio = page_folio(page);
+ folio_unlock(folio);
}
}
}
@@ -2620,8 +2617,8 @@ no_vmas:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmap_read_unlock(mm);
mmdrop(mm);
} else {
@@ -2742,7 +2739,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
vm_flags_t vm_flags)
{
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm) &&
__ksm_should_add_vma(file, vm_flags))
vm_flags |= VM_MERGEABLE;
@@ -2784,16 +2781,16 @@ int ksm_enable_merge_any(struct mm_struct *mm)
{
int err;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return 0;
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
err = __ksm_enter(mm);
if (err)
return err;
}
- set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_set(MMF_VM_MERGE_ANY, mm);
ksm_add_vmas(mm);
return 0;
@@ -2815,7 +2812,7 @@ int ksm_disable_merge_any(struct mm_struct *mm)
{
int err;
- if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (!mm_flags_test(MMF_VM_MERGE_ANY, mm))
return 0;
err = ksm_del_vmas(mm);
@@ -2824,7 +2821,7 @@ int ksm_disable_merge_any(struct mm_struct *mm)
return err;
}
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
return 0;
}
@@ -2832,9 +2829,9 @@ int ksm_disable(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm))
return 0;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return ksm_disable_merge_any(mm);
return ksm_del_vmas(mm);
}
@@ -2852,7 +2849,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (!vma_ksm_compatible(vma))
return 0;
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
err = __ksm_enter(mm);
if (err)
return err;
@@ -2912,7 +2909,7 @@ int __ksm_enter(struct mm_struct *mm)
list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mm_flags_set(MMF_VM_MERGEABLE, mm);
mmgrab(mm);
if (needs_wakeup)
@@ -2939,23 +2936,25 @@ void __ksm_exit(struct mm_struct *mm)
spin_lock(&ksm_mmlist_lock);
slot = mm_slot_lookup(mm_slots_hash, mm);
- mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
- if (mm_slot && ksm_scan.mm_slot != mm_slot) {
- if (!mm_slot->rmap_list) {
- hash_del(&slot->hash);
- list_del(&slot->mm_node);
- easy_to_free = 1;
- } else {
- list_move(&slot->mm_node,
- &ksm_scan.mm_slot->slot.mm_node);
+ if (slot) {
+ mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
+ if (ksm_scan.mm_slot != mm_slot) {
+ if (!mm_slot->rmap_list) {
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
+ easy_to_free = 1;
+ } else {
+ list_move(&slot->mm_node,
+ &ksm_scan.mm_slot->slot.mm_node);
+ }
}
}
spin_unlock(&ksm_mmlist_lock);
if (easy_to_free) {
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
mmdrop(mm);
} else if (mm_slot) {
mmap_write_lock(mm);
diff --git a/mm/memblock.c b/mm/memblock.c
index 154f1d73b61f..117d963e677c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -780,9 +780,9 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt
}
if ((nr_pages << PAGE_SHIFT) > threshold_bytes) {
- mem_size_mb = memblock_phys_mem_size() >> 20;
+ mem_size_mb = memblock_phys_mem_size() / SZ_1M;
pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
- (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb);
+ (nr_pages << PAGE_SHIFT) / SZ_1M, mem_size_mb);
return false;
}
@@ -1091,13 +1091,20 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
/**
* memblock_reserved_mark_noinit - Mark a reserved memory region with flag
- * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized
- * for this region.
+ * MEMBLOCK_RSRV_NOINIT
+ *
* @base: the base phys addr of the region
* @size: the size of the region
*
- * struct pages will not be initialized for reserved memory regions marked with
- * %MEMBLOCK_RSRV_NOINIT.
+ * The struct pages for the reserved regions marked %MEMBLOCK_RSRV_NOINIT will
+ * not be fully initialized, allowing the caller to optimize their initialization.
+ *
+ * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, setting this flag
+ * completely bypasses the initialization of struct pages for such a region.
+ *
+ * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is disabled, struct pages in this
+ * region will be initialized with default values but won't be marked as
+ * reserved.
*
* Return: 0 on success, -errno on failure.
*/
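
A hedged usage sketch of the documented flag, as early-boot code might apply it after reserving a range (the base and size are made up):

/* Sketch: reserve a range and opt out of full struct page initialization. */
static int __init example_reserve_noinit(void)
{
	phys_addr_t base = 0x80000000ULL;	/* illustrative address */
	phys_addr_t size = SZ_16M;

	if (memblock_reserve(base, size))
		return -ENOMEM;
	/* struct pages of this range are left for the caller to set up */
	return memblock_reserved_mark_noinit(base, size);
}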
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 4b94731305b9..6eed14bff742 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -761,7 +761,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL_ACCOUNT);
if (!new) {
ret = -ENOMEM;
goto unlock;
@@ -924,7 +924,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
{
struct mem_cgroup_eventfd_list *event;
- event = kmalloc(sizeof(*event), GFP_KERNEL);
+ event = kmalloc(sizeof(*event), GFP_KERNEL_ACCOUNT);
if (!event)
return -ENOMEM;
@@ -1087,7 +1087,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
CLASS(fd, cfile)(cfd);
- event = kzalloc(sizeof(*event), GFP_KERNEL);
+ event = kzalloc(sizeof(*event), GFP_KERNEL_ACCOUNT);
if (!event)
return -ENOMEM;
@@ -2053,7 +2053,7 @@ struct cftype mem_cgroup_legacy_files[] = {
{
.name = "cgroup.event_control", /* XXX: for compat */
.write = memcg_write_event_control,
- .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
+ .flags = CFTYPE_NO_PREFIX,
},
{
.name = "swappiness",
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..e090f29eb03b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -287,6 +287,7 @@ ino_t page_cgroup_ino(struct page *page)
rcu_read_unlock();
return ino;
}
+EXPORT_SYMBOL_GPL(page_cgroup_ino);
/* Subset of node_stat_item for memcg stats */
static const unsigned int memcg_node_stat_items[] = {
@@ -2203,7 +2204,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
* try_charge() (context permitting), as well as from the userland
* return path where reclaim is always able to block.
*/
-void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
unsigned long penalty_jiffies;
unsigned long pflags;
@@ -2213,9 +2214,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
struct mem_cgroup *memcg;
bool in_retry = false;
- if (likely(!nr_pages))
- return;
-
memcg = get_mem_cgroup_from_mm(current->mm);
current->memcg_nr_pages_over_high = 0;
@@ -2486,7 +2484,7 @@ done_restock:
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
!(current->flags & PF_MEMALLOC) &&
gfpflags_allow_blocking(gfp_mask))
- mem_cgroup_handle_over_high(gfp_mask);
+ __mem_cgroup_handle_over_high(gfp_mask);
return 0;
}
@@ -5020,22 +5018,42 @@ out:
void mem_cgroup_sk_free(struct sock *sk)
{
- if (sk->sk_memcg)
- css_put(&sk->sk_memcg->css);
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
+ if (memcg)
+ css_put(&memcg->css);
+}
+
+void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk)
+{
+ struct mem_cgroup *memcg;
+
+ if (sk->sk_memcg == newsk->sk_memcg)
+ return;
+
+ mem_cgroup_sk_free(newsk);
+
+ memcg = mem_cgroup_from_sk(sk);
+ if (memcg)
+ css_get(&memcg->css);
+
+ newsk->sk_memcg = sk->sk_memcg;
}
/**
- * mem_cgroup_charge_skmem - charge socket memory
- * @memcg: memcg to charge
+ * mem_cgroup_sk_charge - charge socket memory
+ * @sk: socket in memcg to charge
* @nr_pages: number of pages to charge
* @gfp_mask: reclaim mode
*
* Charges @nr_pages to the memcg associated with @sk. Returns %true if the
* charge fits within the memcg's configured limit, %false if it doesn't.
*/
-bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
- gfp_t gfp_mask)
+bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
+ gfp_t gfp_mask)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
@@ -5048,12 +5066,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
}
/**
- * mem_cgroup_uncharge_skmem - uncharge socket memory
- * @memcg: memcg to uncharge
+ * mem_cgroup_sk_uncharge - uncharge socket memory
+ * @sk: socket in memcg to uncharge
* @nr_pages: number of pages to uncharge
*/
-void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
memcg1_uncharge_skmem(memcg, nr_pages);
return;
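
A minimal sketch of how a networking caller might use the renamed socket helpers; the surrounding buffer-accounting logic is illustrative, not taken from net/ code:

/* Sketch: charge/uncharge page allocations against the memcg of @sk. */
static bool example_sk_account(struct sock *sk, unsigned int nr_pages)
{
	if (!mem_cgroup_sockets_enabled || !mem_cgroup_from_sk(sk))
		return true;
	/* returns false if the charge did not fit within the memcg limit */
	return mem_cgroup_sk_charge(sk, nr_pages, GFP_NOWAIT);
}

static void example_sk_unaccount(struct sock *sk, unsigned int nr_pages)
{
	if (mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk))
		mem_cgroup_sk_uncharge(sk, nr_pages);
}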
diff --git a/mm/memfd.c b/mm/memfd.c
index bbe679895ef6..1d109c1acf21 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -385,11 +385,11 @@ static int sanitize_flags(unsigned int *flags_ptr)
unsigned int flags = *flags_ptr;
if (!(flags & MFD_HUGETLB)) {
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ if (flags & ~MFD_ALL_FLAGS)
return -EINVAL;
} else {
/* Allow huge page size encoding in flags. */
- if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ if (flags & ~(MFD_ALL_FLAGS |
(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
return -EINVAL;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fc30ca4804bf..3edebb0cda30 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -212,106 +212,34 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
return true;
}
-#if IS_ENABLED(CONFIG_HWPOISON_INJECT)
+static hwpoison_filter_func_t __rcu *hwpoison_filter_func __read_mostly;
-u32 hwpoison_filter_enable = 0;
-u32 hwpoison_filter_dev_major = ~0U;
-u32 hwpoison_filter_dev_minor = ~0U;
-u64 hwpoison_filter_flags_mask;
-u64 hwpoison_filter_flags_value;
-EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
-EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
-EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
-EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
-EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
-
-static int hwpoison_filter_dev(struct page *p)
+void hwpoison_filter_register(hwpoison_filter_func_t *filter)
{
- struct folio *folio = page_folio(p);
- struct address_space *mapping;
- dev_t dev;
-
- if (hwpoison_filter_dev_major == ~0U &&
- hwpoison_filter_dev_minor == ~0U)
- return 0;
-
- mapping = folio_mapping(folio);
- if (mapping == NULL || mapping->host == NULL)
- return -EINVAL;
-
- dev = mapping->host->i_sb->s_dev;
- if (hwpoison_filter_dev_major != ~0U &&
- hwpoison_filter_dev_major != MAJOR(dev))
- return -EINVAL;
- if (hwpoison_filter_dev_minor != ~0U &&
- hwpoison_filter_dev_minor != MINOR(dev))
- return -EINVAL;
-
- return 0;
+ rcu_assign_pointer(hwpoison_filter_func, filter);
}
+EXPORT_SYMBOL_GPL(hwpoison_filter_register);
-static int hwpoison_filter_flags(struct page *p)
+void hwpoison_filter_unregister(void)
{
- if (!hwpoison_filter_flags_mask)
- return 0;
-
- if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
- hwpoison_filter_flags_value)
- return 0;
- else
- return -EINVAL;
+ RCU_INIT_POINTER(hwpoison_filter_func, NULL);
+ synchronize_rcu();
}
+EXPORT_SYMBOL_GPL(hwpoison_filter_unregister);
-/*
- * This allows stress tests to limit test scope to a collection of tasks
- * by putting them under some memcg. This prevents killing unrelated/important
- * processes such as /sbin/init. Note that the target task may share clean
- * pages with init (eg. libc text), which is harmless. If the target task
- * share _dirty_ pages with another task B, the test scheme must make sure B
- * is also included in the memcg. At last, due to race conditions this filter
- * can only guarantee that the page either belongs to the memcg tasks, or is
- * a freed page.
- */
-#ifdef CONFIG_MEMCG
-u64 hwpoison_filter_memcg;
-EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
-static int hwpoison_filter_task(struct page *p)
+static int hwpoison_filter(struct page *p)
{
- if (!hwpoison_filter_memcg)
- return 0;
-
- if (page_cgroup_ino(p) != hwpoison_filter_memcg)
- return -EINVAL;
-
- return 0;
-}
-#else
-static int hwpoison_filter_task(struct page *p) { return 0; }
-#endif
-
-int hwpoison_filter(struct page *p)
-{
- if (!hwpoison_filter_enable)
- return 0;
-
- if (hwpoison_filter_dev(p))
- return -EINVAL;
-
- if (hwpoison_filter_flags(p))
- return -EINVAL;
+ int ret = 0;
+ hwpoison_filter_func_t *filter;
- if (hwpoison_filter_task(p))
- return -EINVAL;
+ rcu_read_lock();
+ filter = rcu_dereference(hwpoison_filter_func);
+ if (filter)
+ ret = filter(p);
+ rcu_read_unlock();
- return 0;
-}
-EXPORT_SYMBOL_GPL(hwpoison_filter);
-#else
-int hwpoison_filter(struct page *p)
-{
- return 0;
+ return ret;
}
-#endif
/*
* Kill all processes that have a poisoned page mapped and then isolate
@@ -956,7 +884,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
- [MF_MSG_ALREADY_POISONED] = "already poisoned",
+ [MF_MSG_ALREADY_POISONED] = "already poisoned page",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1199,7 +1127,7 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p)
struct folio *folio = page_folio(p);
int ret;
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED;
folio_unlock(folio);
@@ -1349,9 +1277,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- num_poisoned_pages_inc(pfn);
-
- update_per_node_mf_stats(pfn, result);
+ if (type != MF_MSG_ALREADY_POISONED) {
+ num_poisoned_pages_inc(pfn);
+ update_per_node_mf_stats(pfn, result);
+ }
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
@@ -1707,10 +1636,10 @@ static int identify_page_state(unsigned long pfn, struct page *p,
* carried out only if the first check can't determine the page status.
*/
for (ps = error_states;; ps++)
- if ((p->flags & ps->mask) == ps->res)
+ if ((p->flags.f & ps->mask) == ps->res)
break;
- page_flags |= (p->flags & (1UL << PG_dirty));
+ page_flags |= (p->flags.f & (1UL << PG_dirty));
if (!ps->mask)
for (ps = error_states;; ps++)
@@ -2094,12 +2023,11 @@ retry:
*hugetlb = 0;
return 0;
} else if (res == -EHWPOISON) {
- pr_err("%#lx: already hardware poisoned\n", pfn);
if (flags & MF_ACTION_REQUIRED) {
folio = page_folio(p);
res = kill_accessing_process(current, folio_pfn(folio), flags);
- action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
}
+ action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
return res;
} else if (res == -EBUSY) {
if (!(flags & MF_NO_RETRY)) {
@@ -2137,7 +2065,7 @@ retry:
return action_result(pfn, MF_MSG_FREE_HUGE, res);
}
- page_flags = folio->flags;
+ page_flags = folio->flags.f;
if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
folio_unlock(folio);
@@ -2266,7 +2194,7 @@ int memory_failure(unsigned long pfn, int flags)
goto unlock_mutex;
if (pfn_valid(pfn)) {
- pgmap = get_dev_pagemap(pfn, NULL);
+ pgmap = get_dev_pagemap(pfn);
put_ref_page(pfn, flags);
if (pgmap) {
res = memory_failure_dev_pagemap(pfn, flags,
@@ -2285,7 +2213,6 @@ try_again:
goto unlock_mutex;
if (TestSetPageHWPoison(p)) {
- pr_err("%#lx: already hardware poisoned\n", pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
@@ -2398,7 +2325,7 @@ try_again:
* folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
* status correctly, we save a copy of the page flags at this time.
*/
- page_flags = folio->flags;
+ page_flags = folio->flags.f;
/*
* __munlock_folio() may clear a writeback folio's LRU flag without
@@ -2569,10 +2496,9 @@ int unpoison_memory(unsigned long pfn)
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- if (!pfn_valid(pfn))
- return -ENXIO;
-
- p = pfn_to_page(pfn);
+ p = pfn_to_online_page(pfn);
+ if (!p)
+ return -EIO;
folio = page_folio(p);
mutex_lock(&mf_mutex);
@@ -2744,13 +2670,13 @@ static int soft_offline_in_use_page(struct page *page)
putback_movable_pages(&pagelist);
pr_info("%#lx: %s migration failed %ld, type %pGp\n",
- pfn, msg_page[huge], ret, &page->flags);
+ pfn, msg_page[huge], ret, &page->flags.f);
if (ret > 0)
ret = -EBUSY;
}
} else {
pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n",
- pfn, msg_page[huge], page_count(page), &page->flags);
+ pfn, msg_page[huge], page_count(page), &page->flags.f);
ret = -EBUSY;
}
return ret;
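
With the old sysfs-style knobs replaced by a registration hook, a test module would now plug in its own predicate. A minimal sketch, assuming the filter prototype matches the hwpoison_filter_func_t typedef (int (*)(struct page *)) and that the register/unregister declarations are exported via a header; the PFN cut-off is made up:

/* Sketch: only let memory_failure() act on PFNs below a test boundary. */
static unsigned long example_max_pfn = 0x100000;

static int example_hwpoison_filter(struct page *p)
{
	/* a non-zero return tells memory_failure() to skip this page */
	return page_to_pfn(p) < example_max_pfn ? 0 : -EINVAL;
}

static int __init example_filter_init(void)
{
	hwpoison_filter_register(example_hwpoison_filter);
	return 0;
}
module_init(example_filter_init);

static void __exit example_filter_exit(void)
{
	hwpoison_filter_unregister();	/* synchronizes RCU before returning */
}
module_exit(example_filter_exit);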
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0382b6942b8b..0ea5c13f10a2 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -942,11 +942,23 @@ static ssize_t demotion_enabled_store(struct kobject *kobj,
const char *buf, size_t count)
{
ssize_t ret;
+ bool before = numa_demotion_enabled;
ret = kstrtobool(buf, &numa_demotion_enabled);
if (ret)
return ret;
+ /*
+ * Reset kswapd_failures statistics. They may no longer be
+ * valid since the policy for kswapd has changed.
+ */
+ if (!before && numa_demotion_enabled) {
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ atomic_set(&pgdat->kswapd_failures, 0);
+ }
+
return count;
}
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..74b45e258323 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
add_mm_counter(mm, i, rss[i]);
}
-/*
- * This function is called to print an error when a bad pte
- * is found. For example, we might have a PFN-mapped pte in
- * a region that doesn't allow it.
- *
- * The calling function must still handle the error.
- */
-static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte, struct page *page)
+static bool is_bad_page_map_ratelimited(void)
{
- pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
- p4d_t *p4d = p4d_offset(pgd, addr);
- pud_t *pud = pud_offset(p4d, addr);
- pmd_t *pmd = pmd_offset(pud, addr);
- struct address_space *mapping;
- pgoff_t index;
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
@@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
- return;
+ return true;
}
if (nr_unshown) {
pr_alert("BUG: Bad page map: %lu messages suppressed\n",
@@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
+ return false;
+}
+
+static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long long pgdv, p4dv, pudv, pmdv;
+ p4d_t p4d, *p4dp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pgd_t *pgdp;
+
+ /*
+ * Although this looks like a fully lockless pgtable walk, it is not:
+ * see locking requirements for print_bad_page_map().
+ */
+ pgdp = pgd_offset(mm, addr);
+ pgdv = pgd_val(*pgdp);
+
+ if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
+ pr_alert("pgd:%08llx\n", pgdv);
+ return;
+ }
+
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ p4dv = p4d_val(p4d);
+
+ if (!p4d_present(p4d) || p4d_leaf(p4d)) {
+ pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
+ return;
+ }
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ pudv = pud_val(pud);
+
+ if (!pud_present(pud) || pud_leaf(pud)) {
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
+ return;
+ }
+
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ pmdv = pmd_val(pmd);
+
+ /*
+ * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
+ * because the table should already be mapped by the caller and
+ * doing another map would be bad. print_bad_page_map() should
+ * already take care of printing the PTE.
+ */
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
+ p4dv, pudv, pmdv);
+}
+
+/*
+ * This function is called to print an error when a bad page table entry (e.g.,
+ * corrupted page table entry) is found. For example, we might have a
+ * PFN-mapped pte in a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ *
+ * This function must be called during a proper page table walk, as it will
+ * re-walk the page table to dump information: the caller MUST prevent page
+ * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
+ * page table lock.
+ */
+static void print_bad_page_map(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long long entry, struct page *page,
+ enum pgtable_level level)
+{
+ struct address_space *mapping;
+ pgoff_t index;
+
+ if (is_bad_page_map_ratelimited())
+ return;
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
+ pgtable_level_to_str(level), entry);
+ __print_bad_page_map_pgtable(vma->vm_mm, addr);
if (page)
- dump_page(page, "bad pte");
+ dump_page(page, "bad page map");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
@@ -549,18 +611,39 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
+#define print_bad_pte(vma, addr, pte, page) \
+ print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
-/*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+/**
+ * __vm_normal_page() - Get the "struct page" associated with a page table entry.
+ * @vma: The VMA mapping the page table entry.
+ * @addr: The address where the page table entry is mapped.
+ * @pfn: The PFN stored in the page table entry.
+ * @special: Whether the page table entry is marked "special".
+ * @level: The page table level for error reporting purposes only.
+ * @entry: The page table entry value for error reporting purposes only.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
- * case, NULL is returned here. "Normal" mappings do have a struct page.
+ * case, NULL is returned here. "Normal" mappings do have a struct page and
+ * are ordinarily refcounted.
+ *
+ * Page mappings of the shared zero folios are always considered "special", as
+ * they are not ordinarily refcounted: neither the refcount nor the mapcount
+ * of these folios is adjusted when mapping them into user page tables.
+ * Selected page table walkers (such as GUP) can still identify mappings of the
+ * shared zero folios and work with the underlying "struct page".
*
- * There are 2 broad cases. Firstly, an architecture may define a pte_special()
- * pte bit, in which case this function is trivial. Secondly, an architecture
- * may not have a spare pte bit, which requires a more complicated scheme,
- * described below.
+ * There are 2 broad cases. Firstly, an architecture may define a "special"
+ * page table entry bit, such as pte_special(), in which case this function is
+ * trivial. Secondly, an architecture may not have a spare page table
+ * entry bit, which requires a more complicated scheme, described below.
+ *
+ * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
+ * page table entries that actually map "normal" pages: however, that page
+ * cannot be looked up through the PFN stored in the page table entry, but
+ * instead will be looked up through vm_ops->find_normal_page(). So far, this
+ * only applies to PTEs.
*
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
* special mapping (even if there are underlying and valid "struct pages").
@@ -585,72 +668,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
*
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
- * page (that is, those where pfn_valid is true) are refcounted and considered
- * normal pages by the VM. The only exception are zeropages, which are
- * *never* refcounted.
+ * page (that is, those where pfn_valid is true, except the shared zero
+ * folios) are refcounted and considered normal pages by the VM.
*
* The disadvantage is that pages are refcounted (which can be slower and
* simply not an option for some PFNMAP users). The advantage is that we
* don't have to follow the strict linearity rule of PFNMAP mappings in
* order to support COWable mappings.
*
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
*/
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, bool special,
+ unsigned long long entry, enum pgtable_level level)
{
- unsigned long pfn = pte_pfn(pte);
-
if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
- if (likely(!pte_special(pte)))
- goto check_pfn;
- if (vma->vm_ops && vma->vm_ops->find_special_page)
- return vma->vm_ops->find_special_page(vma, addr);
- if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
- return NULL;
- if (is_zero_pfn(pfn))
- return NULL;
-
- print_bad_pte(vma, addr, pte, NULL);
- return NULL;
- }
-
- /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- if (is_zero_pfn(pfn))
- return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
+ if (unlikely(special)) {
+#ifdef CONFIG_FIND_NORMAL_PAGE
+ if (vma->vm_ops && vma->vm_ops->find_normal_page)
+ return vma->vm_ops->find_normal_page(vma, addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
- if (!is_cow_mapping(vma->vm_flags))
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
return NULL;
+
+ print_bad_page_map(vma, addr, entry, NULL, level);
+ return NULL;
}
- }
+ /*
+ * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
+ * mappings (incl. shared zero folios) are marked accordingly.
+ */
+ } else {
+ if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ /* If it has a "struct page", it's "normal". */
+ if (!pfn_valid(pfn))
+ return NULL;
+ } else {
+ unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (is_zero_pfn(pfn))
- return NULL;
+ /* Only CoW'ed anon folios are "normal". */
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
+ return NULL;
+ }
-check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
- print_bad_pte(vma, addr, pte, NULL);
+ /* Corrupted page table entry. */
+ print_bad_page_map(vma, addr, entry, NULL, level);
return NULL;
}
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
+ * For example, VDSO mappings can cause them to exist.
*/
-out:
- VM_WARN_ON_ONCE(is_zero_pfn(pfn));
+ VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
return pfn_to_page(pfn);
}
+/**
+ * vm_normal_page() - Get the "struct page" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte),
+ pte_val(pte), PGTABLE_LEVEL_PTE);
+}
+
+/**
+ * vm_normal_folio() - Get the "struct folio" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
@@ -662,43 +777,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
}
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/**
+ * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
- unsigned long pfn = pmd_pfn(pmd);
-
- /* Currently it's only used for huge pfnmaps */
- if (unlikely(pmd_special(pmd)))
- return NULL;
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
- return NULL;
- }
- }
-
- if (is_huge_zero_pfn(pfn))
- return NULL;
- if (unlikely(pfn > highest_memmap_pfn))
- return NULL;
-
- /*
- * NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
- */
-out:
- return pfn_to_page(pfn);
+ return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd),
+ pmd_val(pmd), PGTABLE_LEVEL_PMD);
}
+/**
+ * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd)
{
@@ -708,6 +817,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
return page_folio(page);
return NULL;
}
+
+/**
+ * vm_normal_page_pud() - Get the "struct page" associated with a PUD
+ * @vma: The VMA mapping the @pud.
+ * @addr: The address where the @pud is mapped.
+ * @pud: The PUD.
+ *
+ * Get the "struct page" associated with a PUD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t pud)
+{
+ return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
+ pud_val(pud), PGTABLE_LEVEL_PUD);
+}
#endif
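
As the kernel-doc above stresses, these helpers must be called from a proper page table walk with the leaf page table mapped and locked, and a NULL return means "special", i.e. there is no struct page to operate on. A minimal sketch of such a caller (the walker itself is illustrative, not kernel API):

/* Sketch: count the "normal" pages mapped by the PTEs of [addr, end). */
static unsigned long example_count_normal_pages(struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long end)
{
	unsigned long nr = 0;
	pte_t *start_pte, *pte;
	spinlock_t *ptl;

	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)
		return 0;
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(pte);

		if (!pte_present(ptent))
			continue;
		/* NULL means "special": zero folio, raw PFN mapping, ... */
		if (vm_normal_page(vma, addr, ptent))
			nr++;
	}
	pte_unmap_unlock(start_pte, ptl);
	return nr;
}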
/**
@@ -2138,8 +2266,7 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
return -EINVAL;
return 0;
}
- if (folio_test_anon(folio) || folio_test_slab(folio) ||
- page_has_type(page))
+ if (folio_test_anon(folio) || page_has_type(page))
return -EINVAL;
flush_dcache_folio(folio);
return 0;
@@ -4387,8 +4514,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
orders = thp_swap_suitable_orders(swp_offset(entry),
vmf->address, orders);
@@ -4532,9 +4659,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!si))
goto out;
- folio = swap_cache_get_folio(entry, vma, vmf->address);
+ folio = swap_cache_get_folio(entry);
if (folio)
- page = folio_file_page(folio, swp_offset(entry));
+ swap_update_readahead(folio, vma, vmf->address);
swapcache = folio;
if (!folio) {
@@ -4571,7 +4698,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
memcg1_swapin(entry, nr_pages);
- shadow = get_shadow_from_swap_cache(entry);
+ shadow = swap_cache_get_shadow(entry);
if (shadow)
workingset_refault(folio, shadow);
@@ -4605,20 +4732,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
- page = folio_file_page(folio, swp_offset(entry));
- } else if (PageHWPoison(page)) {
- /*
- * hwpoisoned dirty swapcache pages are kept for killing
- * owner processes (which may be unknown at hwpoison time)
- */
- ret = VM_FAULT_HWPOISON;
- goto out_release;
}
ret |= folio_lock_or_retry(folio, vmf);
if (ret & VM_FAULT_RETRY)
goto out_release;
+ page = folio_file_page(folio, swp_offset(entry));
if (swapcache) {
/*
* Make sure folio_free_swap() or swapoff did not release the
@@ -4627,10 +4747,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* swapcache, we need to check that the page's swap has not
* changed.
*/
- if (unlikely(!folio_test_swapcache(folio) ||
- page_swap_entry(page).val != entry.val))
+ if (unlikely(!folio_matches_swap_entry(folio, entry)))
goto out_page;
+ if (unlikely(PageHWPoison(page))) {
+ /*
+ * hwpoisoned dirty swapcache pages are kept for killing
+ * owner processes (which may be unknown at hwpoison time)
+ */
+ ret = VM_FAULT_HWPOISON;
+ goto out_page;
+ }
+
/*
* KSM sometimes has to copy on read faults, for example, if
* folio->index of non-ksm folios would be nonlinear inside the
@@ -4935,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
* for this vma. Then filter out the orders that can't be allocated over
* the faulting address and still be fully contained in the vma.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
if (!orders)
@@ -5204,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
* It is too late to allocate a small folio, we already have a large
* folio in the pagecache: especially s390 KVM cannot tolerate any
* PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
- * PMD mappings if THPs are disabled.
+ * PMD mappings if THPs are disabled. As we already have a THP,
+ * behave as if we are forcing a collapse.
*/
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
+ /* forced_collapse= */ true))
return ret;
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
@@ -5386,13 +5516,8 @@ fallback:
nr_pages = folio_nr_pages(folio);
- /*
- * Using per-page fault to maintain the uffd semantics, and same
- * approach also applies to non shmem/tmpfs faults to avoid
- * inflating the RSS of the process.
- */
- if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
- unlikely(needs_fallback)) {
+ /* Using per-page fault to maintain the uffd semantics */
+ if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -6126,8 +6251,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -6161,8 +6285,7 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1f15af712bc3..e9f14de4a9c9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -375,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn)
* the section may be 'offline' but 'valid'. Only
* get_dev_pagemap() can determine sub-section online status.
*/
- pgmap = get_dev_pagemap(pfn, NULL);
+ pgmap = get_dev_pagemap(pfn);
put_dev_pagemap(pgmap);
/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
@@ -955,7 +955,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
* effectively unused by the kernel, yet they account to "present pages".
* Fortunately, these allocations are comparatively small in relevant setups
* (e.g., fraction of system memory).
- * b) Some hotplugged memory blocks in virtualized environments, esecially
+ * b) Some hotplugged memory blocks in virtualized environments, especially
* hotplugged by virtio-mem, look like they are completely present, however,
* only parts of the memory block are actually currently usable.
* "present pages" is an upper limit that can get reached at runtime. As
@@ -1815,8 +1815,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
if (folio_contain_hwpoisoned_page(folio)) {
- if (WARN_ON(folio_test_lru(folio)))
- folio_isolate_lru(folio);
+ /*
+ * unmap_poisoned_folio() cannot handle large folios
+ * in all cases yet.
+ */
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ goto put_folio;
+ if (folio_test_lru(folio) && !folio_isolate_lru(folio))
+ goto put_folio;
if (folio_mapped(folio)) {
folio_lock(folio);
unmap_poisoned_folio(folio, pfn, false);
diff --git a/mm/memremap.c b/mm/memremap.c
index b0ce0d8254bd..46cb1b0b6f72 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -153,14 +153,14 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
"altmap not supported for multiple ranges\n"))
return -EINVAL;
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start));
if (conflict_pgmap) {
WARN(1, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
return -ENOMEM;
}
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end));
if (conflict_pgmap) {
WARN(1, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
@@ -275,6 +275,9 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
return ERR_PTR(-EINVAL);
+ if (WARN_ONCE(pgmap->vmemmap_shift > MAX_FOLIO_ORDER,
+ "requested folio size unsupported\n"))
+ return ERR_PTR(-EINVAL);
switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
@@ -394,26 +397,12 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages);
/**
* get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
* @pfn: page frame number to lookup page_map
- * @pgmap: optional known pgmap that already has a reference
- *
- * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
- * is non-NULL but does not cover @pfn the reference to it will be released.
*/
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap)
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn)
{
+ struct dev_pagemap *pgmap;
resource_size_t phys = PFN_PHYS(pfn);
- /*
- * In the cached case we're already holding a live reference.
- */
- if (pgmap) {
- if (phys >= pgmap->range.start && phys <= pgmap->range.end)
- return pgmap;
- put_dev_pagemap(pgmap);
- }
-
- /* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))
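
With the cached-pgmap fast path removed, every lookup now takes and drops its own reference; a minimal usage sketch against the new single-argument signature (the helper is illustrative):

/* Sketch: check whether @pfn belongs to a device-private pagemap. */
static bool example_pfn_is_device_private(unsigned long pfn)
{
	struct dev_pagemap *pgmap = get_dev_pagemap(pfn);
	bool ret;

	if (!pgmap)
		return false;
	ret = pgmap->type == MEMORY_DEVICE_PRIVATE;
	put_dev_pagemap(pgmap);	/* drop the live reference taken above */
	return ret;
}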
diff --git a/mm/migrate.c b/mm/migrate.c
index 9e5ef39ce73a..aee61a980374 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -231,18 +231,17 @@ static void putback_movable_ops_page(struct page *page)
* src and dst are also released by migration core. These pages will not be
* folios in the future, so that must be reworked.
*
- * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error
- * code.
+ * Returns 0 on success, otherwise a negative error code.
*/
static int migrate_movable_ops_page(struct page *dst, struct page *src,
enum migrate_mode mode)
{
- int rc = MIGRATEPAGE_SUCCESS;
+ int rc;
VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
rc = page_movable_ops(src)->migrate_page(dst, src, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (!rc)
ClearPageMovableOpsIsolated(src);
return rc;
}
@@ -564,10 +563,10 @@ static int __folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int expected_count)
{
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ struct swap_cluster_info *ci = NULL;
struct zone *oldzone, *newzone;
int dirty;
long nr = folio_nr_pages(folio);
- long entries, i;
if (!mapping) {
/* Take off deferred split queue while frozen and memcg set */
@@ -587,15 +586,22 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
oldzone = folio_zone(folio);
newzone = folio_zone(newfolio);
- xas_lock_irq(&xas);
+ if (folio_test_swapcache(folio))
+ ci = swap_cluster_get_and_lock_irq(folio);
+ else
+ xas_lock_irq(&xas);
+
if (!folio_ref_freeze(folio, expected_count)) {
- xas_unlock_irq(&xas);
+ if (ci)
+ swap_cluster_unlock_irq(ci);
+ else
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -616,9 +622,6 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapcache(folio)) {
folio_set_swapcache(newfolio);
newfolio->private = folio_get_private(folio);
- entries = nr;
- } else {
- entries = 1;
}
/* Move dirty while folio refs frozen and newfolio not yet exposed */
@@ -628,11 +631,10 @@ static int __folio_migrate_mapping(struct address_space *mapping,
folio_set_dirty(newfolio);
}
- /* Swap cache still stores N entries instead of a high-order entry */
- for (i = 0; i < entries; i++) {
+ if (folio_test_swapcache(folio))
+ __swap_cache_replace_folio(ci, folio, newfolio);
+ else
xas_store(&xas, newfolio);
- xas_next(&xas);
- }
/*
* Drop cache reference from old folio by unfreezing
@@ -641,8 +643,11 @@ static int __folio_migrate_mapping(struct address_space *mapping,
*/
folio_ref_unfreeze(folio, expected_count - nr);
- xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
+ if (ci)
+ swap_cluster_unlock(ci);
+ else
+ xas_unlock(&xas);
/*
* If moved to a different zone then also account
@@ -688,7 +693,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
local_irq_enable();
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
int folio_migrate_mapping(struct address_space *mapping,
@@ -737,7 +742,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
xas_unlock_irq(&xas);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/*
@@ -853,14 +858,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
return rc;
rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
return rc;
if (src_private)
folio_attach_private(dst, folio_detach_private(src));
folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/**
@@ -967,7 +972,7 @@ recheck_buffers:
}
rc = filemap_migrate_folio(mapping, dst, src, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
goto unlock_buffers;
bh = head;
@@ -1071,7 +1076,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
*
* Return value:
* < 0 - error code
- * MIGRATEPAGE_SUCCESS - success
+ * 0 - success
*/
static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
@@ -1099,7 +1104,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
else
rc = fallback_migrate_folio(mapping, dst, src, mode);
- if (rc == MIGRATEPAGE_SUCCESS) {
+ if (!rc) {
/*
* For pagecache folios, src->mapping must be cleared before src
* is freed. Anonymous folios must stay anonymous until freed.
@@ -1189,7 +1194,7 @@ static void migrate_folio_done(struct folio *src,
static int migrate_folio_unmap(new_folio_t get_new_folio,
free_folio_t put_new_folio, unsigned long private,
struct folio *src, struct folio **dstp, enum migrate_mode mode,
- enum migrate_reason reason, struct list_head *ret)
+ struct list_head *ret)
{
struct folio *dst;
int rc = -EAGAIN;
@@ -1198,16 +1203,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
bool locked = false;
bool dst_locked = false;
- if (folio_ref_count(src) == 1) {
- /* Folio was freed from under us. So we are done. */
- folio_clear_active(src);
- folio_clear_unevictable(src);
- /* free_pages_prepare() will clear PG_isolated. */
- list_del(&src->lru);
- migrate_folio_done(src, reason);
- return MIGRATEPAGE_SUCCESS;
- }
-
dst = get_new_folio(src, private);
if (!dst)
return -ENOMEM;
@@ -1297,7 +1292,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (unlikely(page_has_movable_ops(&src->page))) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
/*
@@ -1327,7 +1322,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (!folio_mapped(src)) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
out:
@@ -1459,7 +1454,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
folio_putback_hugetlb(src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
dst = get_new_folio(src, private);
@@ -1522,8 +1517,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
- remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
+ remove_migration_ptes(src, !rc ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
@@ -1532,7 +1526,7 @@ put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
- if (rc == MIGRATEPAGE_SUCCESS) {
+ if (!rc) {
move_hugetlb_state(src, dst, reason);
put_new_folio = NULL;
}
@@ -1540,7 +1534,7 @@ put_anon:
out_unlock:
folio_unlock(src);
out:
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (!rc)
folio_putback_hugetlb(src);
else if (rc != -EAGAIN)
list_move_tail(&src->lru, ret);
@@ -1650,7 +1644,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
reason, ret_folios);
/*
* The rules are:
- * Success: hugetlb folio will be put back
+ * 0: hugetlb folio will be put back
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1667,7 +1661,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
retry++;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
break;
default:
@@ -1721,7 +1715,7 @@ static void migrate_folios_move(struct list_head *src_folios,
reason, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
+ * 0: folio will be freed
* -EAGAIN: stay on the unmap_folios list
* Other errno: put on ret_folios list
*/
@@ -1731,7 +1725,7 @@ static void migrate_folios_move(struct list_head *src_folios,
*thp_retry += is_thp;
*nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
stats->nr_thp_succeeded += is_thp;
break;
@@ -1870,14 +1864,27 @@ static int migrate_pages_batch(struct list_head *from,
continue;
}
+ /*
+ * If we are holding the last folio reference, the folio
+ * was freed from under us, so just drop our reference.
+ */
+ if (likely(!page_has_movable_ops(&folio->page)) &&
+ folio_ref_count(folio) == 1) {
+ folio_clear_active(folio);
+ folio_clear_unevictable(folio);
+ list_del(&folio->lru);
+ migrate_folio_done(folio, reason);
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ continue;
+ }
+
rc = migrate_folio_unmap(get_new_folio, put_new_folio,
- private, folio, &dst, mode, reason,
- ret_folios);
+ private, folio, &dst, mode, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
- * Unmap: folio will be put on unmap_folios list,
- * dst folio put on dst_folios list
+ * 0: folio will be put on unmap_folios list,
+ * dst folio put on dst_folios list
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1927,11 +1934,7 @@ static int migrate_pages_batch(struct list_head *from,
thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
- stats->nr_succeeded += nr_pages;
- stats->nr_thp_succeeded += is_thp;
- break;
- case MIGRATEPAGE_UNMAP:
+ case 0:
list_move_tail(&folio->lru, &unmap_folios);
list_add_tail(&dst->lru, &dst_folios);
break;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index e05e14d6eacd..abd9f6850db6 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (migrate && migrate->fault_page == page)
extra_cnt = 1;
r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
- if (r != MIGRATEPAGE_SUCCESS)
+ if (r)
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
else
folio_migrate_flags(newfolio, folio);
diff --git a/mm/mincore.c b/mm/mincore.c
index 10dabefc3acc..8ec4719370e1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -47,6 +47,47 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
return 0;
}
+static unsigned char mincore_swap(swp_entry_t entry, bool shmem)
+{
+ struct swap_info_struct *si;
+ struct folio *folio = NULL;
+ unsigned char present = 0;
+
+ if (!IS_ENABLED(CONFIG_SWAP)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+	 * A shmem mapping may contain swapin error entries, which are
+	 * treated as absent. The page table may contain migration or
+	 * hwpoison entries, which are always uptodate.
+ */
+ if (non_swap_entry(entry))
+ return !shmem;
+
+ /*
+	 * Shmem mapping lookups are lockless, so we need to grab the swap
+	 * device. The mincore page table walk holds the PTL, so the swap
+	 * device is stable; avoid touching the si for better performance.
+ */
+ if (shmem) {
+ si = get_swap_device(entry);
+ if (!si)
+ return 0;
+ }
+ folio = swap_cache_get_folio(entry);
+ if (shmem)
+ put_swap_device(si);
+	/* The swap cache space contains either a folio, a shadow entry or NULL */
+ if (folio && !xa_is_value(folio)) {
+ present = folio_test_uptodate(folio);
+ folio_put(folio);
+ }
+
+ return present;
+}
+
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
@@ -64,8 +105,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- folio = filemap_get_incore_folio(mapping, index);
- if (!IS_ERR(folio)) {
+ folio = filemap_get_entry(mapping, index);
+ if (folio) {
+ if (xa_is_value(folio)) {
+ if (shmem_mapping(mapping))
+ return mincore_swap(radix_to_swp_entry(folio),
+ true);
+ else
+ return 0;
+ }
present = folio_test_uptodate(folio);
folio_put(folio);
}
@@ -143,23 +191,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (i = 0; i < step; i++)
vec[i] = 1;
} else { /* pte is a swap entry */
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (non_swap_entry(entry)) {
- /*
- * migration or hwpoison entries are always
- * uptodate
- */
- *vec = 1;
- } else {
-#ifdef CONFIG_SWAP
- *vec = mincore_page(swap_address_space(entry),
- swap_cache_index(entry));
-#else
- WARN_ON(1);
- *vec = 1;
-#endif
- }
+ *vec = mincore_swap(pte_to_swp_entry(pte), false);
}
vec += step;
}
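
For context on what the reworked mincore_swap() path ultimately reports, here is a small userspace sketch (not part of this patch) that exercises mincore(2); bit 0 of each vec entry is the residency bit the kernel fills in:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t psize = (size_t)sysconf(_SC_PAGESIZE);
	size_t npages = 16;
	size_t len = npages * psize;
	unsigned char *vec = malloc(npages);
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (!vec || buf == MAP_FAILED)
		return 1;
	memset(buf, 1, len);			/* fault every page in */
	if (mincore(buf, len, vec) == 0) {
		size_t resident = 0;

		for (size_t i = 0; i < npages; i++)
			resident += vec[i] & 1;	/* bit 0: page is resident */
		printf("%zu of %zu pages resident\n", resident, npages);
	}
	munmap(buf, len);
	free(vec);
	return 0;
}
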
diff --git a/mm/mlock.c b/mm/mlock.c
index a1d93ad33c6d..bb0776f5ef7c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_new(folio)) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio)
*/
folio_get(folio);
if (!folio_batch_add(fbatch, folio) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5c21b3af216b..df614556741a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1091,6 +1091,12 @@ static void __ref memmap_init_compound(struct page *head,
unsigned long pfn, end_pfn = head_pfn + nr_pages;
unsigned int order = pgmap->vmemmap_shift;
+ /*
+ * We have to initialize the pages, including setting up page links.
+	 * prep_compound_page() does not take care of that, so we open-code
+	 * prep_compound_page() here and take care of initializing the pages
+	 * in the same pass.
+ */
__SetPageHead(head);
for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -1098,15 +1104,8 @@ static void __ref memmap_init_compound(struct page *head,
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
prep_compound_tail(head, pfn - head_pfn);
set_page_count(page, 0);
-
- /*
- * The first tail page stores important compound page info.
- * Call prep_compound_head() after the first tail page has
- * been initialized, to not have the data overwritten.
- */
- if (pfn == head_pfn + 1)
- prep_compound_head(head, order);
}
+ prep_compound_head(head, order);
}
void __ref memmap_init_zone_device(struct zone *zone,
diff --git a/mm/mmap.c b/mm/mmap.c
index 7306253cc3b5..5fd3b80fda1d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -225,7 +225,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
+bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
unsigned long bytes)
{
unsigned long locked_pages, limit_pages;
@@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
unsigned long pgoff, unsigned long flags,
vm_flags_t vm_flags)
{
- if (test_bit(MMF_TOPDOWN, &mm->flags))
+ if (mm_flags_test(MMF_TOPDOWN, mm))
return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
flags, vm_flags);
return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
@@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm)
* Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
* because the memory has been already freed.
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
vma_iter_set(&vmi, vma->vm_end);
@@ -1859,14 +1859,14 @@ loop_out:
mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
mas_store(&vmi.mas, XA_ZERO_ENTRY);
/* Avoid OOM iterating a broken tree */
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
}
/*
* The mm_struct is going to exit, but the locks will be dropped
* first. Set the mm_struct as unstable is advisable as it is
* not fully initialised.
*/
- set_bit(MMF_UNSTABLE, &mm->flags);
+ mm_flags_set(MMF_UNSTABLE, mm);
}
out:
mmap_write_unlock(mm);
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index b006cec8e6fe..0a0db5849b8e 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -128,6 +128,95 @@ void vma_mark_detached(struct vm_area_struct *vma)
}
/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield a
+ * false locked result to avoid performance overhead, in which case we fall
+ * back to using mmap_lock. The function should never yield a false unlocked
+ * result. A false locked result is possible if mm_lock_seq overflows or if
+ * the vma gets reused and attached to a different mm before we lock it.
+ * Returns the vma on success, NULL on failure to lock, and EAGAIN if the vma
+ * got detached.
+ *
+ * IMPORTANT: RCU lock must be held upon entering the function, but upon error
+ * IT IS RELEASED. The caller must handle this correctly.
+ */
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ struct mm_struct *other_mm;
+ int oldcnt;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
+ /*
+ * Check before locking. A race might cause false locked result.
+ * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+ * ACQUIRE semantics, because this is just a lockless check whose result
+ * we don't rely on for anything - the mm_lock_seq read against which we
+ * need ordering is below.
+ */
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
+ vma = NULL;
+ goto err;
+ }
+
+ /*
+ * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+ * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+ * Acquire fence is required here to avoid reordering against later
+ * vm_lock_seq check and checks inside lock_vma_under_rcu().
+ */
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT))) {
+ /* return EAGAIN if vma got detached from under us */
+ vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
+ goto err;
+ }
+
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+
+ if (unlikely(vma->vm_mm != mm))
+ goto err_unstable;
+
+ /*
+ * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ *
+ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+ * racing with vma_end_write_all(), we only start reading from the VMA
+ * after it has been unlocked.
+ * This pairs with RELEASE semantics in vma_end_write_all().
+ */
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+ vma_refcount_put(vma);
+ vma = NULL;
+ goto err;
+ }
+
+ return vma;
+err:
+ rcu_read_unlock();
+
+ return vma;
+err_unstable:
+ /*
+ * If vma got attached to another mm from under us, that mm is not
+ * stable and can be freed in the narrow window after vma->vm_refcnt
+ * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
+ * releasing vma->vm_refcnt.
+ */
+ other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */
+
+ /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
+ rcu_read_unlock();
+ mmgrab(other_mm);
+ vma_refcount_put(vma);
+ mmdrop(other_mm);
+
+ return NULL;
+}
+
+/*
* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
* stable and not isolated. If the VMA is not found or is being modified the
* function returns NULL.
@@ -138,11 +227,13 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
MA_STATE(mas, &mm->mm_mt, address, address);
struct vm_area_struct *vma;
- rcu_read_lock();
retry:
+ rcu_read_lock();
vma = mas_walk(&mas);
- if (!vma)
+ if (!vma) {
+ rcu_read_unlock();
goto inval;
+ }
vma = vma_start_read(mm, vma);
if (IS_ERR_OR_NULL(vma)) {
@@ -162,18 +253,17 @@ retry:
* From here on, we can access the VMA without worrying about which
* fields are accessible for RCU readers.
*/
+ rcu_read_unlock();
/* Check if the vma we locked is the right one. */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+ vma_end_read(vma);
+ goto inval;
+ }
- rcu_read_unlock();
return vma;
-inval_end_read:
- vma_end_read(vma);
inval:
- rcu_read_unlock();
count_vm_vma_lock_event(VMA_LOCK_ABORT);
return NULL;
}
@@ -228,6 +318,7 @@ retry:
*/
if (PTR_ERR(vma) == -EAGAIN) {
/* reset to search from the last address */
+ rcu_read_lock();
vma_iter_set(vmi, from_addr);
goto retry;
}
@@ -257,9 +348,9 @@ retry:
return vma;
fallback_unlock:
+ rcu_read_unlock();
vma_end_read(vma);
fallback:
- rcu_read_unlock();
vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
rcu_read_lock();
/* Reinitialize the iterator after re-entering rcu read section */
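
As a hedged illustration of the locking contract described in the vma_start_read() comment above, here is a simplified sketch of the fault-path caller pattern that lock_vma_under_rcu() serves; the helper name and exact flow are illustrative, not taken from this patch:

/*
 * Simplified sketch, not from this patch: try the per-VMA lock first and fall
 * back to the mmap_lock path on failure. Real callers live in the arch fault
 * handlers and carry additional access checks.
 */
static vm_fault_t fault_with_vma_lock(struct mm_struct *mm, unsigned long address,
				      unsigned int flags, struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		return VM_FAULT_RETRY;	/* caller falls back to mmap_read_lock() */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);	/* released internally on retry/completed */
	return fault;
}
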
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index b49cc6385f1f..374aa6f021c6 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return false;
- batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ batch = (void *)__get_free_page(GFP_NOWAIT);
if (!batch)
return false;
@@ -364,7 +364,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
struct mmu_table_batch **batch = &tlb->batch;
if (*batch == NULL) {
- *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
if (*batch == NULL) {
tlb_table_invalidate(tlb);
tlb_remove_table_one(table);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f9baa8882fbf..0c8f181d9d50 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
unsigned long old_flags, flags;
int last_cpupid;
- old_flags = READ_ONCE(folio->flags);
+ old_flags = READ_ONCE(folio->flags.f);
do {
flags = old_flags;
last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags)));
return last_cpupid;
}
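
The folio->flags.f accesses introduced above come from wrapping the bare flags word in a one-member struct so that raw accesses become explicit; a rough sketch of the idea, where the type name and helper are assumptions rather than code from this series:

/* Sketch only: wrapping the flags word makes every raw access grep-able. */
typedef struct {
	unsigned long f;
} memdesc_flags_t;	/* assumed name; see the .f accesses in the hunks above */

static inline bool sketch_flag_set(const memdesc_flags_t *flags, int nr)
{
	return test_bit(nr, &flags->f);	/* callers now spell out the .f member */
}
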
diff --git a/mm/mremap.c b/mm/mremap.c
index e618a706aff5..35de0a7b910e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1774,15 +1774,18 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
if (!vrm->new_len)
return -EINVAL;
- /* Is the new length or address silly? */
- if (vrm->new_len > TASK_SIZE ||
- vrm->new_addr > TASK_SIZE - vrm->new_len)
+ /* Is the new length silly? */
+ if (vrm->new_len > TASK_SIZE)
return -EINVAL;
/* Remainder of checks are for cases with specific new_addr. */
if (!vrm_implies_new_addr(vrm))
return 0;
+ /* Is the new address silly? */
+ if (vrm->new_addr > TASK_SIZE - vrm->new_len)
+ return -EINVAL;
+
/* The new address must be page-aligned. */
if (offset_in_page(vrm->new_addr))
return -EINVAL;
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b819fafd57b..c3a23b082adb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -64,7 +64,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
*/
unsigned int kobjsize(const void *objp)
{
- struct page *page;
+ struct folio *folio;
/*
* If the object we have should not have ksize performed on it,
@@ -73,22 +73,22 @@ unsigned int kobjsize(const void *objp)
if (!objp || !virt_addr_valid(objp))
return 0;
- page = virt_to_head_page(objp);
+ folio = virt_to_folio(objp);
/*
* If the allocator sets PageSlab, we know the pointer came from
* kmalloc().
*/
- if (PageSlab(page))
+ if (folio_test_slab(folio))
return ksize(objp);
/*
- * If it's not a compound page, see if we have a matching VMA
+ * If it's not a large folio, see if we have a matching VMA
* region. This test is intentionally done in reverse order,
* so if there's no VMA, we still fall through and hand back
- * PAGE_SIZE for 0-order pages.
+ * PAGE_SIZE for 0-order folios.
*/
- if (!PageCompound(page)) {
+ if (!folio_test_large(folio)) {
struct vm_area_struct *vma;
vma = find_vma(current->mm, (unsigned long)objp);
@@ -100,7 +100,7 @@ unsigned int kobjsize(const void *objp)
* The ksize() function is only guaranteed to work for pointers
* returned by kmalloc(). So handle arbitrary pointers here.
*/
- return page_size(page);
+ return folio_size(folio);
}
void vfree(const void *addr)
@@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
}
EXPORT_SYMBOL(__vmalloc_noprof);
-void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int node)
{
return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM);
}
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
index 9d55679d99ce..703c8fa05048 100644
--- a/mm/numa_emulation.c
+++ b/mm/numa_emulation.c
@@ -73,7 +73,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
}
printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
- nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) / SZ_1M);
return 0;
}
@@ -264,7 +264,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
- size >> 20, min_size >> 20);
+ size / SZ_1M, min_size / SZ_1M);
size = min_size;
}
size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index 541a99c4071a..5b009a9cd8b4 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -76,7 +76,7 @@ static int __init numa_alloc_distance(void)
for (j = 0; j < cnt; j++)
numa_distance[i * cnt + j] = i == j ?
LOCAL_DISTANCE : REMOTE_DISTANCE;
- printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+ pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt);
return 0;
}
@@ -427,9 +427,9 @@ static int __init numa_register_meminfo(struct numa_meminfo *mi)
unsigned long pfn_align = node_map_pfn_alignment();
if (pfn_align && pfn_align < PAGES_PER_SECTION) {
- unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;
+ unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M;
- unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;
+ unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M;
pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
node_align_mb, sect_align_mb);
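
The '>> 20' to '/ SZ_1M' conversions above are purely cosmetic; a minimal illustration (SZ_1M is defined as 1 << 20 in include/linux/sizes.h):

#include <linux/sizes.h>

/* Both forms compute the same MiB value; the division reads more clearly. */
static unsigned long bytes_to_mib(unsigned long bytes)
{
	return bytes / SZ_1M;		/* previously spelled as bytes >> 20 */
}
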
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 25923cfec9c6..c145b0feecc1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
- *
+ *
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
@@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
*/
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN ||
- test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
+ mm_flags_test(MMF_OOM_SKIP, p->mm) ||
in_vfork(p)) {
task_unlock(p);
return LONG_MIN;
@@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
* any memory is quite low.
*/
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
- if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+ if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm))
goto next;
goto abort;
}
@@ -490,12 +490,12 @@ static bool oom_killer_disabled __read_mostly;
* task's threads: if one of those is using this mm then this task was also
* using it.
*/
-bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm)
{
- struct task_struct *t;
+ const struct task_struct *t;
for_each_thread(p, t) {
- struct mm_struct *t_mm = READ_ONCE(t->mm);
+ const struct mm_struct *t_mm = READ_ONCE(t->mm);
if (t_mm)
return t_mm == mm;
}
@@ -516,7 +516,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
bool ret = true;
- VMA_ITERATOR(vmi, mm, 0);
+ MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -524,9 +524,15 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
* should imply barriers already and the reader would hit a page fault
* if it stumbled over a reaped memory.
*/
- set_bit(MMF_UNSTABLE, &mm->flags);
+ mm_flags_set(MMF_UNSTABLE, mm);
- for_each_vma(vmi, vma) {
+ /*
+	 * The OOM reaper might start racing with the dying task and compete for
+	 * shared resources - e.g. page table lock contention has been observed.
+ * Reduce those races by reaping the oom victim from the other end
+ * of the address space.
+ */
+ mas_for_each_rev(&mas, vma, 0) {
if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
continue;
@@ -583,7 +589,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* under mmap_lock for reading because it serializes against the
* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_flags_test(MMF_OOM_SKIP, mm)) {
trace_skip_task_reaping(tsk->pid);
goto out_unlock;
}
@@ -619,7 +625,7 @@ static void oom_reap_task(struct task_struct *tsk)
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES ||
- test_bit(MMF_OOM_SKIP, &mm->flags))
+ mm_flags_test(MMF_OOM_SKIP, mm))
goto done;
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
@@ -634,7 +640,7 @@ done:
* Hide this mm from OOM killer because it has been either reaped or
* somebody can't call mmap_write_unlock(mm).
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
/* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
@@ -670,7 +676,7 @@ static void wake_oom_reaper(struct timer_list *timer)
unsigned long flags;
/* The victim managed to terminate on its own - see exit_mmap */
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_flags_test(MMF_OOM_SKIP, mm)) {
put_task_struct(tsk);
return;
}
@@ -695,7 +701,7 @@ static void wake_oom_reaper(struct timer_list *timer)
static void queue_oom_reaper(struct task_struct *tsk)
{
/* mm is already queued? */
- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm))
return;
get_task_struct(tsk);
@@ -772,12 +778,12 @@ static void mark_oom_victim(struct task_struct *tsk)
mmgrab(tsk->signal->oom_mm);
/*
- * Make sure that the task is woken up from uninterruptible sleep
- * if it is frozen because OOM killer wouldn't be able to free
- * any memory and livelock. freezing_slow_path will tell the freezer
- * that TIF_MEMDIE tasks should be ignored.
+	 * Make sure that the process is woken up from uninterruptible sleep
+	 * if it is frozen, because otherwise the OOM killer wouldn't be able
+	 * to free any memory and would livelock. The freezer will thaw tasks
+	 * that are OOM victims regardless of the PM and cgroup freezing states.
*/
- __thaw_task(tsk);
+ thaw_process(tsk);
atomic_inc(&oom_victims);
cred = get_task_cred(tsk);
trace_mark_victim(tsk, cred->uid.val);
@@ -892,7 +898,7 @@ static bool task_will_free_mem(struct task_struct *task)
* This task has already been drained by the oom reaper so there are
* only small chances it will free some more
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (mm_flags_test(MMF_OOM_SKIP, mm))
return false;
if (atomic_read(&mm->mm_users) <= 1)
@@ -977,7 +983,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
if (is_global_init(p)) {
can_oom_reap = false;
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
task_pid_nr(victim), victim->comm,
task_pid_nr(p), p->comm);
@@ -1235,7 +1241,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
reap = true;
else {
/* Error only if the work has not been done already */
- if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm))
ret = -EINVAL;
}
task_unlock(p);
@@ -1251,7 +1257,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
* possible change in exit_mmap is seen
*/
- if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm))
ret = -EAGAIN;
mmap_read_unlock(mm);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3e248d1c3969..5f90fd6a7137 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -38,10 +38,10 @@
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
+#include <linux/shmem_fs.h>
#include <trace/events/writeback.h>
#include "internal.h"
-#include "swap.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
@@ -2590,36 +2590,6 @@ done:
}
EXPORT_SYMBOL_GPL(writeback_iter);
-/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
- *
- * Return: %0 on success, negative error code otherwise
- *
- * Note: please use writeback_iter() instead.
- */
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data)
-{
- struct folio *folio = NULL;
- int error;
-
- while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
- error = writepage(folio, wbc, data);
- if (error == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- error = 0;
- }
- }
-
- return error;
-}
-EXPORT_SYMBOL(write_cache_pages);
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2735,12 +2705,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
{
unsigned long flags;
+ /*
+ * Shmem writeback relies on swap, and swap writeback is LRU based,
+ * not using the dirty mark.
+ */
+ VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping));
+
xa_lock_irqsave(&mapping->i_pages, flags);
if (folio->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
folio_account_dirtied(folio, mapping);
- __xa_set_mark(&mapping->i_pages, folio_index(folio),
- PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -3019,7 +2995,7 @@ bool __folio_end_writeback(struct folio *folio)
xa_lock_irqsave(&mapping->i_pages, flags);
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
- __xa_clear_mark(&mapping->i_pages, folio_index(folio),
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_WRITEBACK);
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
@@ -3056,7 +3032,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
- XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ XA_STATE(xas, &mapping->i_pages, folio->index);
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d1d037f97c5f..600d9e981c23 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
{
- return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS;
+ return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
}
static __always_inline void
@@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
#else
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
#endif
- BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits));
+ BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
bitmap = get_pageblock_bitmap(page, pfn);
@@ -538,8 +538,7 @@ static void set_pageblock_migratetype(struct page *page,
"Use set_pageblock_isolate() for pageblock isolation");
return;
}
- VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
- PB_migrate_isolate),
+ VM_WARN_ONCE(get_pageblock_isolate(page),
"Use clear_pageblock_isolate() to unisolate pageblock");
/* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
#endif
@@ -797,7 +796,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
- else if (is_migrate_highatomic(migratetype))
+ else if (migratetype == MIGRATE_HIGHATOMIC)
WRITE_ONCE(zone->nr_free_highatomic,
zone->nr_free_highatomic + nr_pages);
}
@@ -950,7 +949,7 @@ static inline void __free_one_page(struct page *page,
bool to_tail;
VM_BUG_ON(!zone_is_initialized(zone));
- VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
+ VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
@@ -1043,7 +1042,7 @@ static inline bool page_expected_state(struct page *page,
page->memcg_data |
#endif
page_pool_page_is_pp(page) |
- (page->flags & check_flags)))
+ (page->flags.f & check_flags)))
return false;
return true;
@@ -1059,7 +1058,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & flags)) {
+ if (unlikely(page->flags.f & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
@@ -1358,7 +1357,7 @@ __always_inline bool free_pages_prepare(struct page *page,
int i;
if (compound) {
- page[1].flags &= ~PAGE_FLAGS_SECOND;
+ page[1].flags.f &= ~PAGE_FLAGS_SECOND;
#ifdef NR_PAGES_IN_LARGE_FOLIO
folio->_nr_pages = 0;
#endif
@@ -1372,7 +1371,7 @@ __always_inline bool free_pages_prepare(struct page *page,
continue;
}
}
- (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (folio_test_anon(folio)) {
@@ -1391,7 +1390,7 @@ __always_inline bool free_pages_prepare(struct page *page,
}
page_cpupid_reset_last(page);
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
@@ -1521,7 +1520,7 @@ static void add_page_to_zone_llist(struct zone *zone, struct page *page,
unsigned int order)
{
/* Remember the order */
- page->order = order;
+ page->private = order;
/* Add the page to the free list */
llist_add(&page->pcp_llist, &zone->trylock_free_pages);
}
@@ -1550,7 +1549,7 @@ static void free_one_page(struct zone *zone, struct page *page,
llnode = llist_del_all(llhead);
llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
- unsigned int p_order = p->order;
+ unsigned int p_order = p->private;
split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
__count_vm_events(PGFREE, 1 << p_order);
@@ -2034,7 +2033,13 @@ static int move_freepages_block(struct zone *zone, struct page *page,
/* Look for a buddy that straddles start_pfn */
static unsigned long find_large_buddy(unsigned long start_pfn)
{
- int order = 0;
+ /*
+	 * If start_pfn is not an order-0 PageBuddy, the next PageBuddy containing
+	 * start_pfn has a minimal order of __ffs(start_pfn) + 1. Start checking
+	 * the order with __ffs(start_pfn). If start_pfn is an order-0 PageBuddy,
+	 * the starting order does not matter.
+ */
+ int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;
struct page *page;
unsigned long pfn = start_pfn;
@@ -2058,9 +2063,9 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
{
if (isolate)
- set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ set_pageblock_isolate(page);
else
- clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ clear_pageblock_isolate(page);
}
/**
@@ -2085,9 +2090,10 @@ static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
static bool __move_freepages_block_isolate(struct zone *zone,
struct page *page, bool isolate)
{
- unsigned long start_pfn, pfn;
+ unsigned long start_pfn, buddy_pfn;
int from_mt;
int to_mt;
+ struct page *buddy;
if (isolate == get_pageblock_isolate(page)) {
VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
@@ -2102,29 +2108,19 @@ static bool __move_freepages_block_isolate(struct zone *zone,
if (pageblock_order == MAX_PAGE_ORDER)
goto move;
- /* We're a tail block in a larger buddy */
- pfn = find_large_buddy(start_pfn);
- if (pfn != start_pfn) {
- struct page *buddy = pfn_to_page(pfn);
+ buddy_pfn = find_large_buddy(start_pfn);
+ buddy = pfn_to_page(buddy_pfn);
+ /* We're a part of a larger buddy */
+ if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) {
int order = buddy_order(buddy);
del_page_from_free_list(buddy, zone, order,
- get_pfnblock_migratetype(buddy, pfn));
+ get_pfnblock_migratetype(buddy, buddy_pfn));
toggle_pageblock_isolate(page, isolate);
- split_large_buddy(zone, buddy, pfn, order, FPI_NONE);
+ split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE);
return true;
}
- /* We're the starting block of a larger buddy */
- if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
- int order = buddy_order(page);
-
- del_page_from_free_list(page, zone, order,
- get_pfnblock_migratetype(page, pfn));
- toggle_pageblock_isolate(page, isolate);
- split_large_buddy(zone, page, pfn, order, FPI_NONE);
- return true;
- }
move:
/* Use MIGRATETYPE_MASK to get non-isolate migratetype */
if (isolate) {
@@ -2864,14 +2860,29 @@ static void free_frozen_page_commit(struct zone *zone,
*/
return;
}
+
high = nr_pcp_high(pcp, zone, batch, free_high);
- if (pcp->count >= high) {
- free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
- pcp, pindex);
- if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
- zone_watermark_ok(zone, 0, high_wmark_pages(zone),
- ZONE_MOVABLE, 0))
- clear_bit(ZONE_BELOW_HIGH, &zone->flags);
+ if (pcp->count < high)
+ return;
+
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
+ pcp, pindex);
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
+ zone_watermark_ok(zone, 0, high_wmark_pages(zone),
+ ZONE_MOVABLE, 0)) {
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ clear_bit(ZONE_BELOW_HIGH, &zone->flags);
+
+ /*
+		 * Assume that memory pressure on this node is gone and that the
+		 * node may be in a reclaimable state. If a memory fallback node exists,
+ * direct reclaim may not have been triggered, causing a
+ * 'hopeless node' to stay in that state for a while. Let
+ * kswapd work again by resetting kswapd_failures.
+ */
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+ next_memory_node(pgdat->node_id) < MAX_NUMNODES)
+ atomic_set(&pgdat->kswapd_failures, 0);
}
}
@@ -3724,6 +3735,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
struct pglist_data *last_pgdat = NULL;
bool last_pgdat_dirty_ok = false;
bool no_fallback;
+ bool skip_kswapd_nodes = nr_online_nodes > 1;
+ bool skipped_kswapd_nodes = false;
retry:
/*
@@ -3786,6 +3799,19 @@ retry:
}
}
+ /*
+ * If kswapd is already active on a node, keep looking
+ * for other nodes that might be idle. This can happen
+ * if another process has NUMA bindings and is causing
+ * kswapd wakeups on only some nodes. Avoid accidental
+ * "node_reclaim_mode"-like behavior in this case.
+ */
+ if (skip_kswapd_nodes &&
+ !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
+ skipped_kswapd_nodes = true;
+ continue;
+ }
+
cond_accept_memory(zone, order, alloc_flags);
/*
@@ -3878,6 +3904,15 @@ try_this_zone:
}
/*
+ * If we skipped over nodes with active kswapds and found no
+ * idle nodes, retry and place anywhere the watermarks permit.
+ */
+ if (skip_kswapd_nodes && skipped_kswapd_nodes) {
+ skip_kswapd_nodes = false;
+ goto retry;
+ }
+
+ /*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
@@ -4182,7 +4217,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
static inline bool
-should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
@@ -4408,7 +4443,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
alloc_flags |= ALLOC_NON_BLOCK;
- if (order > 0)
+ if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE))
alloc_flags |= ALLOC_HIGHATOMIC;
}
@@ -5229,9 +5264,16 @@ static void ___free_pages(struct page *page, unsigned int order,
__free_frozen_pages(page, order, fpi_flags);
else if (!head) {
pgalloc_tag_sub_pages(tag, (1 << order) - 1);
- while (order-- > 0)
+ while (order-- > 0) {
+ /*
+ * The "tail" pages of this non-compound high-order
+ * page will have no code tags, so to avoid warnings
+ * mark them as empty.
+ */
+ clear_page_tag_ref(page + (1 << order));
__free_frozen_pages(page + (1 << order), order,
fpi_flags);
+ }
}
}
@@ -5270,6 +5312,15 @@ void free_pages_nolock(struct page *page, unsigned int order)
___free_pages(page, order, FPI_TRYLOCK);
}
+/**
+ * free_pages - Free pages allocated with __get_free_pages().
+ * @addr: The virtual address tied to a page returned from __get_free_pages().
+ * @order: The order of the allocation.
+ *
+ * This function behaves the same as __free_pages(). Use this function
+ * to free pages when you only have a valid virtual address. If you have
+ * the page, call __free_pages() instead.
+ */
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@@ -5946,7 +5997,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
pcp->high_min = BOOT_PAGESET_HIGH;
pcp->high_max = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
- pcp->free_count = 0;
}
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
@@ -6236,16 +6286,13 @@ static void calculate_totalreserve_pages(void)
unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ for (j = i; j < MAX_NR_ZONES; j++)
+ max = max(max, zone->lowmem_reserve[j]);
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > managed_pages)
- max = managed_pages;
+ max = min_t(unsigned long, max, managed_pages);
pgdat->totalreserve_pages += max;
@@ -6837,6 +6884,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
int alloc_contig_range_noprof(unsigned long start, unsigned long end,
acr_flags_t alloc_flags, gfp_t gfp_mask)
{
+ const unsigned int order = ilog2(end - start);
unsigned long outer_start, outer_end;
int ret = 0;
@@ -6854,6 +6902,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
PB_ISOLATE_MODE_CMA_ALLOC :
PB_ISOLATE_MODE_OTHER;
+ /*
+ * In contrast to the buddy, we allow for orders here that exceed
+ * MAX_PAGE_ORDER, so we must manually make sure that we are not
+ * exceeding the maximum folio order.
+ */
+ if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER))
+ return -EINVAL;
+
gfp_mask = current_gfp_context(gfp_mask);
if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
return -EINVAL;
@@ -6951,7 +7007,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
free_contig_range(end, outer_end - end);
} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
struct page *head = pfn_to_page(start);
- int order = ilog2(end - start);
check_new_pages(head, order);
prep_new_page(head, order, gfp_mask, 0);
@@ -7478,22 +7533,7 @@ static bool __free_unaccepted(struct page *page)
#endif /* CONFIG_UNACCEPTED_MEMORY */
-/**
- * alloc_pages_nolock - opportunistic reentrant allocation from any context
- * @nid: node to allocate from
- * @order: allocation order size
- *
- * Allocates pages of a given order from the given node. This is safe to
- * call from any context (from atomic, NMI, and also reentrant
- * allocator -> tracepoint -> alloc_pages_nolock_noprof).
- * Allocation is best effort and to be expected to fail easily so nobody should
- * rely on the success. Failures are not reported via warn_alloc().
- * See always fail conditions below.
- *
- * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
- * It means ENOMEM. There is no reason to call it again and expect !NULL.
- */
-struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
+struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
{
/*
* Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed.
@@ -7515,12 +7555,13 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
* specify it here to highlight that alloc_pages_nolock()
* doesn't want to deplete reserves.
*/
- gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
- | __GFP_ACCOUNT;
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP
+ | gfp_flags;
unsigned int alloc_flags = ALLOC_TRYLOCK;
struct alloc_context ac = { };
struct page *page;
+ VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT);
/*
* In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
* unsafe in NMI. If spin_trylock() is called from hard IRQ the current
@@ -7555,15 +7596,38 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
- if (page)
- set_page_refcounted(page);
-
- if (memcg_kmem_online() && page &&
+ if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) &&
unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
- free_pages_nolock(page, order);
+ __free_frozen_pages(page, order, FPI_TRYLOCK);
page = NULL;
}
trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
kmsan_alloc_page(page, order, alloc_gfp);
return page;
}
+/**
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed.
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
+ * Allocation is best effort and expected to fail easily, so nobody should
+ * rely on its success. Failures are not reported via warn_alloc().
+ * See always fail conditions below.
+ *
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
+ */
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
+{
+ struct page *page;
+
+ page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
+EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof);
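
A minimal usage sketch of the reworked API, following the kerneldoc above; the helper and its surroundings are illustrative only, and the page would later be returned with free_pages_nolock():

/* Illustrative helper: best-effort page grab that is safe from any context. */
static void *grab_scratch_page(void)
{
	/* 0 gfp_flags: no memcg accounting; only __GFP_ACCOUNT is accepted. */
	struct page *page = alloc_pages_nolock(0, NUMA_NO_NODE, 0);

	if (!page)
		return NULL;	/* means ENOMEM - retrying will not help */
	return page_address(page);
}
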
diff --git a/mm/page_io.c b/mm/page_io.c
index a2056a5ecb13..3c342db77ce3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -204,7 +204,7 @@ static bool is_folio_zero_filled(struct folio *folio)
static void swap_zeromap_folio_set(struct folio *folio)
{
struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
int nr_pages = folio_nr_pages(folio);
swp_entry_t entry;
unsigned int i;
@@ -223,7 +223,7 @@ static void swap_zeromap_folio_set(struct folio *folio)
static void swap_zeromap_folio_clear(struct folio *folio)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
swp_entry_t entry;
unsigned int i;
@@ -374,7 +374,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
{
struct swap_iocb *sio = swap_plug ? *swap_plug : NULL;
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
struct file *swap_file = sis->swap_file;
loff_t pos = swap_dev_pos(folio->swap);
@@ -446,7 +446,7 @@ static void swap_writepage_bdev_async(struct folio *folio,
void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
/*
@@ -537,7 +537,7 @@ static bool swap_read_folio_zeromap(struct folio *folio)
static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
struct swap_iocb *sio = NULL;
loff_t pos = swap_dev_pos(folio->swap);
@@ -608,7 +608,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO;
bool workingset = folio_test_workingset(folio);
unsigned long pflags;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index e981a1a292d2..c498a91b6706 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -309,6 +309,7 @@ next_pte:
}
pte_unmap(pvmw->pte);
pvmw->pte = NULL;
+ pvmw->flags |= PVMW_PGTABLE_CROSSED;
goto restart;
}
pvmw->pte++;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 648038247a8d..9f91cf85a5be 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -606,10 +606,32 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
int walk_kernel_page_table_range(unsigned long start, unsigned long end,
const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
{
- struct mm_struct *mm = &init_mm;
+ /*
+	 * Kernel intermediate page tables are usually not freed, so the mmap
+	 * read lock is sufficient. But there are exceptions, e.g. memory
+	 * hot-remove, in which case the mmap lock is insufficient to prevent
+	 * the intermediate kernel page tables belonging to the specified
+	 * address range from being freed. The caller should take other
+	 * actions to prevent this race.
+ */
+ mmap_assert_locked(&init_mm);
+
+ return walk_kernel_page_table_range_lockless(start, end, ops, pgd,
+ private);
+}
+
+/*
+ * Use this function to walk the kernel page tables locklessly. The caller
+ * must guarantee exclusive access to the range being operated on - that is,
+ * there must be no concurrent access, for example a concurrent change of
+ * permissions on vmalloc objects.
+ */
+int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
struct mm_walk walk = {
.ops = ops,
- .mm = mm,
+ .mm = &init_mm,
.pgd = pgd,
.private = private,
.no_vma = true
@@ -620,16 +642,6 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end,
if (!check_ops_valid(ops))
return -EINVAL;
- /*
- * Kernel intermediate page tables are usually not freed, so the mmap
- * read lock is sufficient. But there are some exceptions.
- * E.g. memory hot-remove. In which case, the mmap lock is insufficient
- * to prevent the intermediate kernel pages tables belonging to the
- * specified address range from being freed. The caller should take
- * other actions to prevent this race.
- */
- mmap_assert_locked(mm);
-
return walk_pgd_range(start, end, &walk);
}
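
A hedged sketch of how the new lockless walker might be used, assuming the caller really does have exclusive access to the range and that a pte_entry-only mm_walk_ops is sufficient for its purpose:

/*
 * Sketch under assumptions: the caller owns [start, end) exclusively, so no
 * intermediate table can be freed or repurposed while we walk it.
 */
static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(ptep_get(pte)))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops count_present_ops = {
	.pte_entry = count_present_pte,
};

static unsigned long count_present_pages(unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	walk_kernel_page_table_range_lockless(start, end, &count_present_ops,
					      NULL, &count);
	return count;
}
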
@@ -902,23 +914,23 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->pudp = pudp;
fw->pud = pud;
- /*
- * TODO: FW_MIGRATION support for PUD migration entries
- * once there are relevant users.
- */
- if (!pud_present(pud) || pud_special(pud)) {
+ if (pud_none(pud)) {
spin_unlock(ptl);
goto not_found;
- } else if (!pud_leaf(pud)) {
+ } else if (pud_present(pud) && !pud_leaf(pud)) {
spin_unlock(ptl);
goto pmd_table;
+ } else if (pud_present(pud)) {
+ page = vm_normal_page_pud(vma, addr, pud);
+ if (page)
+ goto found;
}
/*
- * TODO: vm_normal_page_pud() will be handy once we want to
- * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+ * TODO: FW_MIGRATION support for PUD migration entries
+ * once there are relevant users.
*/
- page = pud_page(pud);
- goto found;
+ spin_unlock(ptl);
+ goto not_found;
}
pmd_table:
@@ -1004,7 +1016,7 @@ not_found:
found:
if (expose_page)
/* Note: Offset from the mapped page, not the folio start. */
- fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
+ fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT);
else
fw->page = NULL;
fw->ptl = ptl;
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index fe31aa19db81..4efa74a495cb 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
}
for (i = 0; i < nr_pages; i++)
- pcpu_set_page_chunk(nth_page(pages, i), chunk);
+ pcpu_set_page_chunk(pages + i, chunk);
chunk->data = pages;
chunk->base_addr = page_address(pages);
diff --git a/mm/percpu.c b/mm/percpu.c
index d9cbaee92b60..81462ce5866e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
bool is_atomic;
bool do_warn;
struct obj_cgroup *objcg = NULL;
- static int warn_limit = 10;
+ static atomic_t warn_limit = ATOMIC_INIT(10);
struct pcpu_chunk *chunk, *next;
const char *err;
int slot, off, cpu, ret;
@@ -1904,13 +1904,17 @@ fail_unlock:
fail:
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
- if (do_warn && warn_limit) {
- pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
- size, align, is_atomic, err);
- if (!is_atomic)
- dump_stack();
- if (!--warn_limit)
- pr_info("limit reached, disable warning\n");
+ if (do_warn) {
+ int remaining = atomic_dec_if_positive(&warn_limit);
+
+ if (remaining >= 0) {
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
+ if (!is_atomic)
+ dump_stack();
+ if (remaining == 0)
+ pr_info("limit reached, disable warning\n");
+ }
}
if (is_atomic) {
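
The warn_limit conversion above replaces a racy 'static int' countdown with atomic_dec_if_positive(); a condensed sketch of that rate-limit pattern, with the function name made up for illustration:

#include <linux/atomic.h>
#include <linux/printk.h>

static atomic_t warn_budget = ATOMIC_INIT(10);	/* illustrative counter */

static void warn_rate_limited(const char *what)
{
	int remaining = atomic_dec_if_positive(&warn_budget);

	if (remaining < 0)
		return;				/* budget already exhausted */
	pr_warn("allocation failed: %s\n", what);
	if (remaining == 0)
		pr_info("limit reached, disable warning\n");
}
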
@@ -3108,7 +3112,7 @@ out_free:
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
@@ -3134,13 +3138,13 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
if (pgd_none(*pgd)) {
p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
- pgd_populate(&init_mm, pgd, p4d);
+ pgd_populate_kernel(addr, pgd, p4d);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
- p4d_populate(&init_mm, p4d, pud);
+ p4d_populate_kernel(addr, p4d, pud);
}
pud = pud_offset(p4d, addr);
diff --git a/mm/readahead.c b/mm/readahead.c
index 406756d34309..3a4b5d58eeb6 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -129,6 +129,9 @@
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/readahead.h>
+
#include "internal.h"
/*
@@ -225,6 +228,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
*/
unsigned int nofs = memalloc_nofs_save();
+ trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read,
+ lookahead_size);
filemap_invalidate_lock_shared(mapping);
index = mapping_align_index(mapping, index);
@@ -470,6 +475,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
gfp_t gfp = readahead_gfp_mask(mapping);
unsigned int new_order = ra->order;
+ trace_page_cache_ra_order(mapping->host, start, ra);
if (!mapping_large_folio_support(mapping)) {
ra->order = 0;
goto fallback;
@@ -554,6 +560,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
unsigned long max_pages, contig_count;
pgoff_t prev_index, miss;
+ trace_page_cache_sync_ra(ractl->mapping->host, index, ra, req_count);
/*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
@@ -638,6 +645,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (folio_test_writeback(folio))
return;
+ trace_page_cache_async_ra(ractl->mapping->host, index, ra, req_count);
folio_clear_readahead(folio);
if (blk_cgroup_congested())
diff --git a/mm/rmap.c b/mm/rmap.c
index 568198e9efc2..ac4f783d6ec2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -79,7 +79,6 @@
#include <asm/tlbflush.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/tlb.h>
#include <trace/events/migrate.h>
#include "internal.h"
@@ -285,7 +284,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
- avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ avc = anon_vma_chain_alloc(GFP_NOWAIT);
if (unlikely(!avc)) {
unlock_anon_vma_root(root);
root = NULL;
@@ -851,34 +850,34 @@ static bool folio_referenced_one(struct folio *folio,
{
struct folio_referenced_arg *pra = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
- int referenced = 0;
- unsigned long start = address, ptes = 0;
+ int ptes = 0, referenced = 0;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
if (vma->vm_flags & VM_LOCKED) {
- if (!folio_test_large(folio) || !pvmw.pte) {
- /* Restore the mlock which got missed */
- mlock_vma_folio(folio, vma);
- page_vma_mapped_walk_done(&pvmw);
- pra->vm_flags |= VM_LOCKED;
- return false; /* To break the loop */
- }
+ ptes++;
+ pra->mapcount--;
+
+ /* Only mlock fully mapped pages */
+ if (pvmw.pte && ptes != pvmw.nr_pages)
+ continue;
+
/*
- * For large folio fully mapped to VMA, will
- * be handled after the pvmw loop.
+ * All PTEs must be protected by page table lock in
+ * order to mlock the page.
*
- * For large folio cross VMA boundaries, it's
- * expected to be picked by page reclaim. But
- * should skip reference of pages which are in
- * the range of VM_LOCKED vma. As page reclaim
- * should just count the reference of pages out
- * the range of VM_LOCKED vma.
+			 * If a page table boundary has been crossed, the current
+			 * ptl only protects part of the PTEs.
*/
- ptes++;
- pra->mapcount--;
- continue;
+ if (pvmw.flags & PVMW_PGTABLE_CROSSED)
+ continue;
+
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma);
+ page_vma_mapped_walk_done(&pvmw);
+ pra->vm_flags |= VM_LOCKED;
+ return false; /* To break the loop */
}
/*
@@ -914,23 +913,6 @@ static bool folio_referenced_one(struct folio *folio,
pra->mapcount--;
}
- if ((vma->vm_flags & VM_LOCKED) &&
- folio_test_large(folio) &&
- folio_within_vma(folio, vma)) {
- unsigned long s_align, e_align;
-
- s_align = ALIGN_DOWN(start, PMD_SIZE);
- e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);
-
- /* folio doesn't cross page table boundary and fully mapped */
- if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
- /* Restore the mlock which got missed */
- mlock_vma_folio(folio, vma);
- pra->vm_flags |= VM_LOCKED;
- return false; /* To break the loop */
- }
- }
-
if (referenced)
folio_clear_idle(folio);
if (folio_test_clear_young(folio))
@@ -1241,18 +1223,40 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
return page_vma_mkclean_one(&pvmw);
}
-static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
+static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
+{
+ int idx;
+
+ if (nr) {
+ idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
+ __lruvec_stat_mod_folio(folio, idx, nr);
+ }
+ if (nr_pmdmapped) {
+ if (folio_test_anon(folio)) {
+ idx = NR_ANON_THPS;
+ __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
+ } else {
+ /* NR_*_PMDMAPPED are not maintained per-memcg */
+ idx = folio_test_swapbacked(folio) ?
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
+ __mod_node_page_state(folio_pgdat(folio), idx,
+ nr_pmdmapped);
+ }
+ }
+}
+
+static __always_inline void __folio_add_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level, int *nr_pmdmapped)
+ enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
- int first = 0, nr = 0;
+ int first = 0, nr = 0, nr_pmdmapped = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_inc_and_test(&folio->_mapcount);
break;
@@ -1278,12 +1282,12 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
- if (level == RMAP_LEVEL_PMD && first)
- *nr_pmdmapped = folio_large_nr_pages(folio);
+ if (level == PGTABLE_LEVEL_PMD && first)
+ nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_inc_return_large_mapcount(folio, vma);
if (nr == 1)
/* Was completely unmapped. */
@@ -1301,8 +1305,8 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
* We only track PMD mappings of PMD-sized
* folios separately.
*/
- if (level == RMAP_LEVEL_PMD)
- *nr_pmdmapped = nr_pages;
+ if (level == PGTABLE_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
@@ -1314,8 +1318,10 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
}
folio_inc_large_mapcount(folio, vma);
break;
+ default:
+ BUILD_BUG();
}
- return nr;
+ __folio_mod_stat(folio, nr, nr_pmdmapped);
}
/**
@@ -1403,59 +1409,37 @@ static void __page_check_anon_rmap(const struct folio *folio,
page);
}
-static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
-{
- int idx;
-
- if (nr) {
- idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
- __lruvec_stat_mod_folio(folio, idx, nr);
- }
- if (nr_pmdmapped) {
- if (folio_test_anon(folio)) {
- idx = NR_ANON_THPS;
- __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
- } else {
- /* NR_*_PMDMAPPED are not maintained per-memcg */
- idx = folio_test_swapbacked(folio) ?
- NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
- __mod_node_page_state(folio_pgdat(folio), idx,
- nr_pmdmapped);
- }
- }
-}
-
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- unsigned long address, rmap_t flags, enum rmap_level level)
+ unsigned long address, rmap_t flags, enum pgtable_level level)
{
- int i, nr, nr_pmdmapped = 0;
+ int i;
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
+ __folio_add_rmap(folio, page, nr_pages, vma, level);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
- __folio_mod_stat(folio, nr, nr_pmdmapped);
-
if (flags & RMAP_EXCLUSIVE) {
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
for (i = 0; i < nr_pages; i++)
SetPageAnonExclusive(page + i);
break;
- case RMAP_LEVEL_PMD:
+ case PGTABLE_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PUD:
/*
* Keep the compiler happy, we don't support anonymous
* PUD mappings.
*/
WARN_ON_ONCE(1);
break;
+ default:
+ BUILD_BUG();
}
}
@@ -1479,12 +1463,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
}
/*
- * For large folio, only mlock it if it's fully mapped to VMA. It's
- * not easy to check whether the large folio is fully mapped to VMA
- * here. Only mlock normal 4K folio and leave page reclaim to handle
- * large folio.
+ * Only mlock it if the folio is fully mapped to the VMA.
+ *
+	 * Partially mapped folios can be split on reclaim, and the part
+	 * outside of the mlocked VMA can be evicted or freed.
*/
- if (!folio_test_large(folio))
+ if (folio_nr_pages(folio) == nr_pages)
mlock_vma_folio(folio, vma);
}
@@ -1509,7 +1493,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
rmap_t flags)
{
__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
- RMAP_LEVEL_PTE);
+ PGTABLE_LEVEL_PTE);
}
/**
@@ -1530,7 +1514,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
- RMAP_LEVEL_PMD);
+ PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1611,17 +1595,19 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
static __always_inline void __folio_add_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
- int nr, nr_pmdmapped = 0;
-
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
- __folio_mod_stat(folio, nr, nr_pmdmapped);
+ __folio_add_rmap(folio, page, nr_pages, vma, level);
- /* See comments in folio_add_anon_rmap_*() */
- if (!folio_test_large(folio))
+ /*
+ * Only mlock it if the folio is fully mapped to the VMA.
+ *
+	 * Partially mapped folios can be split on reclaim, and the part
+	 * outside of the mlocked VMA can be evicted or freed.
+ */
+ if (folio_nr_pages(folio) == nr_pages)
mlock_vma_folio(folio, vma);
}
@@ -1639,7 +1625,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
- __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+ __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -1656,7 +1642,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+ __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1677,7 +1663,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1685,7 +1671,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
int last = 0, nr = 0, nr_pmdmapped = 0;
@@ -1694,7 +1680,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_add_negative(-1, &folio->_mapcount);
break;
@@ -1704,7 +1690,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
if (!nr) {
/* Now completely unmapped. */
- nr = folio_nr_pages(folio);
+ nr = folio_large_nr_pages(folio);
} else {
partially_mapped = nr < folio_large_nr_pages(folio) &&
!folio_entire_mapcount(folio);
@@ -1724,11 +1710,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
last = atomic_add_negative(-1, &folio->_entire_mapcount);
- if (level == RMAP_LEVEL_PMD && last)
+ if (level == PGTABLE_LEVEL_PMD && last)
nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_dec_return_large_mapcount(folio, vma);
if (!nr) {
@@ -1748,9 +1734,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
nr_pages = folio_large_nr_pages(folio);
- if (level == RMAP_LEVEL_PMD)
+ if (level == PGTABLE_LEVEL_PMD)
nr_pmdmapped = nr_pages;
- nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
+ nr = nr_pages - nr;
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1762,6 +1748,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && nr < nr_pmdmapped;
break;
+ default:
+ BUILD_BUG();
}
/*
@@ -1801,7 +1789,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
- __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+ __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -1818,7 +1806,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+ __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1839,7 +1827,7 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1887,6 +1875,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long nr_pages = 1, end_addr;
unsigned long pfn;
unsigned long hsz = 0;
+ int ptes = 0;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1927,10 +1916,34 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
+ ptes++;
+
+ /*
+ * Set 'ret' to indicate the page cannot be unmapped.
+ *
+			 * Do not jump to walk_abort immediately, as additional
+			 * iterations might be required to detect a fully mapped
+			 * folio and mlock it.
+ */
+ ret = false;
+
+ /* Only mlock fully mapped pages */
+ if (pvmw.pte && ptes != pvmw.nr_pages)
+ continue;
+
+ /*
+ * All PTEs must be protected by page table lock in
+ * order to mlock the page.
+ *
+			 * If a page table boundary has been crossed, the current
+			 * ptl only protects part of the PTEs.
+ */
+ if (pvmw.flags & PVMW_PGTABLE_CROSSED)
+ goto walk_done;
+
/* Restore the mlock which got missed */
- if (!folio_test_large(folio))
- mlock_vma_folio(folio, vma);
- goto walk_abort;
+ mlock_vma_folio(folio, vma);
+ goto walk_done;
}
if (!pvmw.pte) {
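Both rmap hunks above change the VM_LOCKED handling from "never mlock large folios here" to "mlock a large folio once the walk has seen every PTE mapping it under a single page-table lock". The following standalone C sketch models just that counting decision; the struct and field names are invented stand-ins, not the kernel's page_vma_mapped_walk state:

#include <stdbool.h>
#include <stdio.h>

struct walk_state {
        unsigned int ptes_seen;         /* PTEs of this folio visited so far */
        unsigned int nr_pages;          /* subpages the walk expects to visit */
        bool crossed_pgtable;           /* walk crossed a page-table boundary */
};

static bool should_mlock(struct walk_state *ws)
{
        ws->ptes_seen++;

        /* only act once the folio is known to be fully mapped ... */
        if (ws->ptes_seen != ws->nr_pages)
                return false;

        /* ... and all PTEs were covered by a single page-table lock */
        if (ws->crossed_pgtable)
                return false;

        return true;
}

int main(void)
{
        struct walk_state ws = { .nr_pages = 4 };

        for (int i = 0; i < 4; i++)
                printf("pte %d -> mlock? %s\n", i,
                       should_mlock(&ws) ? "yes" : "no");
        return 0;
}

Only the final iteration reports "yes", mirroring how the new code defers mlock_vma_folio() until ptes == pvmw.nr_pages and PVMW_PGTABLE_CROSSED is clear.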
diff --git a/mm/shmem.c b/mm/shmem.c
index e2c76a30802b..b9081b817d28 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -275,18 +275,18 @@ static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
-bool shmem_mapping(struct address_space *mapping)
+bool shmem_mapping(const struct address_space *mapping)
{
return mapping->a_ops == &shmem_aops;
}
EXPORT_SYMBOL_GPL(shmem_mapping);
-bool vma_is_anon_shmem(struct vm_area_struct *vma)
+bool vma_is_anon_shmem(const struct vm_area_struct *vma)
{
return vma->vm_ops == &shmem_anon_vm_ops;
}
-bool vma_is_shmem(struct vm_area_struct *vma)
+bool vma_is_shmem(const struct vm_area_struct *vma)
{
return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}
@@ -573,42 +573,6 @@ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;
-/**
- * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
- * @mapping: Target address_space.
- * @index: The page index.
- * @write_end: end of a write, could extend inode size.
- *
- * This returns huge orders for folios (when supported) based on the file size
- * which the mapping currently allows at the given index. The index is relevant
- * due to alignment considerations the mapping might have. The returned order
- * may be less than the size passed.
- *
- * Return: The orders.
- */
-static inline unsigned int
-shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
-{
- unsigned int order;
- size_t size;
-
- if (!mapping_large_folio_support(mapping) || !write_end)
- return 0;
-
- /* Calculate the write size based on the write_end */
- size = write_end - (index << PAGE_SHIFT);
- order = filemap_get_order(size);
- if (!order)
- return 0;
-
- /* If we're not aligned, allocate a smaller folio */
- if (index & ((1UL << order) - 1))
- order = __ffs(index);
-
- order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
- return order > 0 ? BIT(order + 1) - 1 : 0;
-}
-
static unsigned int shmem_get_orders_within_size(struct inode *inode,
unsigned long within_size_orders, pgoff_t index,
loff_t write_end)
@@ -655,22 +619,21 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
* For tmpfs mmap()'s huge order, we still use PMD-sized order to
* allocate huge pages due to lack of a write size hint.
*
- * Otherwise, tmpfs will allow getting a highest order hint based on
- * the size of write and fallocate paths, then will try each allowable
- * huge orders.
+ * For tmpfs with 'huge=always' or 'huge=within_size' mount option,
+	 * we will always try PMD-sized order first. If that fails, it will
+	 * fall back to smaller large folios.
*/
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
if (vma)
return maybe_pmd_order;
- return shmem_mapping_size_orders(inode->i_mapping, index, write_end);
+ return THP_ORDERS_ALL_FILE_DEFAULT;
case SHMEM_HUGE_WITHIN_SIZE:
if (vma)
within_size_orders = maybe_pmd_order;
else
- within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
- index, write_end);
+ within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT;
within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
index, write_end);
@@ -1006,15 +969,15 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
unsigned long swapped = 0;
unsigned long max = end - 1;
rcu_read_lock();
- xas_for_each(&xas, page, max) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, max) {
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
swapped += 1 << xas_get_order(&xas);
if (xas.xa_index == max)
break;
@@ -1698,13 +1661,13 @@ try_split:
}
/*
- * The delete_from_swap_cache() below could be left for
+ * The swap_cache_del_folio() below could be left for
* shrink_folio_list()'s folio_free_swap() to dispose of;
* but I'm a little nervous about letting this folio out of
* shmem_writeout() in a hybrid half-tmpfs-half-swap state
* e.g. folio_mapping(folio) might give an unexpected answer.
*/
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
goto redirty;
}
if (nr_pages > 1)
@@ -1817,7 +1780,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
unsigned int global_orders;
- if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
return 0;
global_orders = shmem_huge_global_enabled(inode, index, write_end,
@@ -2082,7 +2045,7 @@ retry:
new->swap = entry;
memcg1_swapin(entry, nr_pages);
- shadow = get_shadow_from_swap_cache(entry);
+ shadow = swap_cache_get_shadow(entry);
if (shadow)
workingset_refault(new, shadow);
folio_add_lru(new);
@@ -2120,13 +2083,11 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index,
struct vm_area_struct *vma)
{
+ struct swap_cluster_info *ci;
struct folio *new, *old = *foliop;
swp_entry_t entry = old->swap;
- struct address_space *swap_mapping = swap_address_space(entry);
- pgoff_t swap_index = swap_cache_index(entry);
- XA_STATE(xas, &swap_mapping->i_pages, swap_index);
int nr_pages = folio_nr_pages(old);
- int error = 0, i;
+ int error = 0;
/*
* We have arrived here because our zones are constrained, so don't
@@ -2155,38 +2116,15 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
new->swap = entry;
folio_set_swapcache(new);
- /* Swap cache still stores N entries instead of a high-order entry */
- xa_lock_irq(&swap_mapping->i_pages);
- for (i = 0; i < nr_pages; i++) {
- void *item = xas_load(&xas);
+ ci = swap_cluster_get_and_lock_irq(old);
+ __swap_cache_replace_folio(ci, old, new);
+ mem_cgroup_replace_folio(old, new);
+ shmem_update_stats(new, nr_pages);
+ shmem_update_stats(old, -nr_pages);
+ swap_cluster_unlock_irq(ci);
- if (item != old) {
- error = -ENOENT;
- break;
- }
-
- xas_store(&xas, new);
- xas_next(&xas);
- }
- if (!error) {
- mem_cgroup_replace_folio(old, new);
- shmem_update_stats(new, nr_pages);
- shmem_update_stats(old, -nr_pages);
- }
- xa_unlock_irq(&swap_mapping->i_pages);
-
- if (unlikely(error)) {
- /*
- * Is this possible? I think not, now that our callers
- * check both the swapcache flag and folio->private
- * after getting the folio lock; but be defensive.
- * Reverse old to newpage for clear and free.
- */
- old = new;
- } else {
- folio_add_lru(new);
- *foliop = new;
- }
+ folio_add_lru(new);
+ *foliop = new;
folio_clear_swapcache(old);
old->private = NULL;
@@ -2220,7 +2158,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio);
if (!skip_swapcache)
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2235,7 +2173,7 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
- int split_order = 0, entry_order;
+ int split_order = 0;
int i;
/* Convert user data gfp flags to xarray node gfp flags */
@@ -2253,15 +2191,12 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
goto unlock;
}
- entry_order = xas_get_order(&xas);
-
- if (!entry_order)
+ cur_order = xas_get_order(&xas);
+ if (!cur_order)
goto unlock;
/* Try to split large swap entry in pagecache */
- cur_order = entry_order;
- swap_index = round_down(index, 1 << entry_order);
-
+ swap_index = round_down(index, 1 << cur_order);
split_order = xas_try_split_min_order(cur_order);
while (cur_order > 0) {
@@ -2354,7 +2289,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
}
/* Look it up and read it in.. */
- folio = swap_cache_get_folio(swap, NULL, 0);
+ folio = swap_cache_get_folio(swap);
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
/* Direct swapin skipping swap cache & readahead */
@@ -2379,6 +2314,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
+ } else {
+ swap_update_readahead(folio, NULL, 0);
}
if (order > folio_order(folio)) {
@@ -2430,7 +2367,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
folio_wait_writeback(folio);
- nr_pages = folio_nr_pages(folio);
/*
* Some architectures may have to restore extra metadata to the
@@ -2458,7 +2394,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
folio->swap.val = 0;
swapcache_clear(si, swap, nr_pages);
} else {
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
}
folio_mark_dirty(folio);
swap_free_nr(swap, nr_pages);
@@ -5081,7 +5017,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC | SB_I_VERSION;
+ sb->s_flags |= SB_NOSEC;
#if IS_ENABLED(CONFIG_UNICODE)
if (!ctx->encoding && ctx->strict_encoding) {
@@ -5341,7 +5277,7 @@ static const struct super_operations shmem_ops = {
.get_dquots = shmem_get_dquots,
#endif
.evict_inode = shmem_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
@@ -5385,6 +5321,9 @@ int shmem_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &shmem_fs_context_ops;
+#ifdef CONFIG_TMPFS
+ fc->sb_flags |= SB_I_VERSION;
+#endif
return 0;
}
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 41999e94a56d..3a4b5207635d 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -278,7 +278,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES),
+ str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
+ MAX_RECLAIM_RETRIES),
K(node_page_state(pgdat, NR_BALLOON_PAGES)));
}
@@ -310,6 +311,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" inactive_file:%lukB"
" unevictable:%lukB"
" writepending:%lukB"
+ " zspages:%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
@@ -332,6 +334,11 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
+#if IS_ENABLED(CONFIG_ZSMALLOC)
+ K(zone_page_state(zone, NR_ZSPAGES)),
+#else
+ 0UL,
+#endif
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
@@ -419,13 +426,16 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
- {
+ static DEFINE_SPINLOCK(mem_alloc_profiling_spinlock);
+
+ if (spin_trylock(&mem_alloc_profiling_spinlock)) {
struct codetag_bytes tags[10];
size_t i, nr;
nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false);
if (nr) {
- pr_notice("Memory allocations:\n");
+ pr_notice("Memory allocations (profiling is currently turned %s):\n",
+ mem_alloc_profiling_enabled() ? "on" : "off");
for (i = 0; i < nr; i++) {
struct codetag *ct = tags[i].ct;
struct alloc_tag *tag = ct_to_alloc_tag(ct);
@@ -445,6 +455,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
ct->lineno, ct->function);
}
}
+ spin_unlock(&mem_alloc_profiling_spinlock);
}
#endif
}
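The CONFIG_MEM_ALLOC_PROFILING hunk above wraps the allocation-profile dump in spin_trylock() so that concurrent out-of-memory reports do not interleave their output; a context that loses the race simply skips the dump instead of blocking. A userspace analogue with pthread_mutex_trylock() (illustrative only):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t report_lock = PTHREAD_MUTEX_INITIALIZER;

/* print the expensive report only if nobody else is already printing it */
static void show_report(void)
{
        if (pthread_mutex_trylock(&report_lock) != 0)
                return;

        printf("Memory allocations (profiling is currently turned on):\n");
        /* ... walk and print the top allocation tags here ... */

        pthread_mutex_unlock(&report_lock);
}

int main(void)
{
        show_report();
        return 0;
}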
diff --git a/mm/slab.h b/mm/slab.h
index 248b34c839b7..078daecc7cf5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -50,13 +50,17 @@ typedef union {
/* Reuses the bits in struct page */
struct slab {
- unsigned long flags;
+ memdesc_flags_t flags;
struct kmem_cache *slab_cache;
union {
struct {
union {
struct list_head slab_list;
+ struct { /* For deferred deactivate_slab() */
+ struct llist_node llnode;
+ void *flush_freelist;
+ };
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct {
struct slab *next;
@@ -174,12 +178,12 @@ static inline void *slab_address(const struct slab *slab)
static inline int slab_nid(const struct slab *slab)
{
- return folio_nid(slab_folio(slab));
+ return memdesc_nid(slab->flags);
}
static inline pg_data_t *slab_pgdat(const struct slab *slab)
{
- return folio_pgdat(slab_folio(slab));
+ return NODE_DATA(slab_nid(slab));
}
static inline struct slab *virt_to_slab(const void *addr)
@@ -234,7 +238,9 @@ struct kmem_cache_order_objects {
struct kmem_cache {
#ifndef CONFIG_SLUB_TINY
struct kmem_cache_cpu __percpu *cpu_slab;
+ struct lock_class_key lock_key;
#endif
+ struct slub_percpu_sheaves __percpu *cpu_sheaves;
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
unsigned long min_partial;
@@ -248,6 +254,7 @@ struct kmem_cache {
/* Number of per cpu partial slabs to keep around */
unsigned int cpu_partial_slabs;
#endif
+ unsigned int sheaf_capacity;
struct kmem_cache_order_objects oo;
/* Allocation and freeing of slabs */
@@ -433,6 +440,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
}
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+void flush_all_rcu_sheaves(void);
+
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
@@ -526,8 +536,12 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
unsigned long obj_exts = READ_ONCE(slab->obj_exts);
#ifdef CONFIG_MEMCG
- VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
- slab_page(slab));
+ /*
+	 * obj_exts should be either NULL, a valid pointer with the
+	 * MEMCG_DATA_OBJEXTS bit set, or equal to OBJEXTS_ALLOC_FAIL.
+ */
+ VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS) &&
+ obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab));
VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
#endif
return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
@@ -656,6 +670,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
void __check_heap_object(const void *ptr, unsigned long n,
const struct slab *slab, bool to_user);
+void defer_free_barrier(void);
+
static inline bool slub_debug_orig_size(struct kmem_cache *s)
{
return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
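In slab.h above, struct slab's flags word becomes a memdesc_flags_t, which is why the later slub.c hunks spell the bit operations as test_bit(..., &slab->flags.f). A rough userspace sketch of that wrap-the-word-in-a-struct idea (the type and bit helpers below are invented for illustration and are not the kernel's bitops):

#include <stdio.h>

typedef struct { unsigned long f; } memdesc_flags_demo_t;

#define DEMO_BIT_PFMEMALLOC 0
#define DEMO_BIT_LOCKED     1

static int demo_test_bit(int nr, const unsigned long *addr)
{
        return (*addr >> nr) & 1UL;
}

static void demo_set_bit(int nr, unsigned long *addr)
{
        *addr |= 1UL << nr;
}

int main(void)
{
        memdesc_flags_demo_t flags = { 0 };

        /* every user must go through .f, so the flags word can be
         * retyped or audited without silently breaking callers */
        demo_set_bit(DEMO_BIT_PFMEMALLOC, &flags.f);
        printf("pfmemalloc=%d locked=%d\n",
               demo_test_bit(DEMO_BIT_PFMEMALLOC, &flags.f),
               demo_test_bit(DEMO_BIT_LOCKED, &flags.f));
        return 0;
}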
diff --git a/mm/slab_common.c b/mm/slab_common.c
index bfe7c40eeee1..932d13ada36c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -163,6 +163,9 @@ int slab_unmergeable(struct kmem_cache *s)
return 1;
#endif
+ if (s->cpu_sheaves)
+ return 1;
+
/*
* We may have set a slab to be unmergeable during bootstrap.
*/
@@ -321,7 +324,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
object_size - args->usersize < args->useroffset))
args->usersize = args->useroffset = 0;
- if (!args->usersize)
+ if (!args->usersize && !args->sheaf_capacity)
s = __kmem_cache_alias(name, object_size, args->align, flags,
args->ctor);
if (s)
@@ -507,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
rcu_barrier();
}
+ /* Wait for deferred work from kmalloc/kfree_nolock() */
+ defer_free_barrier();
+
cpus_read_lock();
mutex_lock(&slab_mutex);
@@ -1605,6 +1611,30 @@ static void kfree_rcu_work(struct work_struct *work)
kvfree_rcu_list(head);
}
+static bool kfree_rcu_sheaf(void *obj)
+{
+ struct kmem_cache *s;
+ struct folio *folio;
+ struct slab *slab;
+
+ if (is_vmalloc_addr(obj))
+ return false;
+
+ folio = virt_to_folio(obj);
+ if (unlikely(!folio_test_slab(folio)))
+ return false;
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ if (s->cpu_sheaves) {
+ if (likely(!IS_ENABLED(CONFIG_NUMA) ||
+ slab_nid(slab) == numa_mem_id()))
+ return __kfree_rcu_sheaf(s, obj);
+ }
+
+ return false;
+}
+
static bool
need_offload_krc(struct kfree_rcu_cpu *krcp)
{
@@ -1949,6 +1979,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
if (!head)
might_sleep();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
+ return;
+
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(ptr)) {
// Probable double kfree_rcu(), just leak.
@@ -2023,6 +2056,8 @@ void kvfree_rcu_barrier(void)
bool queued;
int i, cpu;
+ flush_all_rcu_sheaves();
+
/*
* Firstly we detach objects and queue them over an RCU-batch
* for all CPUs. Finally queued works are flushed for each CPU.
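kvfree_call_rcu() above gains a fast path: if the object's cache has percpu sheaves and the object sits on the local memory node, it is batched into the rcu_free sheaf, otherwise it falls through to the existing kvfree_rcu machinery. A hedged userspace sketch of that routing decision, with stand-in types and helpers (none of these names are kernel API):

#include <stdbool.h>
#include <stdio.h>

struct demo_cache {
        bool has_sheaves;       /* cache created with a sheaf capacity */
        int node;               /* memory node the object's slab lives on */
};

/* would batch the object into the per-CPU rcu_free sheaf; returns false
 * when the fast path does not apply and the caller must fall back */
static bool demo_free_to_rcu_sheaf(struct demo_cache *c, void *obj)
{
        if (!c->has_sheaves)
                return false;
        printf("batched %p into the rcu_free sheaf\n", obj);
        return true;
}

static void demo_generic_free_rcu(void *obj)
{
        printf("queued %p on the generic kvfree_rcu path\n", obj);
}

static void demo_kvfree_rcu(struct demo_cache *c, void *obj, int local_node)
{
        /* fast path only for node-local objects of sheaf-enabled caches */
        if (c->node == local_node && demo_free_to_rcu_sheaf(c, obj))
                return;

        demo_generic_free_rcu(obj);
}

int main(void)
{
        struct demo_cache cache = { .has_sheaves = true, .node = 0 };
        int obj;

        demo_kvfree_rcu(&cache, &obj, 0);       /* takes the sheaf fast path */
        demo_kvfree_rcu(&cache, &obj, 1);       /* remote node: generic path */
        return 0;
}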
diff --git a/mm/slub.c b/mm/slub.c
index 30003763d224..584a5ff1828b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -44,7 +44,8 @@
#include <kunit/test.h>
#include <kunit/test-bug.h>
#include <linux/sort.h>
-
+#include <linux/irq_work.h>
+#include <linux/kprobes.h>
#include <linux/debugfs.h>
#include <trace/events/kmem.h>
@@ -363,8 +364,12 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
enum stat_item {
+ ALLOC_PCS, /* Allocation from percpu sheaf */
ALLOC_FASTPATH, /* Allocation from cpu slab */
ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */
+ FREE_PCS, /* Free to percpu sheaf */
+ FREE_RCU_SHEAF, /* Free to rcu_free sheaf */
+ FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */
FREE_FASTPATH, /* Free to cpu slab */
FREE_SLOWPATH, /* Freeing not to cpu slab */
FREE_FROZEN, /* Freeing to frozen slab */
@@ -389,6 +394,19 @@ enum stat_item {
CPU_PARTIAL_FREE, /* Refill cpu partial on free */
CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
+ SHEAF_FLUSH, /* Objects flushed from a sheaf */
+ SHEAF_REFILL, /* Objects refilled to a sheaf */
+ SHEAF_ALLOC, /* Allocation of an empty sheaf */
+ SHEAF_FREE, /* Freeing of an empty sheaf */
+ BARN_GET, /* Got full sheaf from barn */
+ BARN_GET_FAIL, /* Failed to get full sheaf from barn */
+ BARN_PUT, /* Put full sheaf to barn */
+ BARN_PUT_FAIL, /* Failed to put full sheaf to barn */
+ SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */
+ SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */
+ SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */
+ SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */
+ SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */
NR_SLUB_STAT_ITEMS
};
@@ -409,7 +427,7 @@ struct kmem_cache_cpu {
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct slab *partial; /* Partially allocated slabs */
#endif
- local_lock_t lock; /* Protects the fields above */
+ local_trylock_t lock; /* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
unsigned int stat[NR_SLUB_STAT_ITEMS];
#endif
@@ -435,6 +453,37 @@ void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
#endif
}
+#define MAX_FULL_SHEAVES 10
+#define MAX_EMPTY_SHEAVES 10
+
+struct node_barn {
+ spinlock_t lock;
+ struct list_head sheaves_full;
+ struct list_head sheaves_empty;
+ unsigned int nr_full;
+ unsigned int nr_empty;
+};
+
+struct slab_sheaf {
+ union {
+ struct rcu_head rcu_head;
+ struct list_head barn_list;
+		/* only used for prefilled sheaves */
+ unsigned int capacity;
+ };
+ struct kmem_cache *cache;
+ unsigned int size;
+ int node; /* only used for rcu_sheaf */
+ void *objects[];
+};
+
+struct slub_percpu_sheaves {
+ local_trylock_t lock;
+ struct slab_sheaf *main; /* never NULL when unlocked */
+ struct slab_sheaf *spare; /* empty or full, may be NULL */
+ struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
+};
+
/*
* The slab lists for all objects.
*/
@@ -447,6 +496,7 @@ struct kmem_cache_node {
atomic_long_t total_objects;
struct list_head full;
#endif
+ struct node_barn *barn;
};
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
@@ -454,6 +504,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
return s->node[node];
}
+/* Get the barn of the current cpu's memory node */
+static inline struct node_barn *get_barn(struct kmem_cache *s)
+{
+ return get_node(s, numa_mem_id())->barn;
+}
+
/*
* Iterator over all nodes. The body will be executed for each node that has
* a kmem_cache_node structure allocated (which is true for all online nodes)
@@ -470,12 +526,19 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
*/
static nodemask_t slab_nodes;
-#ifndef CONFIG_SLUB_TINY
/*
* Workqueue used for flush_cpu_slab().
*/
static struct workqueue_struct *flushwq;
-#endif
+
+struct slub_flush_work {
+ struct work_struct work;
+ struct kmem_cache *s;
+ bool skip;
+};
+
+static DEFINE_MUTEX(flush_lock);
+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
/********************************************************************
* Core slab cache functions
@@ -657,17 +720,17 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
*/
static inline bool slab_test_pfmemalloc(const struct slab *slab)
{
- return test_bit(SL_pfmemalloc, &slab->flags);
+ return test_bit(SL_pfmemalloc, &slab->flags.f);
}
static inline void slab_set_pfmemalloc(struct slab *slab)
{
- set_bit(SL_pfmemalloc, &slab->flags);
+ set_bit(SL_pfmemalloc, &slab->flags.f);
}
static inline void __slab_clear_pfmemalloc(struct slab *slab)
{
- __clear_bit(SL_pfmemalloc, &slab->flags);
+ __clear_bit(SL_pfmemalloc, &slab->flags.f);
}
/*
@@ -675,12 +738,12 @@ static inline void __slab_clear_pfmemalloc(struct slab *slab)
*/
static __always_inline void slab_lock(struct slab *slab)
{
- bit_spin_lock(SL_locked, &slab->flags);
+ bit_spin_lock(SL_locked, &slab->flags.f);
}
static __always_inline void slab_unlock(struct slab *slab)
{
- bit_spin_unlock(SL_locked, &slab->flags);
+ bit_spin_unlock(SL_locked, &slab->flags.f);
}
static inline bool
@@ -822,6 +885,16 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
}
#ifdef CONFIG_SLUB_DEBUG
+
+/*
+ * For debugging context when we want to check if the struct slab pointer
+ * appears to be valid.
+ */
+static inline bool validate_slab_ptr(struct slab *slab)
+{
+ return PageSlab(slab_page(slab));
+}
+
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
@@ -962,19 +1035,19 @@ static struct track *get_track(struct kmem_cache *s, void *object,
}
#ifdef CONFIG_STACKDEPOT
-static noinline depot_stack_handle_t set_track_prepare(void)
+static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
depot_stack_handle_t handle;
unsigned long entries[TRACK_ADDRS_COUNT];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
- handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+ handle = stack_depot_save(entries, nr_entries, gfp_flags);
return handle;
}
#else
-static inline depot_stack_handle_t set_track_prepare(void)
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
return 0;
}
@@ -996,9 +1069,9 @@ static void set_track_update(struct kmem_cache *s, void *object,
}
static __always_inline void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr)
+ enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
{
- depot_stack_handle_t handle = set_track_prepare();
+ depot_stack_handle_t handle = set_track_prepare(gfp_flags);
set_track_update(s, object, alloc, addr, handle);
}
@@ -1046,7 +1119,7 @@ static void print_slab_info(const struct slab *slab)
{
pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
slab, slab->objects, slab->inuse, slab->freelist,
- &slab->flags);
+ &slab->flags.f);
}
void skip_orig_size_check(struct kmem_cache *s, const void *object)
@@ -1140,7 +1213,12 @@ static void object_err(struct kmem_cache *s, struct slab *slab,
return;
slab_bug(s, reason);
- print_trailer(s, slab, object);
+ if (!object || !check_valid_pointer(s, slab, object)) {
+ print_slab_info(slab);
+ pr_err("Invalid pointer 0x%p\n", object);
+ } else {
+ print_trailer(s, slab, object);
+ }
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
WARN_ON(1);
@@ -1444,15 +1522,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
return ret;
}
+/*
+ * Checks if the slab state looks sane. Assumes the struct slab pointer
+ * was either obtained in a way that ensures it's valid, or validated
+ * by validate_slab_ptr()
+ */
static int check_slab(struct kmem_cache *s, struct slab *slab)
{
int maxobj;
- if (!folio_test_slab(slab_folio(slab))) {
- slab_err(s, slab, "Not a valid slab page");
- return 0;
- }
-
maxobj = order_objects(slab_order(slab), s->size);
if (slab->objects > maxobj) {
slab_err(s, slab, "objects %u > max %u",
@@ -1648,17 +1726,15 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s,
return true;
bad:
- if (folio_test_slab(slab_folio(slab))) {
- /*
- * If this is a slab page then lets do the best we can
- * to avoid issues in the future. Marking all objects
- * as used avoids touching the remaining objects.
- */
- slab_fix(s, "Marking all objects used");
- slab->inuse = slab->objects;
- slab->freelist = NULL;
- slab->frozen = 1; /* mark consistency-failed slab as frozen */
- }
+ /*
+ * Let's do the best we can to avoid issues in the future. Marking all
+ * objects as used avoids touching the remaining objects.
+ */
+ slab_fix(s, "Marking all objects used");
+ slab->inuse = slab->objects;
+ slab->freelist = NULL;
+ slab->frozen = 1; /* mark consistency-failed slab as frozen */
+
return false;
}
@@ -1679,10 +1755,7 @@ static inline int free_consistency_checks(struct kmem_cache *s,
return 0;
if (unlikely(s != slab->slab_cache)) {
- if (!folio_test_slab(slab_folio(slab))) {
- slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
- object);
- } else if (!slab->slab_cache) {
+ if (!slab->slab_cache) {
slab_err(NULL, slab, "No slab cache for object 0x%p",
object);
} else {
@@ -1921,9 +1994,9 @@ static inline bool free_debug_processing(struct kmem_cache *s,
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
-static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
static inline void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr) {}
+ enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -1984,7 +2057,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
* objects with no tag reference. Mark all references in this
* vector as empty to avoid warnings later on.
*/
- if (obj_exts & OBJEXTS_ALLOC_FAIL) {
+ if (obj_exts == OBJEXTS_ALLOC_FAIL) {
unsigned int i;
for (i = 0; i < objects; i++)
@@ -2017,6 +2090,7 @@ static inline void init_slab_obj_exts(struct slab *slab)
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
+ bool allow_spin = gfpflags_allow_spinning(gfp);
unsigned int objects = objs_per_slab(s, slab);
unsigned long new_exts;
unsigned long old_exts;
@@ -2025,17 +2099,32 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp &= ~OBJCGS_CLEAR_MASK;
/* Prevent recursive extension vector allocation */
gfp |= __GFP_NO_OBJ_EXT;
- vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
- slab_nid(slab));
+
+ /*
+	 * Note that allow_spin may be false during early boot due to its
+ * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting
+ * architectures with cmpxchg16b, early obj_exts will be missing for
+ * very early allocations on those.
+ */
+ if (unlikely(!allow_spin)) {
+ size_t sz = objects * sizeof(struct slabobj_ext);
+
+ vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
+ slab_nid(slab));
+ } else {
+ vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
+ slab_nid(slab));
+ }
if (!vec) {
/* Mark vectors which failed to allocate */
- if (new_slab)
- mark_failed_objexts_alloc(slab);
+ mark_failed_objexts_alloc(slab);
return -ENOMEM;
}
new_exts = (unsigned long)vec;
+ if (unlikely(!allow_spin))
+ new_exts |= OBJEXTS_NOSPIN_ALLOC;
#ifdef CONFIG_MEMCG
new_exts |= MEMCG_DATA_OBJEXTS;
#endif
@@ -2056,7 +2145,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
* objcg vector should be reused.
*/
mark_objexts_empty(vec);
- kfree(vec);
+ if (unlikely(!allow_spin))
+ kfree_nolock(vec);
+ else
+ kfree(vec);
return 0;
}
@@ -2080,7 +2172,10 @@ static inline void free_slab_obj_exts(struct slab *slab)
* the extension for obj_exts is expected to be NULL.
*/
mark_objexts_empty(obj_exts);
- kfree(obj_exts);
+ if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC))
+ kfree_nolock(obj_exts);
+ else
+ kfree(obj_exts);
slab->obj_exts = 0;
}
@@ -2109,15 +2204,6 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
{
struct slab *slab;
- if (!p)
- return NULL;
-
- if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
- return NULL;
-
- if (flags & __GFP_NO_OBJ_EXT)
- return NULL;
-
slab = virt_to_slab(p);
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
@@ -2135,6 +2221,15 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
struct slabobj_ext *obj_exts;
+ if (!object)
+ return;
+
+ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
+ return;
+
+ if (flags & __GFP_NO_OBJ_EXT)
+ return;
+
obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
/*
* Currently obj_exts is used only for allocation profiling.
@@ -2143,6 +2238,8 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
*/
if (likely(obj_exts))
alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
+ else
+ alloc_tag_set_inaccurate(current->alloc_tag);
}
static inline void
@@ -2414,7 +2511,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
}
/* KASAN might put x into memory quarantine, delaying its reuse. */
- return !kasan_slab_free(s, x, init, still_accessible);
+ return !kasan_slab_free(s, x, init, still_accessible, false);
}
static __fastpath_inline
@@ -2473,17 +2570,463 @@ static void *setup_object(struct kmem_cache *s, void *object)
return object;
}
+static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
+{
+ struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects,
+ s->sheaf_capacity), gfp);
+
+ if (unlikely(!sheaf))
+ return NULL;
+
+ sheaf->cache = s;
+
+ stat(s, SHEAF_ALLOC);
+
+ return sheaf;
+}
+
+static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
+{
+ kfree(sheaf);
+
+ stat(s, SHEAF_FREE);
+}
+
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p);
+
+
+static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
+ gfp_t gfp)
+{
+ int to_fill = s->sheaf_capacity - sheaf->size;
+ int filled;
+
+ if (!to_fill)
+ return 0;
+
+ filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
+ &sheaf->objects[sheaf->size]);
+
+ sheaf->size += filled;
+
+ stat_add(s, SHEAF_REFILL, filled);
+
+ if (filled < to_fill)
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
+{
+ struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp);
+
+ if (!sheaf)
+ return NULL;
+
+ if (refill_sheaf(s, sheaf, gfp)) {
+ free_empty_sheaf(s, sheaf);
+ return NULL;
+ }
+
+ return sheaf;
+}
+
+/*
+ * Maximum number of objects freed during a single flush of main pcs sheaf.
+ * Translates directly to an on-stack array size.
+ */
+#define PCS_BATCH_MAX 32U
+
+static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
+
+/*
+ * Free all objects from the main sheaf. In order to perform
+ * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where
+ * object pointers are moved to an on-stack array under the lock. To bound the
+ * stack usage, limit each batch to PCS_BATCH_MAX.
+ *
+ * returns true if at least partially flushed
+ */
+static bool sheaf_flush_main(struct kmem_cache *s)
+{
+ struct slub_percpu_sheaves *pcs;
+ unsigned int batch, remaining;
+ void *objects[PCS_BATCH_MAX];
+ struct slab_sheaf *sheaf;
+ bool ret = false;
+
+next_batch:
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return ret;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+ sheaf = pcs->main;
+
+ batch = min(PCS_BATCH_MAX, sheaf->size);
+
+ sheaf->size -= batch;
+ memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *));
+
+ remaining = sheaf->size;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ __kmem_cache_free_bulk(s, batch, &objects[0]);
+
+ stat_add(s, SHEAF_FLUSH, batch);
+
+ ret = true;
+
+ if (remaining)
+ goto next_batch;
+
+ return ret;
+}
+
+/*
+ * Free all objects from a sheaf that's unused, i.e. not linked to any
+ * cpu_sheaves, so we need no locking and batching. The locking is also not
+ * necessary when flushing cpu's sheaves (both spare and main) during cpu
+ * hotremove as the cpu is not executing anymore.
+ */
+static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
+{
+ if (!sheaf->size)
+ return;
+
+ stat_add(s, SHEAF_FLUSH, sheaf->size);
+
+ __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]);
+
+ sheaf->size = 0;
+}
+
+static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+ struct slab_sheaf *sheaf)
+{
+ bool init = slab_want_init_on_free(s);
+ void **p = &sheaf->objects[0];
+ unsigned int i = 0;
+
+ while (i < sheaf->size) {
+ struct slab *slab = virt_to_slab(p[i]);
+
+ memcg_slab_free_hook(s, slab, p + i, 1);
+ alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+ if (unlikely(!slab_free_hook(s, p[i], init, true))) {
+ p[i] = p[--sheaf->size];
+ continue;
+ }
+
+ i++;
+ }
+}
+
+static void rcu_free_sheaf_nobarn(struct rcu_head *head)
+{
+ struct slab_sheaf *sheaf;
+ struct kmem_cache *s;
+
+ sheaf = container_of(head, struct slab_sheaf, rcu_head);
+ s = sheaf->cache;
+
+ __rcu_free_sheaf_prepare(s, sheaf);
+
+ sheaf_flush_unused(s, sheaf);
+
+ free_empty_sheaf(s, sheaf);
+}
+
+/*
+ * Caller needs to make sure migration is disabled in order to fully flush
+ * a single cpu's sheaves
+ *
+ * must not be called from an irq
+ *
+ * flushing operations are rare so let's keep it simple and flush to slabs
+ * directly, skipping the barn
+ */
+static void pcs_flush_all(struct kmem_cache *s)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *spare, *rcu_free;
+
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ spare = pcs->spare;
+ pcs->spare = NULL;
+
+ rcu_free = pcs->rcu_free;
+ pcs->rcu_free = NULL;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (spare) {
+ sheaf_flush_unused(s, spare);
+ free_empty_sheaf(s, spare);
+ }
+
+ if (rcu_free)
+ call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+
+ sheaf_flush_main(s);
+}
+
+static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
+{
+ struct slub_percpu_sheaves *pcs;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ /* The cpu is not executing anymore so we don't need pcs->lock */
+ sheaf_flush_unused(s, pcs->main);
+ if (pcs->spare) {
+ sheaf_flush_unused(s, pcs->spare);
+ free_empty_sheaf(s, pcs->spare);
+ pcs->spare = NULL;
+ }
+
+ if (pcs->rcu_free) {
+ call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+ pcs->rcu_free = NULL;
+ }
+}
+
+static void pcs_destroy(struct kmem_cache *s)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct slub_percpu_sheaves *pcs;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ /* can happen when unwinding failed create */
+ if (!pcs->main)
+ continue;
+
+ /*
+ * We have already passed __kmem_cache_shutdown() so everything
+ * was flushed and there should be no objects allocated from
+ * slabs, otherwise kmem_cache_destroy() would have aborted.
+ * Therefore something would have to be really wrong if the
+ * warnings here trigger, and we should rather leave objects and
+ * sheaves to leak in that case.
+ */
+
+ WARN_ON(pcs->spare);
+ WARN_ON(pcs->rcu_free);
+
+ if (!WARN_ON(pcs->main->size)) {
+ free_empty_sheaf(s, pcs->main);
+ pcs->main = NULL;
+ }
+ }
+
+ free_percpu(s->cpu_sheaves);
+ s->cpu_sheaves = NULL;
+}
+
+static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
+{
+ struct slab_sheaf *empty = NULL;
+ unsigned long flags;
+
+ if (!data_race(barn->nr_empty))
+ return NULL;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ if (likely(barn->nr_empty)) {
+ empty = list_first_entry(&barn->sheaves_empty,
+ struct slab_sheaf, barn_list);
+ list_del(&empty->barn_list);
+ barn->nr_empty--;
+ }
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ return empty;
+}
+
+/*
+ * The following two functions are used mainly in cases where we have to undo an
+ * intended action due to a race or cpu migration. Thus they do not check the
+ * empty or full sheaf limits for simplicity.
+ */
+
+static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ list_add(&sheaf->barn_list, &barn->sheaves_empty);
+ barn->nr_empty++;
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+}
+
+static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ list_add(&sheaf->barn_list, &barn->sheaves_full);
+ barn->nr_full++;
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+}
+
+static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
+{
+ struct slab_sheaf *sheaf = NULL;
+ unsigned long flags;
+
+ if (!data_race(barn->nr_full) && !data_race(barn->nr_empty))
+ return NULL;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ if (barn->nr_full) {
+ sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
+ barn_list);
+ list_del(&sheaf->barn_list);
+ barn->nr_full--;
+ } else if (barn->nr_empty) {
+ sheaf = list_first_entry(&barn->sheaves_empty,
+ struct slab_sheaf, barn_list);
+ list_del(&sheaf->barn_list);
+ barn->nr_empty--;
+ }
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ return sheaf;
+}
+
+/*
+ * If a full sheaf is available, return it and put the supplied empty one to
+ * If a full sheaf is available, return it and put the supplied empty one into
+ * the barn. We ignore the limit on empty sheaves as the number of sheaves doesn't
+ */
+static struct slab_sheaf *
+barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
+{
+ struct slab_sheaf *full = NULL;
+ unsigned long flags;
+
+ if (!data_race(barn->nr_full))
+ return NULL;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ if (likely(barn->nr_full)) {
+ full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
+ barn_list);
+ list_del(&full->barn_list);
+ list_add(&empty->barn_list, &barn->sheaves_empty);
+ barn->nr_full--;
+ barn->nr_empty++;
+ }
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ return full;
+}
+
+/*
+ * If an empty sheaf is available, return it and put the supplied full one into
+ * the barn. But if there are too many full sheaves, reject this with -E2BIG.
+ */
+static struct slab_sheaf *
+barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
+{
+ struct slab_sheaf *empty;
+ unsigned long flags;
+
+ /* we don't repeat this check under barn->lock as it's not critical */
+ if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES)
+ return ERR_PTR(-E2BIG);
+ if (!data_race(barn->nr_empty))
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ if (likely(barn->nr_empty)) {
+ empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
+ barn_list);
+ list_del(&empty->barn_list);
+ list_add(&full->barn_list, &barn->sheaves_full);
+ barn->nr_empty--;
+ barn->nr_full++;
+ } else {
+ empty = ERR_PTR(-ENOMEM);
+ }
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ return empty;
+}
+
+static void barn_init(struct node_barn *barn)
+{
+ spin_lock_init(&barn->lock);
+ INIT_LIST_HEAD(&barn->sheaves_full);
+ INIT_LIST_HEAD(&barn->sheaves_empty);
+ barn->nr_full = 0;
+ barn->nr_empty = 0;
+}
+
+static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
+{
+ struct list_head empty_list;
+ struct list_head full_list;
+ struct slab_sheaf *sheaf, *sheaf2;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&empty_list);
+ INIT_LIST_HEAD(&full_list);
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ list_splice_init(&barn->sheaves_full, &full_list);
+ barn->nr_full = 0;
+ list_splice_init(&barn->sheaves_empty, &empty_list);
+ barn->nr_empty = 0;
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) {
+ sheaf_flush_unused(s, sheaf);
+ free_empty_sheaf(s, sheaf);
+ }
+
+ list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list)
+ free_empty_sheaf(s, sheaf);
+}
+
/*
* Slab allocation and freeing
*/
static inline struct slab *alloc_slab_page(gfp_t flags, int node,
- struct kmem_cache_order_objects oo)
+ struct kmem_cache_order_objects oo,
+ bool allow_spin)
{
struct folio *folio;
struct slab *slab;
unsigned int order = oo_order(oo);
- if (node == NUMA_NO_NODE)
+ if (unlikely(!allow_spin))
+ folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
+ node, order);
+ else if (node == NUMA_NO_NODE)
folio = (struct folio *)alloc_frozen_pages(flags, order);
else
folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
@@ -2633,6 +3176,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
+ bool allow_spin = gfpflags_allow_spinning(flags);
struct slab *slab;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
@@ -2652,7 +3196,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
- slab = alloc_slab_page(alloc_gfp, node, oo);
+ /*
+ * __GFP_RECLAIM could be cleared on the first allocation attempt,
+ * so pass allow_spin flag directly.
+ */
+ slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
if (unlikely(!slab)) {
oo = s->min;
alloc_gfp = flags;
@@ -2660,7 +3208,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- slab = alloc_slab_page(alloc_gfp, node, oo);
+ slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
if (unlikely(!slab))
return NULL;
stat(s, ORDER_FALLBACK);
@@ -2755,17 +3303,17 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab)
static inline bool slab_test_node_partial(const struct slab *slab)
{
- return test_bit(SL_partial, &slab->flags);
+ return test_bit(SL_partial, &slab->flags.f);
}
static inline void slab_set_node_partial(struct slab *slab)
{
- set_bit(SL_partial, &slab->flags);
+ set_bit(SL_partial, &slab->flags.f);
}
static inline void slab_clear_node_partial(struct slab *slab)
{
- clear_bit(SL_partial, &slab->flags);
+ clear_bit(SL_partial, &slab->flags.f);
}
/*
@@ -2811,13 +3359,21 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
lockdep_assert_held(&n->list_lock);
+#ifdef CONFIG_SLUB_DEBUG
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!validate_slab_ptr(slab)) {
+ slab_err(s, slab, "Not a valid slab page");
+ return NULL;
+ }
+ }
+#endif
+
object = slab->freelist;
slab->freelist = get_freepointer(s, object);
slab->inuse++;
if (!alloc_debug_processing(s, slab, object, orig_size)) {
- if (folio_test_slab(slab_folio(slab)))
- remove_partial(n, slab);
+ remove_partial(n, slab);
return NULL;
}
@@ -2829,33 +3385,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
return object;
}
+static void defer_deactivate_slab(struct slab *slab, void *flush_freelist);
+
/*
* Called only for kmem_cache_debug() caches to allocate from a freshly
* allocated slab. Allocate a single object instead of whole freelist
* and put the slab to the partial (or full) list.
*/
-static void *alloc_single_from_new_slab(struct kmem_cache *s,
- struct slab *slab, int orig_size)
+static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
+ int orig_size, gfp_t gfpflags)
{
+ bool allow_spin = gfpflags_allow_spinning(gfpflags);
int nid = slab_nid(slab);
struct kmem_cache_node *n = get_node(s, nid);
unsigned long flags;
void *object;
+ if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
+ /* Unlucky, discard newly allocated slab */
+ slab->frozen = 1;
+ defer_deactivate_slab(slab, NULL);
+ return NULL;
+ }
object = slab->freelist;
slab->freelist = get_freepointer(s, object);
slab->inuse = 1;
- if (!alloc_debug_processing(s, slab, object, orig_size))
+ if (!alloc_debug_processing(s, slab, object, orig_size)) {
/*
* It's not really expected that this would fail on a
* freshly allocated slab, but a concurrent memory
* corruption in theory could cause that.
+ * Leak memory of allocated slab.
*/
+ if (!allow_spin)
+ spin_unlock_irqrestore(&n->list_lock, flags);
return NULL;
+ }
- spin_lock_irqsave(&n->list_lock, flags);
+ if (allow_spin)
+ spin_lock_irqsave(&n->list_lock, flags);
if (slab->inuse == slab->objects)
add_full(s, n, slab);
@@ -2896,7 +3466,10 @@ static struct slab *get_partial_node(struct kmem_cache *s,
if (!n || !n->nr_partial)
return NULL;
- spin_lock_irqsave(&n->list_lock, flags);
+ if (gfpflags_allow_spinning(pc->flags))
+ spin_lock_irqsave(&n->list_lock, flags);
+ else if (!spin_trylock_irqsave(&n->list_lock, flags))
+ return NULL;
list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
if (!pfmemalloc_match(slab, pc->flags))
continue;
@@ -3064,30 +3637,46 @@ static inline void note_cmpxchg_failure(const char *n,
pr_info("%s %s: cmpxchg redo ", n, s->name);
-#ifdef CONFIG_PREEMPTION
- if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+ if (IS_ENABLED(CONFIG_PREEMPTION) &&
+ tid_to_cpu(tid) != tid_to_cpu(actual_tid)) {
pr_warn("due to cpu change %d -> %d\n",
tid_to_cpu(tid), tid_to_cpu(actual_tid));
- else
-#endif
- if (tid_to_event(tid) != tid_to_event(actual_tid))
+ } else if (tid_to_event(tid) != tid_to_event(actual_tid)) {
pr_warn("due to cpu running other code. Event %ld->%ld\n",
tid_to_event(tid), tid_to_event(actual_tid));
- else
+ } else {
pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
actual_tid, tid, next_tid(tid));
+ }
#endif
stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}
static void init_kmem_cache_cpus(struct kmem_cache *s)
{
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Register lockdep key for non-boot kmem caches to avoid
+	 * WARN_ON_ONCE(static_obj(key)) in lockdep_register_key()
+ */
+ bool finegrain_lockdep = !init_section_contains(s, 1);
+#else
+ /*
+ * Don't bother with different lockdep classes for each
+ * kmem_cache, since we only use local_trylock_irqsave().
+ */
+ bool finegrain_lockdep = false;
+#endif
int cpu;
struct kmem_cache_cpu *c;
+ if (finegrain_lockdep)
+ lockdep_register_key(&s->lock_key);
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(s->cpu_slab, cpu);
- local_lock_init(&c->lock);
+ local_trylock_init(&c->lock);
+ if (finegrain_lockdep)
+ lockdep_set_class(&c->lock, &s->lock_key);
c->tid = init_tid(cpu);
}
}
@@ -3178,6 +3767,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
}
}
+/*
+ * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
+ * can be acquired without a deadlock before invoking the function.
+ *
+ * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
+ * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
+ * and kmalloc() is not used in an unsupported context.
+ *
+ * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
+ * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
+ * lockdep_assert() will catch a bug in case:
+ * #1
+ * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
+ * or
+ * #2
+ * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
+ *
+ * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
+ * disabled context. The lock will always be acquired and, if needed, the
+ * caller will block and sleep until the lock is available.
+ * #1 is possible in !PREEMPT_RT only.
+ * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
+ * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
+ * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
+ *
+ * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
+ */
+#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
+#define local_lock_cpu_slab(s, flags) \
+ local_lock_irqsave(&(s)->cpu_slab->lock, flags)
+#else
+#define local_lock_cpu_slab(s, flags) \
+ do { \
+ bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
+ lockdep_assert(__l); \
+ } while (0)
+#endif
+
+#define local_unlock_cpu_slab(s, flags) \
+ local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
+
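As a sketch of the rule above (the function name is made up): a !allow_spin caller is expected to test local_lock_is_locked() first and bail out instead of recursing into a lock its own CPU may already hold, mirroring the !allow_spin paths in __slab_alloc() and do_slab_free() later in this patch:

static bool my_locked_cpu_slab_op(struct kmem_cache *s, gfp_t gfpflags)
{
	unsigned long flags;

	if (!gfpflags_allow_spinning(gfpflags) &&
	    local_lock_is_locked(&s->cpu_slab->lock))
		return false;	/* caller retries, e.g. with another bucket */

	local_lock_cpu_slab(s, flags);
	/* ... c->freelist / c->tid may be manipulated safely here ... */
	local_unlock_cpu_slab(s, flags);

	return true;
}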
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
{
@@ -3262,7 +3892,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
unsigned long flags;
int slabs = 0;
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
oldslab = this_cpu_read(s->cpu_slab->partial);
@@ -3287,7 +3917,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
this_cpu_write(s->cpu_slab->partial, slab);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
if (slab_to_put) {
__put_partials(s, slab_to_put);
@@ -3344,11 +3974,40 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
put_partials_cpu(s, c);
}
-struct slub_flush_work {
- struct work_struct work;
- struct kmem_cache *s;
- bool skip;
-};
+static inline void flush_this_cpu_slab(struct kmem_cache *s)
+{
+ struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
+
+ if (c->slab)
+ flush_slab(s, c);
+
+ put_partials(s);
+}
+
+static bool has_cpu_slab(int cpu, struct kmem_cache *s)
+{
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+ return c->slab || slub_percpu_partial(c);
+}
+
+#else /* CONFIG_SLUB_TINY */
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
+static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; }
+static inline void flush_this_cpu_slab(struct kmem_cache *s) { }
+#endif /* CONFIG_SLUB_TINY */
+
+static bool has_pcs_used(int cpu, struct kmem_cache *s)
+{
+ struct slub_percpu_sheaves *pcs;
+
+ if (!s->cpu_sheaves)
+ return false;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ return (pcs->spare || pcs->rcu_free || pcs->main->size);
+}
/*
* Flush cpu slab.
@@ -3358,30 +4017,18 @@ struct slub_flush_work {
static void flush_cpu_slab(struct work_struct *w)
{
struct kmem_cache *s;
- struct kmem_cache_cpu *c;
struct slub_flush_work *sfw;
sfw = container_of(w, struct slub_flush_work, work);
s = sfw->s;
- c = this_cpu_ptr(s->cpu_slab);
-
- if (c->slab)
- flush_slab(s, c);
-
- put_partials(s);
-}
-static bool has_cpu_slab(int cpu, struct kmem_cache *s)
-{
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ if (s->cpu_sheaves)
+ pcs_flush_all(s);
- return c->slab || slub_percpu_partial(c);
+ flush_this_cpu_slab(s);
}
-static DEFINE_MUTEX(flush_lock);
-static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
-
static void flush_all_cpus_locked(struct kmem_cache *s)
{
struct slub_flush_work *sfw;
@@ -3392,7 +4039,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s)
for_each_online_cpu(cpu) {
sfw = &per_cpu(slub_flush, cpu);
- if (!has_cpu_slab(cpu, s)) {
+ if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) {
sfw->skip = true;
continue;
}
@@ -3419,6 +4066,74 @@ static void flush_all(struct kmem_cache *s)
cpus_read_unlock();
}
+static void flush_rcu_sheaf(struct work_struct *w)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *rcu_free;
+ struct slub_flush_work *sfw;
+ struct kmem_cache *s;
+
+ sfw = container_of(w, struct slub_flush_work, work);
+ s = sfw->s;
+
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ rcu_free = pcs->rcu_free;
+ pcs->rcu_free = NULL;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (rcu_free)
+ call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+}
+
+
+/* needed for kvfree_rcu_barrier() */
+void flush_all_rcu_sheaves(void)
+{
+ struct slub_flush_work *sfw;
+ struct kmem_cache *s;
+ unsigned int cpu;
+
+ cpus_read_lock();
+ mutex_lock(&slab_mutex);
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!s->cpu_sheaves)
+ continue;
+
+ mutex_lock(&flush_lock);
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+
+ /*
+			 * We don't check if an rcu_free sheaf exists - a racing
+			 * __kfree_rcu_sheaf() might have just removed it.
+			 * By executing flush_rcu_sheaf() on the cpu we make
+			 * sure the __kfree_rcu_sheaf() finished its call_rcu().
+ */
+
+ INIT_WORK(&sfw->work, flush_rcu_sheaf);
+ sfw->s = s;
+ queue_work_on(cpu, flushwq, &sfw->work);
+ }
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ flush_work(&sfw->work);
+ }
+
+ mutex_unlock(&flush_lock);
+ }
+
+ mutex_unlock(&slab_mutex);
+ cpus_read_unlock();
+
+ rcu_barrier();
+}
+
/*
* Use the cpu notifier to insure that the cpu slabs are flushed when
* necessary.
@@ -3428,19 +4143,15 @@ static int slub_cpu_dead(unsigned int cpu)
struct kmem_cache *s;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list)
+ list_for_each_entry(s, &slab_caches, list) {
__flush_cpu_slab(s, cpu);
+ if (s->cpu_sheaves)
+ __pcs_flush_all_cpu(s, cpu);
+ }
mutex_unlock(&slab_mutex);
return 0;
}
-#else /* CONFIG_SLUB_TINY */
-static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
-static inline void flush_all(struct kmem_cache *s) { }
-static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
-static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
-#endif /* CONFIG_SLUB_TINY */
-
/*
* Check if the objects in a per cpu structure fit numa
* locality expectations.
@@ -3721,6 +4432,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
+ bool allow_spin = gfpflags_allow_spinning(gfpflags);
void *freelist;
struct slab *slab;
unsigned long flags;
@@ -3746,9 +4458,21 @@ reread_slab:
if (unlikely(!node_match(slab, node))) {
/*
* same as above but node_match() being false already
- * implies node != NUMA_NO_NODE
+ * implies node != NUMA_NO_NODE.
+ *
+ * We don't strictly honor pfmemalloc and NUMA preferences
+ * when !allow_spin because:
+ *
+ * 1. Most kmalloc() users allocate objects on the local node,
+ * so kmalloc_nolock() tries not to interfere with them by
+ * deactivating the cpu slab.
+ *
+ * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
+ * unnecessary slab allocations even when n->partial list
+ * is not empty.
*/
- if (!node_isset(node, slab_nodes)) {
+ if (!node_isset(node, slab_nodes) ||
+ !allow_spin) {
node = NUMA_NO_NODE;
} else {
stat(s, ALLOC_NODE_MISMATCH);
@@ -3761,13 +4485,14 @@ reread_slab:
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
- if (unlikely(!pfmemalloc_match(slab, gfpflags)))
+ if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
goto deactivate_slab;
/* must check again c->slab in case we got preempted and it changed */
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
+
if (unlikely(slab != c->slab)) {
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
goto reread_slab;
}
freelist = c->freelist;
@@ -3779,7 +4504,7 @@ reread_slab:
if (!freelist) {
c->slab = NULL;
c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -3798,34 +4523,34 @@ load_freelist:
VM_BUG_ON(!c->slab->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
return freelist;
deactivate_slab:
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
if (slab != c->slab) {
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
goto reread_slab;
}
freelist = c->freelist;
c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
deactivate_slab(s, slab, freelist);
new_slab:
#ifdef CONFIG_SLUB_CPU_PARTIAL
while (slub_percpu_partial(c)) {
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
if (unlikely(c->slab)) {
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
goto reread_slab;
}
if (unlikely(!slub_percpu_partial(c))) {
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
/* we were preempted and partial list got empty */
goto new_objects;
}
@@ -3834,7 +4559,8 @@ new_slab:
slub_set_percpu_partial(c, slab);
if (likely(node_match(slab, node) &&
- pfmemalloc_match(slab, gfpflags))) {
+ pfmemalloc_match(slab, gfpflags)) ||
+ !allow_spin) {
c->slab = slab;
freelist = get_freelist(s, slab);
VM_BUG_ON(!freelist);
@@ -3842,7 +4568,7 @@ new_slab:
goto load_freelist;
}
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
slab->next = NULL;
__put_partials(s, slab);
@@ -3864,8 +4590,13 @@ new_objects:
* allocating new page from other nodes
*/
if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
- && try_thisnode))
- pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+ && try_thisnode)) {
+ if (unlikely(!allow_spin))
+ /* Do not upgrade gfp to NOWAIT from more restrictive mode */
+ pc.flags = gfpflags | __GFP_THISNODE;
+ else
+ pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+ }
pc.orig_size = orig_size;
slab = get_partial(s, node, &pc);
@@ -3876,9 +4607,14 @@ new_objects:
* For debug caches here we had to go through
* alloc_single_from_partial() so just store the
* tracking info and return the object.
+ *
+ * Due to disabled preemption we need to disallow
+ * blocking. The flags are further adjusted by
+ * gfp_nested_mask() in stack_depot itself.
*/
if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr);
+ set_track(s, freelist, TRACK_ALLOC, addr,
+ gfpflags & ~(__GFP_DIRECT_RECLAIM));
return freelist;
}
@@ -3904,13 +4640,14 @@ new_objects:
stat(s, ALLOC_SLAB);
if (kmem_cache_debug(s)) {
- freelist = alloc_single_from_new_slab(s, slab, orig_size);
+ freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
if (unlikely(!freelist))
goto new_objects;
if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr);
+ set_track(s, freelist, TRACK_ALLOC, addr,
+ gfpflags & ~(__GFP_DIRECT_RECLAIM));
return freelist;
}
@@ -3926,7 +4663,7 @@ new_objects:
inc_slabs_node(s, slab_nid(slab), slab->objects);
- if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
+ if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
/*
* For !pfmemalloc_match() case we don't load freelist so that
* we don't make further mismatched allocations easier.
@@ -3937,7 +4674,7 @@ new_objects:
retry_load_slab:
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
if (unlikely(c->slab)) {
void *flush_freelist = c->freelist;
struct slab *flush_slab = c->slab;
@@ -3946,9 +4683,14 @@ retry_load_slab:
c->freelist = NULL;
c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
- deactivate_slab(s, flush_slab, flush_freelist);
+ if (unlikely(!allow_spin)) {
+ /* Reentrant slub cannot take locks, defer */
+ defer_deactivate_slab(flush_slab, flush_freelist);
+ } else {
+ deactivate_slab(s, flush_slab, flush_freelist);
+ }
stat(s, CPUSLAB_FLUSH);
@@ -3958,6 +4700,19 @@ retry_load_slab:
goto load_freelist;
}
+/*
+ * We disallow kprobes in ___slab_alloc() to prevent reentrance
+ *
+ * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
+ * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
+ * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
+ * manipulating c->freelist without lock.
+ *
+ * This does not prevent kprobe in functions called from ___slab_alloc() such as
+ * local_lock_irqsave() itself, and that is fine, we only need to protect the
+ * c->freelist manipulation in ___slab_alloc() itself.
+ */
+NOKPROBE_SYMBOL(___slab_alloc);
/*
* A wrapper for ___slab_alloc() for contexts where preemption is not yet
@@ -3977,8 +4732,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
*/
c = slub_get_cpu_ptr(s->cpu_slab);
#endif
-
+ if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
+ if (local_lock_is_locked(&s->cpu_slab->lock)) {
+ /*
+ * EBUSY is an internal signal to kmalloc_nolock() to
+ * retry a different bucket. It's not propagated
+ * to the caller.
+ */
+ p = ERR_PTR(-EBUSY);
+ goto out;
+ }
+ }
p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
+out:
#ifdef CONFIG_PREEMPT_COUNT
slub_put_cpu_ptr(s->cpu_slab);
#endif
@@ -4102,7 +4868,7 @@ static void *__slab_alloc_node(struct kmem_cache *s,
return NULL;
}
- object = alloc_single_from_new_slab(s, slab, orig_size);
+ object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
return object;
}
@@ -4181,8 +4947,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
if (p[i] && init && (!kasan_init ||
!kasan_has_integrated_init()))
memset(p[i], 0, zero_size);
- kmemleak_alloc_recursive(p[i], s->object_size, 1,
- s->flags, init_flags);
+ if (gfpflags_allow_spinning(flags))
+ kmemleak_alloc_recursive(p[i], s->object_size, 1,
+ s->flags, init_flags);
kmsan_slab_alloc(s, p[i], init_flags);
alloc_tagging_slab_alloc_hook(s, p[i], flags);
}
@@ -4191,6 +4958,251 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
}
/*
+ * Replace the empty main sheaf with a (at least partially) full sheaf.
+ *
+ * Must be called with the cpu_sheaves local lock locked. If successful, returns
+ * the pcs pointer and the local lock locked (possibly on a different cpu than
+ * initially called). If not successful, returns NULL and the local lock
+ * unlocked.
+ */
+static struct slub_percpu_sheaves *
+__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
+{
+ struct slab_sheaf *empty = NULL;
+ struct slab_sheaf *full;
+ struct node_barn *barn;
+ bool can_alloc;
+
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+
+ if (pcs->spare && pcs->spare->size > 0) {
+ swap(pcs->main, pcs->spare);
+ return pcs;
+ }
+
+ barn = get_barn(s);
+
+ full = barn_replace_empty_sheaf(barn, pcs->main);
+
+ if (full) {
+ stat(s, BARN_GET);
+ pcs->main = full;
+ return pcs;
+ }
+
+ stat(s, BARN_GET_FAIL);
+
+ can_alloc = gfpflags_allow_blocking(gfp);
+
+ if (can_alloc) {
+ if (pcs->spare) {
+ empty = pcs->spare;
+ pcs->spare = NULL;
+ } else {
+ empty = barn_get_empty_sheaf(barn);
+ }
+ }
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (!can_alloc)
+ return NULL;
+
+ if (empty) {
+ if (!refill_sheaf(s, empty, gfp)) {
+ full = empty;
+ } else {
+ /*
+ * we must be very low on memory so don't bother
+ * with the barn
+ */
+ free_empty_sheaf(s, empty);
+ }
+ } else {
+ full = alloc_full_sheaf(s, gfp);
+ }
+
+ if (!full)
+ return NULL;
+
+ /*
+	 * We can reach here only when gfpflags_allow_blocking(),
+	 * so this must not be in irq context.
+ */
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ /*
+	 * If we are returning an empty sheaf, we either got it from the
+	 * barn or had to allocate one. If we are returning a full
+	 * sheaf, it's due to racing or being migrated to a different
+	 * cpu. Breaching the barn's sheaf limits should thus be rare
+	 * enough, so just ignore them to simplify the recovery.
+ */
+
+ if (pcs->main->size == 0) {
+ barn_put_empty_sheaf(barn, pcs->main);
+ pcs->main = full;
+ return pcs;
+ }
+
+ if (!pcs->spare) {
+ pcs->spare = full;
+ return pcs;
+ }
+
+ if (pcs->spare->size == 0) {
+ barn_put_empty_sheaf(barn, pcs->spare);
+ pcs->spare = full;
+ return pcs;
+ }
+
+ barn_put_full_sheaf(barn, full);
+ stat(s, BARN_PUT);
+
+ return pcs;
+}
+
+static __fastpath_inline
+void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
+{
+ struct slub_percpu_sheaves *pcs;
+ bool node_requested;
+ void *object;
+
+#ifdef CONFIG_NUMA
+ if (static_branch_unlikely(&strict_numa) &&
+ node == NUMA_NO_NODE) {
+
+ struct mempolicy *mpol = current->mempolicy;
+
+ if (mpol) {
+ /*
+ * Special BIND rule support. If the local node
+			 * is in the permitted set then do not redirect
+ * to a particular node.
+ * Otherwise we apply the memory policy to get
+ * the node we need to allocate on.
+ */
+ if (mpol->mode != MPOL_BIND ||
+ !node_isset(numa_mem_id(), mpol->nodes))
+
+ node = mempolicy_slab_node();
+ }
+ }
+#endif
+
+ node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE;
+
+ /*
+ * We assume the percpu sheaves contain only local objects although it's
+ * not completely guaranteed, so we verify later.
+ */
+ if (unlikely(node_requested && node != numa_mem_id()))
+ return NULL;
+
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return NULL;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(pcs->main->size == 0)) {
+ pcs = __pcs_replace_empty_main(s, pcs, gfp);
+ if (unlikely(!pcs))
+ return NULL;
+ }
+
+ object = pcs->main->objects[pcs->main->size - 1];
+
+ if (unlikely(node_requested)) {
+ /*
+ * Verify that the object was from the node we want. This could
+ * be false because of cpu migration during an unlocked part of
+ * the current allocation or previous freeing process.
+ */
+ if (folio_nid(virt_to_folio(object)) != node) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return NULL;
+ }
+ }
+
+ pcs->main->size--;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat(s, ALLOC_PCS);
+
+ return object;
+}
+
+static __fastpath_inline
+unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *main;
+ unsigned int allocated = 0;
+ unsigned int batch;
+
+next_batch:
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return allocated;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(pcs->main->size == 0)) {
+
+ struct slab_sheaf *full;
+
+ if (pcs->spare && pcs->spare->size > 0) {
+ swap(pcs->main, pcs->spare);
+ goto do_alloc;
+ }
+
+ full = barn_replace_empty_sheaf(get_barn(s), pcs->main);
+
+ if (full) {
+ stat(s, BARN_GET);
+ pcs->main = full;
+ goto do_alloc;
+ }
+
+ stat(s, BARN_GET_FAIL);
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ /*
+ * Once full sheaves in barn are depleted, let the bulk
+ * allocation continue from slab pages, otherwise we would just
+ * be copying arrays of pointers twice.
+ */
+ return allocated;
+ }
+
+do_alloc:
+
+ main = pcs->main;
+ batch = min(size, main->size);
+
+ main->size -= batch;
+ memcpy(p, main->objects + main->size, batch * sizeof(void *));
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat_add(s, ALLOC_PCS, batch);
+
+ allocated += batch;
+
+ if (batch < size) {
+ p += batch;
+ size -= batch;
+ goto next_batch;
+ }
+
+ return allocated;
+}
+
+
+/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* overhead for requests that can be satisfied on the fastpath.
@@ -4214,7 +5226,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
if (unlikely(object))
goto out;
- object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
+ if (s->cpu_sheaves)
+ object = alloc_from_pcs(s, gfpflags, node);
+
+ if (!object)
+ object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
maybe_wipe_obj_freeptr(s, object);
init = slab_want_init_on_alloc(gfpflags, s);
@@ -4287,6 +5303,228 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
/*
+ * Returns a sheaf that holds at least the requested number of objects. When
+ * prefilling is needed, it is done with the given gfp flags.
+ *
+ * Returns NULL if the sheaf allocation or the prefilling failed.
+ */
+struct slab_sheaf *
+kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *sheaf = NULL;
+
+ if (unlikely(size > s->sheaf_capacity)) {
+
+ /*
+ * slab_debug disables cpu sheaves intentionally so all
+ * prefilled sheaves become "oversize" and we give up on
+ * performance for the debugging. Same with SLUB_TINY.
+ * Creating a cache without sheaves and then requesting a
+ * prefilled sheaf is however not expected, so warn.
+ */
+ WARN_ON_ONCE(s->sheaf_capacity == 0 &&
+ !IS_ENABLED(CONFIG_SLUB_TINY) &&
+ !(s->flags & SLAB_DEBUG_FLAGS));
+
+ sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
+ if (!sheaf)
+ return NULL;
+
+ stat(s, SHEAF_PREFILL_OVERSIZE);
+ sheaf->cache = s;
+ sheaf->capacity = size;
+
+ if (!__kmem_cache_alloc_bulk(s, gfp, size,
+ &sheaf->objects[0])) {
+ kfree(sheaf);
+ return NULL;
+ }
+
+ sheaf->size = size;
+
+ return sheaf;
+ }
+
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (pcs->spare) {
+ sheaf = pcs->spare;
+ pcs->spare = NULL;
+ stat(s, SHEAF_PREFILL_FAST);
+ } else {
+ stat(s, SHEAF_PREFILL_SLOW);
+ sheaf = barn_get_full_or_empty_sheaf(get_barn(s));
+ if (sheaf && sheaf->size)
+ stat(s, BARN_GET);
+ else
+ stat(s, BARN_GET_FAIL);
+ }
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (!sheaf)
+ sheaf = alloc_empty_sheaf(s, gfp);
+
+ if (sheaf && sheaf->size < size) {
+ if (refill_sheaf(s, sheaf, gfp)) {
+ sheaf_flush_unused(s, sheaf);
+ free_empty_sheaf(s, sheaf);
+ sheaf = NULL;
+ }
+ }
+
+ if (sheaf)
+ sheaf->capacity = s->sheaf_capacity;
+
+ return sheaf;
+}
+
+/*
+ * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
+ *
+ * If the sheaf cannot simply become the percpu spare sheaf, but there's space
+ * for a full sheaf in the barn, we try to refill the sheaf back to the cache's
+ * sheaf_capacity to avoid handling partially full sheaves.
+ *
+ * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the
+ * sheaf is instead flushed and freed.
+ */
+void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf *sheaf)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct node_barn *barn;
+
+ if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
+ sheaf_flush_unused(s, sheaf);
+ kfree(sheaf);
+ return;
+ }
+
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+ barn = get_barn(s);
+
+ if (!pcs->spare) {
+ pcs->spare = sheaf;
+ sheaf = NULL;
+ stat(s, SHEAF_RETURN_FAST);
+ }
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (!sheaf)
+ return;
+
+ stat(s, SHEAF_RETURN_SLOW);
+
+ /*
+ * If the barn has too many full sheaves or we fail to refill the sheaf,
+ * simply flush and free it.
+ */
+ if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
+ refill_sheaf(s, sheaf, gfp)) {
+ sheaf_flush_unused(s, sheaf);
+ free_empty_sheaf(s, sheaf);
+ return;
+ }
+
+ barn_put_full_sheaf(barn, sheaf);
+ stat(s, BARN_PUT);
+}
+
+/*
+ * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
+ * the given size
+ *
+ * The sheaf might be replaced by a new one when requesting more than
+ * s->sheaf_capacity objects, if such a replacement is necessary. If the refill
+ * fails (returning -ENOMEM), the existing sheaf is left intact.
+ *
+ * In practice we always refill to full sheaf's capacity.
+ */
+int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf **sheafp, unsigned int size)
+{
+ struct slab_sheaf *sheaf;
+
+ /*
+	 * TODO: do we want to support *sheaf == NULL to be the equivalent of
+	 * kmem_cache_prefill_sheaf()?
+ */
+ if (!sheafp || !(*sheafp))
+ return -EINVAL;
+
+ sheaf = *sheafp;
+ if (sheaf->size >= size)
+ return 0;
+
+ if (likely(sheaf->capacity >= size)) {
+ if (likely(sheaf->capacity == s->sheaf_capacity))
+ return refill_sheaf(s, sheaf, gfp);
+
+ if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
+ &sheaf->objects[sheaf->size])) {
+ return -ENOMEM;
+ }
+ sheaf->size = sheaf->capacity;
+
+ return 0;
+ }
+
+ /*
+ * We had a regular sized sheaf and need an oversize one, or we had an
+ * oversize one already but need a larger one now.
+ * This should be a very rare path so let's not complicate it.
+ */
+ sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
+ if (!sheaf)
+ return -ENOMEM;
+
+ kmem_cache_return_sheaf(s, gfp, *sheafp);
+ *sheafp = sheaf;
+ return 0;
+}
+
+/*
+ * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
+ *
+ * Guaranteed not to fail for as many allocations as the requested prefill size.
+ * After the sheaf is emptied, further allocations fail - there is no fallback
+ * to the slab cache itself.
+ *
+ * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT;
+ * memcg charging is forced over the limit if necessary, to avoid failure.
+ */
+void *
+kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf *sheaf)
+{
+ void *ret = NULL;
+ bool init;
+
+ if (sheaf->size == 0)
+ goto out;
+
+ ret = sheaf->objects[--sheaf->size];
+
+ init = slab_want_init_on_alloc(gfp, s);
+
+ /* add __GFP_NOFAIL to force successful memcg charging */
+ slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
+out:
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
+
+ return ret;
+}
+
+unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
+{
+ return sheaf->size;
+}
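Taken together, a minimal usage sketch of the prefilled-sheaf API above. The object type, the batch size and my_cache (assumed to have been created with sheaves enabled) are illustrative, and kmem_cache_alloc_from_sheaf() is assumed to be the usual alloc_hooks() wrapper of the _noprof variant:

struct my_obj {
	unsigned long payload;
};

static struct kmem_cache *my_cache;	/* assumed: created with sheaves enabled */

static int my_batch_alloc(void)
{
	struct my_obj *objs[8];
	struct slab_sheaf *sheaf;
	unsigned int i;

	/* sleepable context: prefill enough objects for the whole batch */
	sheaf = kmem_cache_prefill_sheaf(my_cache, GFP_KERNEL, ARRAY_SIZE(objs));
	if (!sheaf)
		return -ENOMEM;

	/*
	 * e.g. under a spinlock: guaranteed to succeed for up to
	 * ARRAY_SIZE(objs) allocations; gfp only carries __GFP_ZERO here
	 */
	for (i = 0; i < ARRAY_SIZE(objs); i++)
		objs[i] = kmem_cache_alloc_from_sheaf(my_cache, __GFP_ZERO, sheaf);

	/* optionally top the sheaf back up for another batch */
	if (kmem_cache_refill_sheaf(my_cache, GFP_KERNEL, &sheaf, ARRAY_SIZE(objs)))
		pr_debug("refill failed, %u objects left in the sheaf\n",
			 kmem_cache_sheaf_size(sheaf));

	kmem_cache_return_sheaf(my_cache, GFP_KERNEL, sheaf);

	for (i = 0; i < ARRAY_SIZE(objs); i++)
		kmem_cache_free(my_cache, objs[i]);

	return 0;
}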
+/*
* To avoid unnecessary overhead, we pass through large allocation requests
* directly to the page allocator. We use __GFP_COMP, because we will need to
* know the allocation order to free the pages properly in kfree.
@@ -4378,6 +5616,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags)
}
EXPORT_SYMBOL(__kmalloc_noprof);
+/**
+ * kmalloc_nolock - Allocate an object of given size from any context.
+ * @size: size to allocate
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT
+ * allowed.
+ * @node: node number of the target node.
+ *
+ * Return: pointer to the new object or NULL in case of error.
+ * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
+ * There is no reason to call it again and expect !NULL.
+ */
+void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
+{
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
+ struct kmem_cache *s;
+ bool can_retry = true;
+ void *ret = ERR_PTR(-EBUSY);
+
+ VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
+ __GFP_NO_OBJ_EXT));
+
+ if (unlikely(!size))
+ return ZERO_SIZE_PTR;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */
+ return NULL;
+retry:
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
+ s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);
+
+ if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
+ /*
+ * kmalloc_nolock() is not supported on architectures that
+ * don't implement cmpxchg16b, but debug caches don't use
+ * per-cpu slab and per-cpu partial slabs. They rely on
+ * kmem_cache_node->list_lock, so kmalloc_nolock() can
+ * attempt to allocate from debug caches by
+ * spin_trylock_irqsave(&n->list_lock, ...)
+ */
+ return NULL;
+
+ /*
+ * Do not call slab_alloc_node(), since trylock mode isn't
+ * compatible with slab_pre_alloc_hook/should_failslab and
+ * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
+ * and slab_post_alloc_hook() directly.
+ *
+ * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
+ * in irq saved region. It assumes that the same cpu will not
+ * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
+ * Therefore use in_nmi() to check whether particular bucket is in
+ * irq protected section.
+ *
+ * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
+ * this cpu was interrupted somewhere inside ___slab_alloc() after
+ * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
+ * In this case fast path with __update_cpu_freelist_fast() is not safe.
+ */
+#ifndef CONFIG_SLUB_TINY
+ if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
+#endif
+ ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
+
+ if (PTR_ERR(ret) == -EBUSY) {
+ if (can_retry) {
+ /* pick the next kmalloc bucket */
+ size = s->object_size + 1;
+ /*
+ * Another alternative is to
+ * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
+ * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
+ * to retry from bucket of the same size.
+ */
+ can_retry = false;
+ goto retry;
+ }
+ ret = NULL;
+ }
+
+ maybe_wipe_obj_freeptr(s, ret);
+ slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
+ slab_want_init_on_alloc(alloc_gfp, s), size);
+
+ ret = kasan_kmalloc(s, ret, size, alloc_gfp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
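A minimal sketch of the intended pairing with kfree_nolock() below, from a context where a regular kmalloc() would be unsafe; the event structure and handler are made up, and kmalloc_nolock() is assumed to be the usual alloc_hooks() wrapper of the _noprof function above:

struct my_event {
	u64 ts;
};

static void my_nmi_like_handler(void)
{
	struct my_event *ev;

	ev = kmalloc_nolock(sizeof(*ev), __GFP_ZERO, NUMA_NO_NODE);
	if (!ev)
		return;		/* NULL means ENOMEM; retrying won't help */

	ev->ts = ktime_get_mono_fast_ns();	/* NMI-safe clock */
	/* ... hand the event off ... */

	kfree_nolock(ev);	/* only valid for kmalloc_nolock() objects */
}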
+
void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
int node, unsigned long caller)
{
@@ -4421,8 +5749,12 @@ static noinline void free_to_partial_list(
unsigned long flags;
depot_stack_handle_t handle = 0;
+ /*
+ * We cannot use GFP_NOWAIT as there are callsites where waking up
+ * kswapd could deadlock
+ */
if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
+ handle = set_track_prepare(__GFP_NOWARN);
spin_lock_irqsave(&n->list_lock, flags);
@@ -4591,6 +5923,537 @@ slab_empty:
discard_slab(s, slab);
}
+/*
+ * pcs is locked. We should have gotten rid of the spare sheaf and obtained an
+ * empty sheaf, while the main sheaf is full. We want to install the empty sheaf
+ * as a main sheaf, and make the current main sheaf a spare sheaf.
+ *
+ * However due to having relinquished the cpu_sheaves lock when obtaining
+ * the empty sheaf, we need to handle some unlikely but possible cases.
+ *
+ * If we put any sheaf to barn here, it's because we were interrupted or have
+ * been migrated to a different cpu, which should be rare enough so just ignore
+ * the barn's limits to simplify the handling.
+ *
+ * An alternative scenario that gets us here is when we fail
+ * barn_replace_full_sheaf(), because there's no empty sheaf available in the
+ * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the
+ * limit on full sheaves was not exceeded, we assume it didn't change and just
+ * put the full sheaf there.
+ */
+static void __pcs_install_empty_sheaf(struct kmem_cache *s,
+ struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty)
+{
+ struct node_barn *barn;
+
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+
+ /* This is what we expect to find if nobody interrupted us. */
+ if (likely(!pcs->spare)) {
+ pcs->spare = pcs->main;
+ pcs->main = empty;
+ return;
+ }
+
+ barn = get_barn(s);
+
+ /*
+ * Unlikely because if the main sheaf had space, we would have just
+ * freed to it. Get rid of our empty sheaf.
+ */
+ if (pcs->main->size < s->sheaf_capacity) {
+ barn_put_empty_sheaf(barn, empty);
+ return;
+ }
+
+ /* Also unlikely for the same reason */
+ if (pcs->spare->size < s->sheaf_capacity) {
+ swap(pcs->main, pcs->spare);
+ barn_put_empty_sheaf(barn, empty);
+ return;
+ }
+
+ /*
+ * We probably failed barn_replace_full_sheaf() due to no empty sheaf
+ * available there, but we allocated one, so finish the job.
+ */
+ barn_put_full_sheaf(barn, pcs->main);
+ stat(s, BARN_PUT);
+ pcs->main = empty;
+}
+
+/*
+ * Replace the full main sheaf with a (at least partially) empty sheaf.
+ *
+ * Must be called with the cpu_sheaves local lock locked. If successful, returns
+ * the pcs pointer and the local lock locked (possibly on a different cpu than
+ * initially called). If not successful, returns NULL and the local lock
+ * unlocked.
+ */
+static struct slub_percpu_sheaves *
+__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
+{
+ struct slab_sheaf *empty;
+ struct node_barn *barn;
+ bool put_fail;
+
+restart:
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+
+ barn = get_barn(s);
+ put_fail = false;
+
+ if (!pcs->spare) {
+ empty = barn_get_empty_sheaf(barn);
+ if (empty) {
+ pcs->spare = pcs->main;
+ pcs->main = empty;
+ return pcs;
+ }
+ goto alloc_empty;
+ }
+
+ if (pcs->spare->size < s->sheaf_capacity) {
+ swap(pcs->main, pcs->spare);
+ return pcs;
+ }
+
+ empty = barn_replace_full_sheaf(barn, pcs->main);
+
+ if (!IS_ERR(empty)) {
+ stat(s, BARN_PUT);
+ pcs->main = empty;
+ return pcs;
+ }
+
+ if (PTR_ERR(empty) == -E2BIG) {
+ /* Since we got here, spare exists and is full */
+ struct slab_sheaf *to_flush = pcs->spare;
+
+ stat(s, BARN_PUT_FAIL);
+
+ pcs->spare = NULL;
+ local_unlock(&s->cpu_sheaves->lock);
+
+ sheaf_flush_unused(s, to_flush);
+ empty = to_flush;
+ goto got_empty;
+ }
+
+ /*
+ * We could not replace full sheaf because barn had no empty
+ * sheaves. We can still allocate it and put the full sheaf in
+ * __pcs_install_empty_sheaf(), but if we fail to allocate it,
+ * make sure to count the fail.
+ */
+ put_fail = true;
+
+alloc_empty:
+ local_unlock(&s->cpu_sheaves->lock);
+
+ empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+ if (empty)
+ goto got_empty;
+
+ if (put_fail)
+ stat(s, BARN_PUT_FAIL);
+
+ if (!sheaf_flush_main(s))
+ return NULL;
+
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return NULL;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ /*
+ * we flushed the main sheaf so it should be empty now,
+ * but in case we got preempted or migrated, we need to
+ * check again
+ */
+ if (pcs->main->size == s->sheaf_capacity)
+ goto restart;
+
+ return pcs;
+
+got_empty:
+ if (!local_trylock(&s->cpu_sheaves->lock)) {
+ barn_put_empty_sheaf(barn, empty);
+ return NULL;
+ }
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+ __pcs_install_empty_sheaf(s, pcs, empty);
+
+ return pcs;
+}
+
+/*
+ * Free an object to the percpu sheaves.
+ * The object is expected to have passed slab_free_hook() already.
+ */
+static __fastpath_inline
+bool free_to_pcs(struct kmem_cache *s, void *object)
+{
+ struct slub_percpu_sheaves *pcs;
+
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return false;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(pcs->main->size == s->sheaf_capacity)) {
+
+ pcs = __pcs_replace_full_main(s, pcs);
+ if (unlikely(!pcs))
+ return false;
+ }
+
+ pcs->main->objects[pcs->main->size++] = object;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat(s, FREE_PCS);
+
+ return true;
+}
+
+static void rcu_free_sheaf(struct rcu_head *head)
+{
+ struct slab_sheaf *sheaf;
+ struct node_barn *barn;
+ struct kmem_cache *s;
+
+ sheaf = container_of(head, struct slab_sheaf, rcu_head);
+
+ s = sheaf->cache;
+
+ /*
+ * This may remove some objects due to slab_free_hook() returning false,
+ * so that the sheaf might no longer be completely full. But it's easier
+ * to handle it as full (unless it became completely empty), as the code
+	 * handles it fine. The only downside is that the sheaf will serve fewer
+	 * allocations when reused. This only happens due to debugging, which is a
+ * performance hit anyway.
+ */
+ __rcu_free_sheaf_prepare(s, sheaf);
+
+ barn = get_node(s, sheaf->node)->barn;
+
+ /* due to slab_free_hook() */
+ if (unlikely(sheaf->size == 0))
+ goto empty;
+
+ /*
+ * Checking nr_full/nr_empty outside lock avoids contention in case the
+ * barn is at the respective limit. Due to the race we might go over the
+ * limit but that should be rare and harmless.
+ */
+
+ if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
+ stat(s, BARN_PUT);
+ barn_put_full_sheaf(barn, sheaf);
+ return;
+ }
+
+ stat(s, BARN_PUT_FAIL);
+ sheaf_flush_unused(s, sheaf);
+
+empty:
+ if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
+ barn_put_empty_sheaf(barn, sheaf);
+ return;
+ }
+
+ free_empty_sheaf(s, sheaf);
+}
+
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *rcu_sheaf;
+
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ goto fail;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(!pcs->rcu_free)) {
+
+ struct slab_sheaf *empty;
+ struct node_barn *barn;
+
+ if (pcs->spare && pcs->spare->size == 0) {
+ pcs->rcu_free = pcs->spare;
+ pcs->spare = NULL;
+ goto do_free;
+ }
+
+ barn = get_barn(s);
+
+ empty = barn_get_empty_sheaf(barn);
+
+ if (empty) {
+ pcs->rcu_free = empty;
+ goto do_free;
+ }
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+
+ if (!empty)
+ goto fail;
+
+ if (!local_trylock(&s->cpu_sheaves->lock)) {
+ barn_put_empty_sheaf(barn, empty);
+ goto fail;
+ }
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(pcs->rcu_free))
+ barn_put_empty_sheaf(barn, empty);
+ else
+ pcs->rcu_free = empty;
+ }
+
+do_free:
+
+ rcu_sheaf = pcs->rcu_free;
+
+ /*
+ * Since we flush immediately when size reaches capacity, we never reach
+ * this with size already at capacity, so no OOB write is possible.
+ */
+ rcu_sheaf->objects[rcu_sheaf->size++] = obj;
+
+ if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
+ rcu_sheaf = NULL;
+ } else {
+ pcs->rcu_free = NULL;
+ rcu_sheaf->node = numa_mem_id();
+ }
+
+ /*
+ * we flush before local_unlock to make sure a racing
+ * flush_all_rcu_sheaves() doesn't miss this sheaf
+ */
+ if (rcu_sheaf)
+ call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat(s, FREE_RCU_SHEAF);
+ return true;
+
+fail:
+ stat(s, FREE_RCU_SHEAF_FAIL);
+ return false;
+}
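For callers nothing changes: a plain kfree_rcu() is what can end up batched into the per-cpu rcu_free sheaf above when the object's cache has sheaves enabled (as the flush_all_rcu_sheaves() / kvfree_rcu_barrier() interaction earlier suggests). A sketch with an assumed structure:

struct my_node {
	struct rcu_head rcu;
	unsigned long key;
};

static void my_node_release(struct my_node *node)
{
	/* may be queued via __kfree_rcu_sheaf() if node's cache has cpu_sheaves */
	kfree_rcu(node, rcu);
}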
+
+/*
+ * Bulk free objects to the percpu sheaves.
+ * Unlike free_to_pcs() this includes the calls to all necessary hooks
+ * and the fallback to freeing to slab pages.
+ */
+static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *main, *empty;
+ bool init = slab_want_init_on_free(s);
+ unsigned int batch, i = 0;
+ struct node_barn *barn;
+ void *remote_objects[PCS_BATCH_MAX];
+ unsigned int remote_nr = 0;
+ int node = numa_mem_id();
+
+next_remote_batch:
+ while (i < size) {
+ struct slab *slab = virt_to_slab(p[i]);
+
+ memcg_slab_free_hook(s, slab, p + i, 1);
+ alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+ if (unlikely(!slab_free_hook(s, p[i], init, false))) {
+ p[i] = p[--size];
+ if (!size)
+ goto flush_remote;
+ continue;
+ }
+
+ if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
+ remote_objects[remote_nr] = p[i];
+ p[i] = p[--size];
+ if (++remote_nr >= PCS_BATCH_MAX)
+ goto flush_remote;
+ continue;
+ }
+
+ i++;
+ }
+
+next_batch:
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ goto fallback;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (likely(pcs->main->size < s->sheaf_capacity))
+ goto do_free;
+
+ barn = get_barn(s);
+
+ if (!pcs->spare) {
+ empty = barn_get_empty_sheaf(barn);
+ if (!empty)
+ goto no_empty;
+
+ pcs->spare = pcs->main;
+ pcs->main = empty;
+ goto do_free;
+ }
+
+ if (pcs->spare->size < s->sheaf_capacity) {
+ swap(pcs->main, pcs->spare);
+ goto do_free;
+ }
+
+ empty = barn_replace_full_sheaf(barn, pcs->main);
+ if (IS_ERR(empty)) {
+ stat(s, BARN_PUT_FAIL);
+ goto no_empty;
+ }
+
+ stat(s, BARN_PUT);
+ pcs->main = empty;
+
+do_free:
+ main = pcs->main;
+ batch = min(size, s->sheaf_capacity - main->size);
+
+ memcpy(main->objects + main->size, p, batch * sizeof(void *));
+ main->size += batch;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat_add(s, FREE_PCS, batch);
+
+ if (batch < size) {
+ p += batch;
+ size -= batch;
+ goto next_batch;
+ }
+
+ return;
+
+no_empty:
+ local_unlock(&s->cpu_sheaves->lock);
+
+ /*
+ * if we depleted all empty sheaves in the barn or there are too
+ * many full sheaves, free the rest to slab pages
+ */
+fallback:
+ __kmem_cache_free_bulk(s, size, p);
+
+flush_remote:
+ if (remote_nr) {
+ __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
+ if (i < size) {
+ remote_nr = 0;
+ goto next_remote_batch;
+ }
+ }
+}
+
+struct defer_free {
+ struct llist_head objects;
+ struct llist_head slabs;
+ struct irq_work work;
+};
+
+static void free_deferred_objects(struct irq_work *work);
+
+static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
+ .objects = LLIST_HEAD_INIT(objects),
+ .slabs = LLIST_HEAD_INIT(slabs),
+ .work = IRQ_WORK_INIT(free_deferred_objects),
+};
+
+/*
+ * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
+ * to take sleeping spin_locks from __slab_free() and deactivate_slab().
+ * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
+ */
+static void free_deferred_objects(struct irq_work *work)
+{
+ struct defer_free *df = container_of(work, struct defer_free, work);
+ struct llist_head *objs = &df->objects;
+ struct llist_head *slabs = &df->slabs;
+ struct llist_node *llnode, *pos, *t;
+
+ if (llist_empty(objs) && llist_empty(slabs))
+ return;
+
+ llnode = llist_del_all(objs);
+ llist_for_each_safe(pos, t, llnode) {
+ struct kmem_cache *s;
+ struct slab *slab;
+ void *x = pos;
+
+ slab = virt_to_slab(x);
+ s = slab->slab_cache;
+
+ /*
+ * We used freepointer in 'x' to link 'x' into df->objects.
+ * Clear it to NULL to avoid false positive detection
+ * of "Freepointer corruption".
+ */
+ *(void **)x = NULL;
+
+ /* Point 'x' back to the beginning of allocated object */
+ x -= s->offset;
+ __slab_free(s, slab, x, x, 1, _THIS_IP_);
+ }
+
+ llnode = llist_del_all(slabs);
+ llist_for_each_safe(pos, t, llnode) {
+ struct slab *slab = container_of(pos, struct slab, llnode);
+
+#ifdef CONFIG_SLUB_TINY
+ discard_slab(slab->slab_cache, slab);
+#else
+ deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
+#endif
+ }
+}
+
+static void defer_free(struct kmem_cache *s, void *head)
+{
+ struct defer_free *df = this_cpu_ptr(&defer_free_objects);
+
+ if (llist_add(head + s->offset, &df->objects))
+ irq_work_queue(&df->work);
+}
+
+static void defer_deactivate_slab(struct slab *slab, void *flush_freelist)
+{
+ struct defer_free *df = this_cpu_ptr(&defer_free_objects);
+
+ slab->flush_freelist = flush_freelist;
+ if (llist_add(&slab->llnode, &df->slabs))
+ irq_work_queue(&df->work);
+}
+
+void defer_free_barrier(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
+}
+
#ifndef CONFIG_SLUB_TINY
/*
* Fastpath with forced inlining to produce a kfree and kmem_cache_free that
@@ -4611,6 +6474,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
struct slab *slab, void *head, void *tail,
int cnt, unsigned long addr)
{
+ /* cnt == 0 signals that it's called from kfree_nolock() */
+ bool allow_spin = cnt;
struct kmem_cache_cpu *c;
unsigned long tid;
void **freelist;
@@ -4629,10 +6494,29 @@ redo:
barrier();
if (unlikely(slab != c->slab)) {
- __slab_free(s, slab, head, tail, cnt, addr);
+ if (unlikely(!allow_spin)) {
+ /*
+ * __slab_free() can locklessly cmpxchg16 into a slab,
+ * but then it might need to take spin_lock or local_lock
+ * in put_cpu_partial() for further processing.
+ * Avoid the complexity and simply add to a deferred list.
+ */
+ defer_free(s, head);
+ } else {
+ __slab_free(s, slab, head, tail, cnt, addr);
+ }
return;
}
+ if (unlikely(!allow_spin)) {
+ if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
+ local_lock_is_locked(&s->cpu_slab->lock)) {
+ defer_free(s, head);
+ return;
+ }
+ cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */
+ }
+
if (USE_LOCKLESS_FAST_PATH()) {
freelist = READ_ONCE(c->freelist);
@@ -4643,11 +6527,13 @@ redo:
goto redo;
}
} else {
+ __maybe_unused unsigned long flags = 0;
+
/* Update the free list under the local lock */
- local_lock(&s->cpu_slab->lock);
+ local_lock_cpu_slab(s, flags);
c = this_cpu_ptr(s->cpu_slab);
if (unlikely(slab != c->slab)) {
- local_unlock(&s->cpu_slab->lock);
+ local_unlock_cpu_slab(s, flags);
goto redo;
}
tid = c->tid;
@@ -4657,7 +6543,7 @@ redo:
c->freelist = head;
c->tid = next_tid(tid);
- local_unlock(&s->cpu_slab->lock);
+ local_unlock_cpu_slab(s, flags);
}
stat_add(s, FREE_FASTPATH, cnt);
}
@@ -4677,8 +6563,16 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
memcg_slab_free_hook(s, slab, &object, 1);
alloc_tagging_slab_free_hook(s, slab, &object, 1);
- if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
- do_slab_free(s, slab, object, object, 1, addr);
+ if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
+ return;
+
+ if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
+ slab_nid(slab) == numa_mem_id())) {
+ if (likely(free_to_pcs(s, object)))
+ return;
+ }
+
+ do_slab_free(s, slab, object, object, 1, addr);
}
#ifdef CONFIG_MEMCG
@@ -4880,8 +6774,73 @@ void kfree(const void *object)
}
EXPORT_SYMBOL(kfree);
+/*
+ * Can be called while holding raw_spinlock_t or from IRQ and NMI,
+ * but ONLY for objects allocated by kmalloc_nolock().
+ * Debug checks (like kmemleak and kfence) were skipped on allocation,
+ * hence
+ * obj = kmalloc(); kfree_nolock(obj);
+ * will miss kmemleak/kfence bookkeeping and will cause false positives.
+ * large_kmalloc is not supported either.
+ */
+void kfree_nolock(const void *object)
+{
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+ void *x = (void *)object;
+
+ if (unlikely(ZERO_OR_NULL_PTR(object)))
+ return;
+
+ folio = virt_to_folio(object);
+ if (unlikely(!folio_test_slab(folio))) {
+ WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+
+ memcg_slab_free_hook(s, slab, &x, 1);
+ alloc_tagging_slab_free_hook(s, slab, &x, 1);
+ /*
+ * Unlike slab_free() do NOT call the following:
+ * kmemleak_free_recursive(x, s->flags);
+ * debug_check_no_locks_freed(x, s->object_size);
+ * debug_check_no_obj_freed(x, s->object_size);
+ * __kcsan_check_access(x, s->object_size, ..);
+ * kfence_free(x);
+	 * since they take spinlocks or are not safe from any context.
+ */
+ kmsan_slab_free(s, x);
+ /*
+ * If KASAN finds a kernel bug it will do kasan_report_invalid_free()
+ * which will call raw_spin_lock_irqsave() which is technically
+	 * unsafe from NMI, but take the chance and report the kernel bug.
+ * The sequence of
+ * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI
+ * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU
+ * is double buggy and deserves to deadlock.
+ */
+ if (kasan_slab_pre_free(s, x))
+ return;
+ /*
+ * memcg, kasan_slab_pre_free are done for 'x'.
+ * The only thing left is kasan_poison without quarantine,
+ * since kasan quarantine takes locks and not supported from NMI.
+ */
+ kasan_slab_free(s, x, false, false, /* skip quarantine */true);
+#ifndef CONFIG_SLUB_TINY
+ do_slab_free(s, slab, x, x, 0, _RET_IP_);
+#else
+ defer_free(s, x);
+#endif
+}
+EXPORT_SYMBOL_GPL(kfree_nolock);
+
static __always_inline __realloc_size(2) void *
-__do_krealloc(const void *p, size_t new_size, gfp_t flags)
+__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
{
void *ret;
size_t ks = 0;
@@ -4895,6 +6854,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
if (!kasan_check_byte(p))
return NULL;
+ /*
+	 * If reallocation is not necessary (e.g. the new size is less
+ * than the current allocated size), the current allocation will be
+ * preserved unless __GFP_THISNODE is set. In the latter case a new
+ * allocation on the requested node will be attempted.
+ */
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(virt_to_page(p)))
+ goto alloc_new;
+
if (is_kfence_address(p)) {
ks = orig_size = kfence_ksize(p);
} else {
@@ -4917,6 +6886,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
if (new_size > ks)
goto alloc_new;
+ /* If the old object doesn't satisfy the new alignment, allocate a new one */
+ if (!IS_ALIGNED((unsigned long)p, align))
+ goto alloc_new;
+
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
@@ -4939,7 +6912,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
return (void *)p;
alloc_new:
- ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
+ ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
if (ret && p) {
/* Disable KASAN checks as the object's redzone is accessed. */
kasan_disable_current();
@@ -4951,14 +6924,19 @@ alloc_new:
}
/**
- * krealloc - reallocate memory. The contents will remain unchanged.
+ * krealloc_node_align - reallocate memory. The contents will remain unchanged.
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
+ * @align: desired alignment.
* @flags: the type of memory to allocate.
+ * @nid: NUMA node or NUMA_NO_NODE
*
* If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
* is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
@@ -4983,7 +6961,8 @@ alloc_new:
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
+void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
+ gfp_t flags, int nid)
{
void *ret;
@@ -4992,13 +6971,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
return ZERO_SIZE_PTR;
}
- ret = __do_krealloc(p, new_size, flags);
+ ret = __do_krealloc(p, new_size, align, flags, nid);
if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
kfree(p);
return ret;
}
-EXPORT_SYMBOL(krealloc_noprof);
+EXPORT_SYMBOL(krealloc_node_align_noprof);
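A small sketch of the extended interface; the helper, its parameters and the krealloc_node_align() name (assumed to be the usual wrapper of the _noprof function) are illustrative:

static void *my_grow_buffer(void *buf, size_t new_len, int nid)
{
	/*
	 * Keep the buffer cacheline-aligned and, because of __GFP_THISNODE,
	 * move it to @nid even if the old allocation is already big enough.
	 */
	return krealloc_node_align(buf, new_len, SMP_CACHE_BYTES,
				   GFP_KERNEL | __GFP_THISNODE, nid);
}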
static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
{
@@ -5029,9 +7008,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
* failure, fall back to non-contiguous (vmalloc) allocation.
* @size: size of the request.
* @b: which set of kmalloc buckets to allocate from.
+ * @align: desired alignment.
* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
* @node: numa node to allocate from
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
@@ -5041,7 +7024,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
*
 * Return: pointer to the allocated memory or %NULL in case of failure
*/
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
+ gfp_t flags, int node)
{
void *ret;
@@ -5071,7 +7055,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
* about the resulting pointer, and cannot play
* protection games.
*/
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
node, __builtin_return_address(0));
}
@@ -5115,14 +7099,19 @@ void kvfree_sensitive(const void *addr, size_t len)
EXPORT_SYMBOL(kvfree_sensitive);
/**
- * kvrealloc - reallocate memory; contents remain unchanged
+ * kvrealloc_node_align - reallocate memory; contents remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
+ * @align: desired alignment
* @flags: the flags for the page level allocator
+ * @nid: NUMA node id
*
* If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
* and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
@@ -5136,17 +7125,18 @@ EXPORT_SYMBOL(kvfree_sensitive);
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
void *n;
if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
+ return vrealloc_node_align_noprof(p, size, align, flags, nid);
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
if (!n) {
/* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
+ n = kvmalloc_node_align_noprof(size, align, flags, nid);
if (!n)
return NULL;
@@ -5162,7 +7152,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
return n;
}
-EXPORT_SYMBOL(kvrealloc_noprof);
+EXPORT_SYMBOL(kvrealloc_node_align_noprof);
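/*
 * Illustrative sketch, not part of this patch: kvrealloc_node_align() (the
 * assumed alloc_hooks() wrapper of the _noprof function above) keeps the
 * kmalloc-first, vmalloc-fallback behaviour while honouring the requested
 * alignment and node; on failure the old buffer is left untouched.
 * "resize_ptr_table" and its parameters are hypothetical.
 */
static int resize_ptr_table(void **tablep, size_t nr_entries, int nid)
{
	void *new;

	new = kvrealloc_node_align(*tablep, nr_entries * sizeof(void *),
				   SMP_CACHE_BYTES, GFP_KERNEL, nid);
	if (!new)
		return -ENOMEM;	/* *tablep is still valid and must be kvfree()d */

	*tablep = new;
	return 0;
}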
struct detached_freelist {
struct slab *slab;
@@ -5273,6 +7263,15 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (!size)
return;
+ /*
+	 * freeing to sheaves is incompatible with the detached freelist, so
+	 * once we go that way, we have to do everything differently
+ */
+ if (s && s->cpu_sheaves) {
+ free_to_pcs_bulk(s, size, p);
+ return;
+ }
+
do {
struct detached_freelist df;
@@ -5391,7 +7390,7 @@ error:
int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
- int i;
+ unsigned int i = 0;
if (!size)
return 0;
@@ -5400,9 +7399,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
if (unlikely(!s))
return 0;
- i = __kmem_cache_alloc_bulk(s, flags, size, p);
- if (unlikely(i == 0))
- return 0;
+ if (s->cpu_sheaves)
+ i = alloc_from_pcs_bulk(s, size, p);
+
+ if (i < size) {
+ /*
+		 * If we ran out of memory, don't bother freeing back to
+		 * the percpu sheaves; we have bigger problems.
+ */
+ if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
+ if (i > 0)
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
+ }
+ }
/*
* memcg and kmem_cache debug support and memory initialization.
@@ -5412,11 +7422,11 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
slab_want_init_on_alloc(flags, s), s->object_size))) {
return 0;
}
- return i;
+
+ return size;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
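/*
 * Illustrative sketch, not part of this patch: the bulk API is unchanged for
 * callers; whether objects are served from percpu sheaves or the regular
 * slow path is decided internally above, and the function still returns
 * either the full count or 0. "cache"/"objs" are hypothetical names.
 */
static int fill_and_release(struct kmem_cache *cache, void **objs, size_t nr)
{
	if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, nr, objs))
		return -ENOMEM;

	/* ... use the nr objects ... */

	/* Frees go back through sheaves when the cache has them. */
	kmem_cache_free_bulk(cache, nr, objs);
	return 0;
}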
-
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -5550,7 +7560,7 @@ static inline int calculate_order(unsigned int size)
}
static void
-init_kmem_cache_node(struct kmem_cache_node *n)
+init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
{
n->nr_partial = 0;
spin_lock_init(&n->list_lock);
@@ -5560,6 +7570,9 @@ init_kmem_cache_node(struct kmem_cache_node *n)
atomic_long_set(&n->total_objects, 0);
INIT_LIST_HEAD(&n->full);
#endif
+ n->barn = barn;
+ if (barn)
+ barn_init(barn);
}
#ifndef CONFIG_SLUB_TINY
@@ -5590,6 +7603,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
}
#endif /* CONFIG_SLUB_TINY */
+static int init_percpu_sheaves(struct kmem_cache *s)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct slub_percpu_sheaves *pcs;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ local_trylock_init(&pcs->lock);
+
+ pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
+
+ if (!pcs->main)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
static struct kmem_cache *kmem_cache_node;
/*
@@ -5625,7 +7658,7 @@ static void early_kmem_cache_node_alloc(int node)
slab->freelist = get_freepointer(kmem_cache_node, n);
slab->inuse = 1;
kmem_cache_node->node[node] = n;
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, NULL);
inc_slabs_node(kmem_cache_node, node, slab->objects);
/*
@@ -5641,6 +7674,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
struct kmem_cache_node *n;
for_each_kmem_cache_node(s, node, n) {
+ if (n->barn) {
+ WARN_ON(n->barn->nr_full);
+ WARN_ON(n->barn->nr_empty);
+ kfree(n->barn);
+ n->barn = NULL;
+ }
+
s->node[node] = NULL;
kmem_cache_free(kmem_cache_node, n);
}
@@ -5649,7 +7689,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
void __kmem_cache_release(struct kmem_cache *s)
{
cache_random_seq_destroy(s);
+ if (s->cpu_sheaves)
+ pcs_destroy(s);
#ifndef CONFIG_SLUB_TINY
+#ifdef CONFIG_PREEMPT_RT
+ lockdep_unregister_key(&s->lock_key);
+#endif
free_percpu(s->cpu_slab);
#endif
free_kmem_cache_nodes(s);
@@ -5661,20 +7706,29 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
for_each_node_mask(node, slab_nodes) {
struct kmem_cache_node *n;
+ struct node_barn *barn = NULL;
if (slab_state == DOWN) {
early_kmem_cache_node_alloc(node);
continue;
}
+
+ if (s->cpu_sheaves) {
+ barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
+
+ if (!barn)
+ return 0;
+ }
+
n = kmem_cache_alloc_node(kmem_cache_node,
GFP_KERNEL, node);
-
if (!n) {
- free_kmem_cache_nodes(s);
+ kfree(barn);
return 0;
}
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, barn);
+
s->node[node] = n;
}
return 1;
@@ -5929,8 +7983,15 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
struct kmem_cache_node *n;
flush_all_cpus_locked(s);
+
+ /* we might have rcu sheaves in flight */
+ if (s->cpu_sheaves)
+ rcu_barrier();
+
/* Attempt to free all objects */
for_each_kmem_cache_node(s, node, n) {
+ if (n->barn)
+ barn_shrink(s, n->barn);
free_partial(s, n);
if (n->nr_partial || node_nr_slabs(n))
return 1;
@@ -6134,6 +8195,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
INIT_LIST_HEAD(promote + i);
+ if (n->barn)
+ barn_shrink(s, n->barn);
+
spin_lock_irqsave(&n->list_lock, flags);
/*
@@ -6213,12 +8277,24 @@ static int slab_mem_going_online_callback(int nid)
*/
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
+ struct node_barn *barn = NULL;
+
/*
* The structure may already exist if the node was previously
* onlined and offlined.
*/
if (get_node(s, nid))
continue;
+
+ if (s->cpu_sheaves) {
+ barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
+
+ if (!barn) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
/*
* XXX: kmem_cache_alloc_node will fallback to other nodes
* since memory is not yet available from the node that
@@ -6226,10 +8302,13 @@ static int slab_mem_going_online_callback(int nid)
*/
n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
if (!n) {
+ kfree(barn);
ret = -ENOMEM;
goto out;
}
- init_kmem_cache_node(n);
+
+ init_kmem_cache_node(n, barn);
+
s->node[nid] = n;
}
/*
@@ -6442,6 +8521,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
set_cpu_partial(s);
+ if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
+ && !(s->flags & SLAB_DEBUG_FLAGS)) {
+ s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
+ if (!s->cpu_sheaves) {
+ err = -ENOMEM;
+ goto out;
+ }
+		/* TODO: increase capacity to grow slab_sheaf up to next kmalloc size? */
+ s->sheaf_capacity = args->sheaf_capacity;
+ }
+
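/*
 * Illustrative sketch, not part of this patch: a cache opting in to percpu
 * sheaves through the sheaf_capacity argument consumed above. The field
 * name in struct kmem_cache_args mirrors args->sheaf_capacity; "foo" and
 * its cache are hypothetical.
 */
struct foo { unsigned long cookie; };
static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	struct kmem_cache_args args = {
		.sheaf_capacity = 32,	/* objects kept in each per-CPU sheaf */
	};

	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), &args, 0);
	return foo_cachep ? 0 : -ENOMEM;
}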
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
@@ -6458,6 +8548,12 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
if (!alloc_kmem_cache_cpus(s))
goto out;
+ if (s->cpu_sheaves) {
+ err = init_percpu_sheaves(s);
+ if (err)
+ goto out;
+ }
+
err = 0;
/* Mutex is not taken during early boot */
@@ -6499,6 +8595,11 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab,
void *p;
void *addr = slab_address(slab);
+ if (!validate_slab_ptr(slab)) {
+ slab_err(s, slab, "Not a valid slab page");
+ return;
+ }
+
if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
return;
@@ -6910,6 +9011,12 @@ static ssize_t order_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(order);
+static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", s->sheaf_capacity);
+}
+SLAB_ATTR_RO(sheaf_capacity);
+
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
return sysfs_emit(buf, "%lu\n", s->min_partial);
@@ -7257,8 +9364,12 @@ static ssize_t text##_store(struct kmem_cache *s, \
} \
SLAB_ATTR(text); \
+STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_PCS, free_cpu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -7283,6 +9394,19 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
+STAT_ATTR(SHEAF_FLUSH, sheaf_flush);
+STAT_ATTR(SHEAF_REFILL, sheaf_refill);
+STAT_ATTR(SHEAF_ALLOC, sheaf_alloc);
+STAT_ATTR(SHEAF_FREE, sheaf_free);
+STAT_ATTR(BARN_GET, barn_get);
+STAT_ATTR(BARN_GET_FAIL, barn_get_fail);
+STAT_ATTR(BARN_PUT, barn_put);
+STAT_ATTR(BARN_PUT_FAIL, barn_put_fail);
+STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast);
+STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow);
+STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize);
+STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast);
+STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow);
#endif /* CONFIG_SLUB_STATS */
#ifdef CONFIG_KFENCE
@@ -7313,6 +9437,7 @@ static struct attribute *slab_attrs[] = {
&object_size_attr.attr,
&objs_per_slab_attr.attr,
&order_attr.attr,
+ &sheaf_capacity_attr.attr,
&min_partial_attr.attr,
&cpu_partial_attr.attr,
&objects_partial_attr.attr,
@@ -7344,8 +9469,12 @@ static struct attribute *slab_attrs[] = {
&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
+ &alloc_cpu_sheaf_attr.attr,
&alloc_fastpath_attr.attr,
&alloc_slowpath_attr.attr,
+ &free_cpu_sheaf_attr.attr,
+ &free_rcu_sheaf_attr.attr,
+ &free_rcu_sheaf_fail_attr.attr,
&free_fastpath_attr.attr,
&free_slowpath_attr.attr,
&free_frozen_attr.attr,
@@ -7370,6 +9499,19 @@ static struct attribute *slab_attrs[] = {
&cpu_partial_free_attr.attr,
&cpu_partial_node_attr.attr,
&cpu_partial_drain_attr.attr,
+ &sheaf_flush_attr.attr,
+ &sheaf_refill_attr.attr,
+ &sheaf_alloc_attr.attr,
+ &sheaf_free_attr.attr,
+ &barn_get_attr.attr,
+ &barn_get_fail_attr.attr,
+ &barn_put_attr.attr,
+ &barn_put_fail_attr.attr,
+ &sheaf_prefill_fast_attr.attr,
+ &sheaf_prefill_slow_attr.attr,
+ &sheaf_prefill_oversize_attr.attr,
+ &sheaf_return_fast_attr.attr,
+ &sheaf_return_slow_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
@@ -7711,15 +9853,12 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
return NULL;
}
-static int cmp_loc_by_count(const void *a, const void *b, const void *data)
+static int cmp_loc_by_count(const void *a, const void *b)
{
struct location *loc1 = (struct location *)a;
struct location *loc2 = (struct location *)b;
- if (loc1->count > loc2->count)
- return -1;
- else
- return 1;
+ return cmp_int(loc2->count, loc1->count);
}
static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
@@ -7781,8 +9920,8 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
}
/* Sort locations by count */
- sort_r(t->loc, t->count, sizeof(struct location),
- cmp_loc_by_count, NULL, NULL);
+ sort(t->loc, t->count, sizeof(struct location),
+ cmp_loc_by_count, NULL);
bitmap_free(obj_map);
return 0;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index fd2ab5118e13..dbd8daccade2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,9 +27,9 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/pgalloc.h>
#include <asm/dma.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
if (!p)
return NULL;
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
return p4d;
}
@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
- pgd_populate(&init_mm, pgd, p);
+ pgd_populate_kernel(addr, pgd, p);
}
return pgd;
}
@@ -578,11 +578,6 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
if (r < 0)
return NULL;
- if (system_state == SYSTEM_BOOTING)
- memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
- else
- memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
-
return pfn_to_page(pfn);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3c012cf83cc2..17c50a6415c2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif
-int page_to_nid(const struct page *page)
+int memdesc_nid(memdesc_flags_t mdf)
{
- return section_to_node_table[page_to_section(page)];
+ return section_to_node_table[memdesc_section(mdf)];
}
-EXPORT_SYMBOL(page_to_nid);
+EXPORT_SYMBOL(memdesc_nid);
static void set_section_nid(unsigned long section_nr, int nid)
{
@@ -454,9 +454,6 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
*/
sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
sparsemap_buf_end = sparsemap_buf + size;
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
- memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
-#endif
}
static void __init sparse_buffer_fini(void)
@@ -567,6 +564,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
sparse_buffer_fini();
goto failed;
}
+ memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
+ PAGE_SIZE));
sparse_init_early_section(nid, map, pnum, 0);
}
}
@@ -680,7 +679,6 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
- memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
@@ -856,10 +854,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* The memmap of early sections is always fully populated. See
* section_activate() and pfn_valid() .
*/
- if (!section_is_early)
+ if (!section_is_early) {
+ memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
depopulate_section_memmap(pfn, nr_pages, altmap);
- else if (memmap)
+ } else if (memmap) {
+ memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
+ PAGE_SIZE)));
free_map_bootmem(memmap);
+ }
if (empty)
ms->section_mem_map = (unsigned long)NULL;
@@ -904,6 +906,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
section_deactivate(pfn, nr_pages, altmap);
return ERR_PTR(-ENOMEM);
}
+ memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
return memmap;
}
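/*
 * Worked example (illustrative, not part of this patch): on a typical
 * x86-64 configuration a section is 128 MiB, i.e. PAGES_PER_SECTION is
 * 32768, and sizeof(struct page) is 64 bytes. The section's memmap is
 * therefore 32768 * 64 = 2 MiB, so the accounting above adds
 * DIV_ROUND_UP(32768 * 64, 4096) = 512 memmap pages on activation and
 * subtracts the same 512 pages on deactivation.
 */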
diff --git a/mm/swap.c b/mm/swap.c
index 3632dd061beb..2260dcd2775e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
+ /* block memcg migration while the folio moves between lru */
+ if (move_fn != lru_add && !folio_test_clear_lru(folio))
+ continue;
+
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
}
static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
- struct folio *folio, move_fn_t move_fn,
- bool on_lru, bool disable_irq)
+ struct folio *folio, move_fn_t move_fn, bool disable_irq)
{
unsigned long flags;
- if (on_lru && !folio_test_clear_lru(folio))
- return;
-
folio_get(folio);
if (disable_irq)
@@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
else
local_lock(&cpu_fbatches.lock);
- if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
- lru_cache_disabled())
+ if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
if (disable_irq)
@@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
local_unlock(&cpu_fbatches.lock);
}
-#define folio_batch_add_and_move(folio, op, on_lru) \
- __folio_batch_add_and_move( \
- &cpu_fbatches.op, \
- folio, \
- op, \
- on_lru, \
- offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \
+#define folio_batch_add_and_move(folio, op) \
+ __folio_batch_add_and_move( \
+ &cpu_fbatches.op, \
+ folio, \
+ op, \
+ offsetof(struct cpu_fbatches, op) >= \
+ offsetof(struct cpu_fbatches, lock_irq) \
)
static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
@@ -231,10 +231,10 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
void folio_rotate_reclaimable(struct folio *folio)
{
if (folio_test_locked(folio) || folio_test_dirty(folio) ||
- folio_test_unevictable(folio))
+ folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_move_tail, true);
+ folio_batch_add_and_move(folio, lru_move_tail);
}
void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
@@ -328,10 +328,11 @@ static void folio_activate_drain(int cpu)
void folio_activate(struct folio *folio)
{
- if (folio_test_active(folio) || folio_test_unevictable(folio))
+ if (folio_test_active(folio) || folio_test_unevictable(folio) ||
+ !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_activate, true);
+ folio_batch_add_and_move(folio, lru_activate);
}
#else
@@ -387,14 +388,14 @@ static void __lru_cache_activate_folio(struct folio *folio)
static void lru_gen_inc_refs(struct folio *folio)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
if (folio_test_unevictable(folio))
return;
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return;
}
@@ -406,7 +407,7 @@ static void lru_gen_inc_refs(struct folio *folio)
}
new_flags = old_flags + BIT(LRU_REFS_PGOFF);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
}
static bool lru_gen_clear_refs(struct folio *folio)
@@ -418,7 +419,7 @@ static bool lru_gen_clear_refs(struct folio *folio)
if (gen < 0)
return true;
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
lrugen = &folio_lruvec(folio)->lrugen;
/* whether can do without shuffling under the LRU lock */
@@ -507,7 +508,7 @@ void folio_add_lru(struct folio *folio)
lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
folio_set_active(folio);
- folio_batch_add_and_move(folio, lru_add, false);
+ folio_batch_add_and_move(folio, lru_add);
}
EXPORT_SYMBOL(folio_add_lru);
@@ -685,13 +686,13 @@ void lru_add_drain_cpu(int cpu)
void deactivate_file_folio(struct folio *folio)
{
/* Deactivating an unevictable folio will not accelerate reclaim */
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
if (lru_gen_enabled() && lru_gen_clear_refs(folio))
return;
- folio_batch_add_and_move(folio, lru_deactivate_file, true);
+ folio_batch_add_and_move(folio, lru_deactivate_file);
}
/*
@@ -704,13 +705,13 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio))
return;
- folio_batch_add_and_move(folio, lru_deactivate, true);
+ folio_batch_add_and_move(folio, lru_deactivate);
}
/**
@@ -723,10 +724,11 @@ void folio_deactivate(struct folio *folio)
void folio_mark_lazyfree(struct folio *folio)
{
if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
+ !folio_test_lru(folio) ||
folio_test_swapcache(folio) || folio_test_unevictable(folio))
return;
- folio_batch_add_and_move(folio, lru_lazyfree, true);
+ folio_batch_add_and_move(folio, lru_lazyfree);
}
void lru_add_drain(void)
@@ -832,6 +834,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
*/
this_gen = smp_load_acquire(&lru_drain_gen);
+ /* It helps everyone if we do our own local drain immediately. */
+ lru_add_drain();
+
mutex_lock(&lock);
/*
@@ -1096,7 +1101,7 @@ static const struct ctl_table swap_sysctl_table[] = {
*/
void __init swap_setup(void)
{
- unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
+ unsigned long megs = PAGES_TO_MB(totalram_pages());
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
diff --git a/mm/swap.h b/mm/swap.h
index 911ad5ff0f89..8d8efdf1297a 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -2,15 +2,187 @@
#ifndef _MM_SWAP_H
#define _MM_SWAP_H
+#include <linux/atomic.h> /* for atomic_long_t */
struct mempolicy;
struct swap_iocb;
extern int page_cluster;
+#ifdef CONFIG_THP_SWAP
+#define SWAPFILE_CLUSTER HPAGE_PMD_NR
+#define swap_entry_order(order) (order)
+#else
+#define SWAPFILE_CLUSTER 256
+#define swap_entry_order(order) 0
+#endif
+
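/*
 * Illustrative note, not part of this patch: with 4 KiB pages a cluster
 * covers SWAPFILE_CLUSTER * PAGE_SIZE = 256 * 4 KiB = 1 MiB of swap space,
 * or 512 * 4 KiB = 2 MiB when CONFIG_THP_SWAP makes SWAPFILE_CLUSTER equal
 * to HPAGE_PMD_NR. A swap table entry is one word per slot, so on 64-bit
 * that is 2 KiB or 4 KiB of swap table per cluster respectively.
 */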
+extern struct swap_info_struct *swap_info[];
+
+/*
+ * We use this to track the usage of a cluster. A cluster is a block of swap
+ * disk space SWAPFILE_CLUSTER pages long, naturally aligned on disk. All
+ * free clusters are organized into a list; we fetch an entry from the list
+ * to get a free cluster.
+ *
+ * The flags field determines whether a cluster is free. This is
+ * protected by the cluster lock.
+ */
+struct swap_cluster_info {
+ spinlock_t lock; /*
+ * Protect swap_cluster_info fields
+ * other than list, and swap_info_struct->swap_map
+ * elements corresponding to the swap cluster.
+ */
+ u16 count;
+ u8 flags;
+ u8 order;
+ atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */
+ struct list_head list;
+};
+
+/* All on-list clusters must have a non-zero flag. */
+enum swap_cluster_flags {
+ CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
+ CLUSTER_FLAG_FREE,
+ CLUSTER_FLAG_NONFULL,
+ CLUSTER_FLAG_FRAG,
+ /* Clusters with flags above are allocatable */
+ CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
+ CLUSTER_FLAG_FULL,
+ CLUSTER_FLAG_DISCARD,
+ CLUSTER_FLAG_MAX,
+};
+
#ifdef CONFIG_SWAP
#include <linux/swapops.h> /* for swp_offset */
#include <linux/blk_types.h> /* for bio_end_io_t */
+static inline unsigned int swp_cluster_offset(swp_entry_t entry)
+{
+ return swp_offset(entry) % SWAPFILE_CLUSTER;
+}
+
+/*
+ * Callers of all helpers below must ensure the entry, type, or offset is
+ * valid, and protect the swap device with reference count or locks.
+ */
+static inline struct swap_info_struct *__swap_type_to_info(int type)
+{
+ struct swap_info_struct *si;
+
+ si = READ_ONCE(swap_info[type]); /* rcu_dereference() */
+ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
+ return si;
+}
+
+static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
+{
+ return __swap_type_to_info(swp_type(entry));
+}
+
+static inline struct swap_cluster_info *__swap_offset_to_cluster(
+ struct swap_info_struct *si, pgoff_t offset)
+{
+ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
+ VM_WARN_ON_ONCE(offset >= si->max);
+ return &si->cluster_info[offset / SWAPFILE_CLUSTER];
+}
+
+static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry)
+{
+ return __swap_offset_to_cluster(__swap_entry_to_info(entry),
+ swp_offset(entry));
+}
+
+static __always_inline struct swap_cluster_info *__swap_cluster_lock(
+ struct swap_info_struct *si, unsigned long offset, bool irq)
+{
+ struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset);
+
+ /*
+ * Nothing modifies swap cache in an IRQ context. All access to
+ * swap cache is wrapped by swap_cache_* helpers, and swap cache
+ * writeback is handled outside of IRQs. Swapin or swapout never
+ * occurs in IRQ, and neither does in-place split or replace.
+ *
+ * Besides, modifying swap cache requires synchronization with
+ * swap_map, which was never IRQ safe.
+ */
+ VM_WARN_ON_ONCE(!in_task());
+ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
+ if (irq)
+ spin_lock_irq(&ci->lock);
+ else
+ spin_lock(&ci->lock);
+ return ci;
+}
+
+/**
+ * swap_cluster_lock - Lock and return the swap cluster of given offset.
+ * @si: swap device the cluster belongs to.
+ * @offset: the swap entry offset, pointing to a valid slot.
+ *
+ * Context: The caller must ensure the offset is in the valid range and
+ * protect the swap device with reference count or locks.
+ */
+static inline struct swap_cluster_info *swap_cluster_lock(
+ struct swap_info_struct *si, unsigned long offset)
+{
+ return __swap_cluster_lock(si, offset, false);
+}
+
+static inline struct swap_cluster_info *__swap_cluster_get_and_lock(
+ const struct folio *folio, bool irq)
+{
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ return __swap_cluster_lock(__swap_entry_to_info(folio->swap),
+ swp_offset(folio->swap), irq);
+}
+
+/**
+ * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries.
+ * @folio: The folio.
+ *
+ * This locks and returns the swap cluster that contains a folio's swap
+ * entries. The swap entries of a folio are always in one single cluster.
+ * The folio has to be locked so its swap entries won't change and the
+ * cluster won't be freed.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
+ * Return: Pointer to the swap cluster.
+ */
+static inline struct swap_cluster_info *swap_cluster_get_and_lock(
+ const struct folio *folio)
+{
+ return __swap_cluster_get_and_lock(folio, false);
+}
+
+/**
+ * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries.
+ * @folio: The folio.
+ *
+ * Same as swap_cluster_get_and_lock but also disables IRQs.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
+ * Return: Pointer to the swap cluster.
+ */
+static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq(
+ const struct folio *folio)
+{
+ return __swap_cluster_get_and_lock(folio, true);
+}
+
+static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
+{
+ spin_unlock(&ci->lock);
+}
+
+static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
+{
+ spin_unlock_irq(&ci->lock);
+}
+
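/*
 * Illustrative sketch, not part of this patch: the locking pattern expected
 * by the helpers above. The caller pins the device first (e.g. with
 * get_swap_device()), then locks the cluster covering the offset;
 * "cluster_usage" is a hypothetical helper.
 */
static inline unsigned int cluster_usage(struct swap_info_struct *si,
					 unsigned long offset)
{
	struct swap_cluster_info *ci;
	unsigned int count;

	ci = swap_cluster_lock(si, offset);
	count = ci->count;	/* cluster fields are stable under ci->lock */
	swap_cluster_unlock(ci);

	return count;
}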
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
@@ -26,14 +198,11 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
/* linux/mm/swap_state.c */
-/* One swap address space for each 64M swap space */
-#define SWAP_ADDRESS_SPACE_SHIFT 14
-#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
-#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1)
-extern struct address_space *swapper_spaces[];
-#define swap_address_space(entry) \
- (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
- >> SWAP_ADDRESS_SPACE_SHIFT])
+extern struct address_space swap_space __ro_after_init;
+static inline struct address_space *swap_address_space(swp_entry_t entry)
+{
+ return &swap_space;
+}
/*
* Return the swap device position of the swap entry.
@@ -43,30 +212,52 @@ static inline loff_t swap_dev_pos(swp_entry_t entry)
return ((loff_t)swp_offset(entry)) << PAGE_SHIFT;
}
-/*
- * Return the swap cache index of the swap entry.
+/**
+ * folio_matches_swap_entry - Check if a folio matches a given swap entry.
+ * @folio: The folio.
+ * @entry: The swap entry to check against.
+ *
+ * Context: The caller should have the folio locked to ensure it's stable
+ * and nothing will move it in or out of the swap cache.
+ * Return: true or false.
*/
-static inline pgoff_t swap_cache_index(swp_entry_t entry)
+static inline bool folio_matches_swap_entry(const struct folio *folio,
+ swp_entry_t entry)
{
- BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK);
- return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;
+ swp_entry_t folio_entry = folio->swap;
+ long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ if (!folio_test_swapcache(folio))
+ return false;
+ VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio_entry.val, nr_pages), folio);
+ return folio_entry.val == round_down(entry.val, nr_pages);
}
+/*
+ * All swap cache helpers below require the caller to ensure the swap entries
+ * used are valid and stabilize the device in any of the following ways:
+ * - Hold a reference by get_swap_device(): this ensures a single entry is
+ * valid and increases the swap device's refcount.
+ * - Locking a folio in the swap cache: this ensures the folio's swap entries
+ * are valid and pinned, also implies reference to the device.
+ * - Locking anything referencing the swap entry: e.g. PTL that protects
+ * swap entries in the page table, similar to locking swap cache folio.
+ * - See the comment of get_swap_device() for more complex usage.
+ */
+struct folio *swap_cache_get_folio(swp_entry_t entry);
+void *swap_cache_get_shadow(swp_entry_t entry);
+void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow);
+void swap_cache_del_folio(struct folio *folio);
+/* Below helpers require the caller to lock and pass in the swap cluster. */
+void __swap_cache_del_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry, void *shadow);
+void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new);
+void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents);
+
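/*
 * Illustrative sketch, not part of this patch: the lookup-then-revalidate
 * pattern described above. The device is pinned, the folio is looked up
 * and locked, then the entry is re-checked before use;
 * "get_locked_swap_folio" is a hypothetical helper.
 */
static inline struct folio *get_locked_swap_folio(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct folio *folio;

	si = get_swap_device(entry);
	if (!si)
		return NULL;

	folio = swap_cache_get_folio(entry);
	put_swap_device(si);
	if (!folio)
		return NULL;

	folio_lock(folio);
	if (!folio_matches_swap_entry(folio, entry)) {
		/* Raced with swap cache removal or replacement. */
		folio_unlock(folio);
		folio_put(folio);
		return NULL;
	}
	return folio;
}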
void show_swap_cache_info(void);
-void *get_shadow_from_swap_cache(swp_entry_t entry);
-int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadowp);
-void __delete_from_swap_cache(struct folio *folio,
- swp_entry_t entry, void *shadow);
-void delete_from_swap_cache(struct folio *folio);
-void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end);
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
-struct folio *swap_cache_get_folio(swp_entry_t entry,
- struct vm_area_struct *vma, unsigned long addr);
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index);
-
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug);
@@ -77,10 +268,12 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
+void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr);
static inline unsigned int folio_swap_flags(struct folio *folio)
{
- return swp_swap_info(folio->swap)->flags;
+ return __swap_entry_to_info(folio->swap)->flags;
}
/*
@@ -91,7 +284,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio)
static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
bool *is_zeromap)
{
- struct swap_info_struct *sis = swp_swap_info(entry);
+ struct swap_info_struct *sis = __swap_entry_to_info(entry);
unsigned long start = swp_offset(entry);
unsigned long end = start + max_nr;
bool first_bit;
@@ -110,7 +303,7 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
{
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
pgoff_t offset = swp_offset(entry);
int i;
@@ -129,6 +322,37 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
#else /* CONFIG_SWAP */
struct swap_iocb;
+static inline struct swap_cluster_info *swap_cluster_lock(
+ struct swap_info_struct *si, pgoff_t offset, bool irq)
+{
+ return NULL;
+}
+
+static inline struct swap_cluster_info *swap_cluster_get_and_lock(
+ struct folio *folio)
+{
+ return NULL;
+}
+
+static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq(
+ struct folio *folio)
+{
+ return NULL;
+}
+
+static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
+{
+}
+
+static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
+{
+}
+
+static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
+{
+ return NULL;
+}
+
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
}
@@ -141,9 +365,9 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
return NULL;
}
-static inline pgoff_t swap_cache_index(swp_entry_t entry)
+static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry)
{
- return 0;
+ return false;
}
static inline void show_swap_cache_info(void)
@@ -162,6 +386,11 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
+static inline void swap_update_readahead(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+}
+
static inline int swap_writeout(struct folio *folio,
struct swap_iocb **swap_plug)
{
@@ -172,41 +401,31 @@ static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entr
{
}
-static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
- struct vm_area_struct *vma, unsigned long addr)
+static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
{
return NULL;
}
-static inline
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
-{
- return filemap_get_folio(mapping, index);
-}
-
-static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
+static inline void *swap_cache_get_shadow(swp_entry_t entry)
{
return NULL;
}
-static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp_mask, void **shadowp)
+static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow)
{
- return -1;
}
-static inline void __delete_from_swap_cache(struct folio *folio,
- swp_entry_t entry, void *shadow)
+static inline void swap_cache_del_folio(struct folio *folio)
{
}
-static inline void delete_from_swap_cache(struct folio *folio)
+static inline void __swap_cache_del_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry, void *shadow)
{
}
-static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end)
+static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new)
{
}
@@ -240,8 +459,10 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
*/
static inline pgoff_t folio_index(struct folio *folio)
{
+#ifdef CONFIG_SWAP
if (unlikely(folio_test_swapcache(folio)))
- return swap_cache_index(folio->swap);
+ return swp_offset(folio->swap);
+#endif
return folio->index;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c354435a0923..b13e9c4baa90 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -23,6 +23,7 @@
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
+#include "swap_table.h"
#include "swap.h"
/*
@@ -36,8 +37,11 @@ static const struct address_space_operations swap_aops = {
#endif
};
-struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
-static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
+/* swap_space is read-only after init; the swap cache itself is kept in the swap table */
+struct address_space swap_space __ro_after_init = {
+ .a_ops = &swap_aops,
+};
+
static bool enable_vma_readahead __read_mostly = true;
#define SWAP_RA_ORDER_CEILING 5
@@ -69,150 +73,237 @@ void show_swap_cache_info(void)
printk("Total swap = %lukB\n", K(total_swap_pages));
}
-void *get_shadow_from_swap_cache(swp_entry_t entry)
+/**
+ * swap_cache_get_folio - Looks up a folio in the swap cache.
+ * @entry: swap entry used for the lookup.
+ *
+ * A found folio will be returned unlocked and with its refcount increased.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * Return: Returns the found folio on success, NULL otherwise. The caller
+ * must lock and check if the folio still matches the swap entry before
+ * use (e.g., folio_matches_swap_entry).
+ */
+struct folio *swap_cache_get_folio(swp_entry_t entry)
{
- struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swap_cache_index(entry);
- void *shadow;
+ unsigned long swp_tb;
+ struct folio *folio;
+
+ for (;;) {
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (!swp_tb_is_folio(swp_tb))
+ return NULL;
+ folio = swp_tb_to_folio(swp_tb);
+ if (likely(folio_try_get(folio)))
+ return folio;
+ }
- shadow = xa_load(&address_space->i_pages, idx);
- if (xa_is_value(shadow))
- return shadow;
return NULL;
}
-/*
- * add_to_swap_cache resembles filemap_add_folio on swapper_space,
- * but sets SwapCache flag and 'swap' instead of mapping and index.
+/**
+ * swap_cache_get_shadow - Looks up a shadow in the swap cache.
+ * @entry: swap entry used for the lookup.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * Return: Returns either NULL or an XA_VALUE (shadow).
*/
-int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadowp)
+void *swap_cache_get_shadow(swp_entry_t entry)
{
- struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swap_cache_index(entry);
- XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
- unsigned long i, nr = folio_nr_pages(folio);
- void *old;
+ unsigned long swp_tb;
- xas_set_update(&xas, workingset_update_node);
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (swp_tb_is_shadow(swp_tb))
+ return swp_tb_to_shadow(swp_tb);
+ return NULL;
+}
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+/**
+ * swap_cache_add_folio - Add a folio into the swap cache.
+ * @folio: The folio to be added.
+ * @entry: The swap entry corresponding to the folio.
+ * @shadowp: If a shadow is found, return the shadow.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * The caller also needs to update the corresponding swap_map slots with
+ * the SWAP_HAS_CACHE bit to avoid races or conflicts.
+ */
+void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp)
+{
+ void *shadow = NULL;
+ unsigned long old_tb, new_tb;
+ struct swap_cluster_info *ci;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+
+ new_tb = folio_to_swp_tb(folio);
+ ci_start = swp_cluster_offset(entry);
+ ci_end = ci_start + nr_pages;
+ ci_off = ci_start;
+ ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ do {
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(swp_tb_is_folio(old_tb));
+ if (swp_tb_is_shadow(old_tb))
+ shadow = swp_tb_to_shadow(old_tb);
+ } while (++ci_off < ci_end);
- folio_ref_add(folio, nr);
+ folio_ref_add(folio, nr_pages);
folio_set_swapcache(folio);
folio->swap = entry;
+ swap_cluster_unlock(ci);
- do {
- xas_lock_irq(&xas);
- xas_create_range(&xas);
- if (xas_error(&xas))
- goto unlock;
- for (i = 0; i < nr; i++) {
- VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
- if (shadowp) {
- old = xas_load(&xas);
- if (xa_is_value(old))
- *shadowp = old;
- }
- xas_store(&xas, folio);
- xas_next(&xas);
- }
- address_space->nrpages += nr;
- __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
- __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
-unlock:
- xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp));
-
- if (!xas_error(&xas))
- return 0;
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
- folio_clear_swapcache(folio);
- folio_ref_sub(folio, nr);
- return xas_error(&xas);
+ if (shadowp)
+ *shadowp = shadow;
}
-/*
- * This must be called only on folios that have
- * been verified to be in the swap cache.
+/**
+ * __swap_cache_del_folio - Removes a folio from the swap cache.
+ * @ci: The locked swap cluster.
+ * @folio: The folio.
+ * @entry: The first swap entry that the folio corresponds to.
+ * @shadow: shadow value to be filled in the swap cache.
+ *
+ * Removes a folio from the swap cache and fills a shadow in place.
+ * This won't put the folio's refcount. The caller has to do that.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache
+ * using the index of @entry, and lock the cluster that holds the entries.
*/
-void __delete_from_swap_cache(struct folio *folio,
- swp_entry_t entry, void *shadow)
+void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
+ swp_entry_t entry, void *shadow)
{
- struct address_space *address_space = swap_address_space(entry);
- int i;
- long nr = folio_nr_pages(folio);
- pgoff_t idx = swap_cache_index(entry);
- XA_STATE(xas, &address_space->i_pages, idx);
-
- xas_set_update(&xas, workingset_update_node);
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
-
- for (i = 0; i < nr; i++) {
- void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_PAGE(entry != folio, entry);
- xas_next(&xas);
- }
+ unsigned long old_tb, new_tb;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
+
+ new_tb = shadow_swp_to_tb(shadow);
+ ci_start = swp_cluster_offset(entry);
+ ci_end = ci_start + nr_pages;
+ ci_off = ci_start;
+ do {
+		/* If shadow is NULL, we set an empty shadow */
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
+ swp_tb_to_folio(old_tb) != folio);
+ } while (++ci_off < ci_end);
+
folio->swap.val = 0;
folio_clear_swapcache(folio);
- address_space->nrpages -= nr;
- __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
- __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
+ node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
}
-/*
- * This must be called only on folios that have
- * been verified to be in the swap cache and locked.
- * It will never put the folio into the free list,
- * the caller has a reference on the folio.
+/**
+ * swap_cache_del_folio - Removes a folio from the swap cache.
+ * @folio: The folio.
+ *
+ * Same as __swap_cache_del_folio, but handles lock and refcount. The
+ * caller must ensure the folio is either clean or has a swap count
+ * equal to zero, or it may cause data loss.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
*/
-void delete_from_swap_cache(struct folio *folio)
+void swap_cache_del_folio(struct folio *folio)
{
+ struct swap_cluster_info *ci;
swp_entry_t entry = folio->swap;
- struct address_space *address_space = swap_address_space(entry);
- xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(folio, entry, NULL);
- xa_unlock_irq(&address_space->i_pages);
+ ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ __swap_cache_del_folio(ci, folio, entry, NULL);
+ swap_cluster_unlock(ci);
put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
-void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end)
+/**
+ * __swap_cache_replace_folio - Replace a folio in the swap cache.
+ * @ci: The locked swap cluster.
+ * @old: The old folio to be replaced.
+ * @new: The new folio.
+ *
+ * Replace an existing folio in the swap cache with a new folio. The
+ * caller is responsible for setting up the new folio's flag and swap
+ * entries. Replacement will take the new folio's swap entry value as
+ * the starting offset to override all slots covered by the new folio.
+ *
+ * Context: Caller must ensure both folios are locked, and lock the
+ * cluster that holds the old folio to be replaced.
+ */
+void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new)
{
- unsigned long curr = begin;
- void *old;
-
- for (;;) {
- swp_entry_t entry = swp_entry(type, curr);
- unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK;
- struct address_space *address_space = swap_address_space(entry);
- XA_STATE(xas, &address_space->i_pages, index);
-
- xas_set_update(&xas, workingset_update_node);
-
- xa_lock_irq(&address_space->i_pages);
- xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) {
- if (!xa_is_value(old))
- continue;
- xas_store(&xas, NULL);
- }
- xa_unlock_irq(&address_space->i_pages);
+ swp_entry_t entry = new->swap;
+ unsigned long nr_pages = folio_nr_pages(new);
+ unsigned int ci_off = swp_cluster_offset(entry);
+ unsigned int ci_end = ci_off + nr_pages;
+ unsigned long old_tb, new_tb;
+
+ VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
+ VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
+ VM_WARN_ON_ONCE(!entry.val);
+
+ /* Swap cache still stores N entries instead of a high-order entry */
+ new_tb = folio_to_swp_tb(new);
+ do {
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
+ } while (++ci_off < ci_end);
- /* search the next swapcache until we meet end */
- curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
- if (curr > end)
- break;
+ /*
+ * If the old folio is partially replaced (e.g., splitting a large
+ * folio, the old folio is shrunk, and new split sub folios replace
+ * the shrunk part), ensure the new folio doesn't overlap it.
+ */
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ folio_order(old) != folio_order(new)) {
+ ci_off = swp_cluster_offset(old->swap);
+ ci_end = ci_off + folio_nr_pages(old);
+ while (ci_off++ < ci_end)
+ WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old);
}
}
+/**
+ * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
+ * @entry: The starting index entry.
+ * @nr_ents: How many slots need to be cleared.
+ *
+ * Context: Caller must ensure the range is valid, all in one single cluster,
+ * not occupied by any folio, and lock the cluster.
+ */
+void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents)
+{
+ struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+ unsigned int ci_off = swp_cluster_offset(entry), ci_end;
+ unsigned long old;
+
+ ci_end = ci_off + nr_ents;
+ do {
+ old = __swap_table_xchg(ci, ci_off, null_to_swp_tb());
+ WARN_ON_ONCE(swp_tb_is_folio(old));
+ } while (++ci_off < ci_end);
+}
+
/*
* If we are the only user, then try to free up the swap cache.
*
@@ -272,100 +363,50 @@ static inline bool swap_use_vma_readahead(void)
return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}
-/*
- * Lookup a swap entry in the swap cache. A found folio will be returned
- * unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the folio
- * lock before returning.
- *
- * Caller must lock the swap device or hold a reference to keep it valid.
+/**
+ * swap_update_readahead - Update the readahead statistics of VMA or globally.
+ * @folio: the swap cache folio that just got hit.
+ * @vma: the VMA that should be updated, could be NULL for global update.
+ * @addr: the addr that triggered the swapin, ignored if @vma is NULL.
*/
-struct folio *swap_cache_get_folio(swp_entry_t entry,
- struct vm_area_struct *vma, unsigned long addr)
+void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct folio *folio;
-
- folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
- if (!IS_ERR(folio)) {
- bool vma_ra = swap_use_vma_readahead();
- bool readahead;
+ bool readahead, vma_ra = swap_use_vma_readahead();
- /*
- * At the moment, we don't support PG_readahead for anon THP
- * so let's bail out rather than confusing the readahead stat.
- */
- if (unlikely(folio_test_large(folio)))
- return folio;
-
- readahead = folio_test_clear_readahead(folio);
- if (vma && vma_ra) {
- unsigned long ra_val;
- int win, hits;
-
- ra_val = GET_SWAP_RA_VAL(vma);
- win = SWAP_RA_WIN(ra_val);
- hits = SWAP_RA_HITS(ra_val);
- if (readahead)
- hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
- atomic_long_set(&vma->swap_readahead_info,
- SWAP_RA_VAL(addr, win, hits));
- }
-
- if (readahead) {
- count_vm_event(SWAP_RA_HIT);
- if (!vma || !vma_ra)
- atomic_inc(&swapin_readahead_hits);
- }
- } else {
- folio = NULL;
+ /*
+ * At the moment, we don't support PG_readahead for anon THP
+ * so let's bail out rather than confusing the readahead stat.
+ */
+ if (unlikely(folio_test_large(folio)))
+ return;
+
+ readahead = folio_test_clear_readahead(folio);
+ if (vma && vma_ra) {
+ unsigned long ra_val;
+ int win, hits;
+
+ ra_val = GET_SWAP_RA_VAL(vma);
+ win = SWAP_RA_WIN(ra_val);
+ hits = SWAP_RA_HITS(ra_val);
+ if (readahead)
+ hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(addr, win, hits));
}
- return folio;
-}
-
-/**
- * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * This differs from filemap_get_folio() in that it will also look for the
- * folio in the swap cache.
- *
- * Return: The found folio or %NULL.
- */
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
-{
- swp_entry_t swp;
- struct swap_info_struct *si;
- struct folio *folio = filemap_get_entry(mapping, index);
-
- if (!folio)
- return ERR_PTR(-ENOENT);
- if (!xa_is_value(folio))
- return folio;
- if (!shmem_mapping(mapping))
- return ERR_PTR(-ENOENT);
-
- swp = radix_to_swp_entry(folio);
- /* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(swp))
- return ERR_PTR(-ENOENT);
- /* Prevent swapoff from happening to us */
- si = get_swap_device(swp);
- if (!si)
- return ERR_PTR(-ENOENT);
- index = swap_cache_index(swp);
- folio = filemap_get_folio(swap_address_space(swp), index);
- put_swap_device(si);
- return folio;
+ if (readahead) {
+ count_vm_event(SWAP_RA_HIT);
+ if (!vma || !vma_ra)
+ atomic_inc(&swapin_readahead_hits);
+ }
}
struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
{
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
struct folio *folio;
struct folio *new_folio = NULL;
struct folio *result = NULL;
@@ -374,14 +415,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*new_page_allocated = false;
for (;;) {
int err;
+
/*
- * First check the swap cache. Since this is normally
- * called after swap_cache_get_folio() failed, re-calling
- * that would confuse statistics.
+ * Check the swap cache first, if a cached folio is found,
+ * return it unlocked. The caller will lock and check it.
*/
- folio = filemap_get_folio(swap_address_space(entry),
- swap_cache_index(entry));
- if (!IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (folio)
goto got_folio;
/*
@@ -423,7 +463,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
goto put_and_return;
/*
- * We might race against __delete_from_swap_cache(), and
+ * We might race against __swap_cache_del_folio(), and
* stumble across a swap_map entry whose SWAP_HAS_CACHE
* has not yet been cleared. Or race against another
* __read_swap_cache_async(), which has set SWAP_HAS_CACHE
@@ -441,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
goto fail_unlock;
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
- goto fail_unlock;
-
+ swap_cache_add_folio(new_folio, entry, &shadow);
memcg1_swapin(entry, 1);
if (shadow)
@@ -590,7 +627,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
unsigned long mask;
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
struct blk_plug plug;
struct swap_iocb *splug = NULL;
bool page_allocated;
@@ -636,41 +673,6 @@ skip:
return folio;
}
-int init_swap_address_space(unsigned int type, unsigned long nr_pages)
-{
- struct address_space *spaces, *space;
- unsigned int i, nr;
-
- nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
- spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
- if (!spaces)
- return -ENOMEM;
- for (i = 0; i < nr; i++) {
- space = spaces + i;
- xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
- atomic_set(&space->i_mmap_writable, 0);
- space->a_ops = &swap_aops;
- /* swap cache doesn't use writeback related tags */
- mapping_set_no_writeback_tags(space);
- }
- nr_swapper_spaces[type] = nr;
- swapper_spaces[type] = spaces;
-
- return 0;
-}
-
-void exit_swap_address_space(unsigned int type)
-{
- int i;
- struct address_space *spaces = swapper_spaces[type];
-
- for (i = 0; i < nr_swapper_spaces[type]; i++)
- VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
- kvfree(spaces);
- nr_swapper_spaces[type] = 0;
- swapper_spaces[type] = NULL;
-}
-
static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
unsigned long *end)
{
@@ -843,7 +845,7 @@ static const struct attribute_group swap_attr_group = {
.attrs = swap_attrs,
};
-static int __init swap_init_sysfs(void)
+static int __init swap_init(void)
{
int err;
struct kobject *swap_kobj;
@@ -858,11 +860,13 @@ static int __init swap_init_sysfs(void)
pr_err("failed to register swap group\n");
goto delete_obj;
}
+	/* Swap cache writeback is LRU-based; it doesn't use writeback tags */
+ mapping_set_no_writeback_tags(&swap_space);
return 0;
delete_obj:
kobject_put(swap_kobj);
return err;
}
-subsys_initcall(swap_init_sysfs);
+subsys_initcall(swap_init);
#endif
diff --git a/mm/swap_table.h b/mm/swap_table.h
new file mode 100644
index 000000000000..ea244a57a5b7
--- /dev/null
+++ b/mm/swap_table.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_SWAP_TABLE_H
+#define _MM_SWAP_TABLE_H
+
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include "swap.h"
+
+/* A flat array in each cluster, used as the swap table */
+struct swap_table {
+ atomic_long_t entries[SWAPFILE_CLUSTER];
+};
+
+#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
+
+/*
+ * A swap table entry represents the status of a swap slot on a swap
+ * (physical or virtual) device. The swap table in each cluster is a
+ * 1:1 map of the swap slots in this cluster.
+ *
+ * Each swap table entry can be a pointer (folio), an XA_VALUE
+ * (shadow), or NULL.
+ */
+
+/*
+ * Helpers for casting one type of info into a swap table entry.
+ */
+static inline unsigned long null_to_swp_tb(void)
+{
+ BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t));
+ return 0;
+}
+
+static inline unsigned long folio_to_swp_tb(struct folio *folio)
+{
+ BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *));
+ return (unsigned long)folio;
+}
+
+static inline unsigned long shadow_swp_to_tb(void *shadow)
+{
+ BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) !=
+ BITS_PER_BYTE * sizeof(unsigned long));
+ VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow));
+ return (unsigned long)shadow;
+}
+
+/*
+ * Helpers for swap table entry type checking.
+ */
+static inline bool swp_tb_is_null(unsigned long swp_tb)
+{
+ return !swp_tb;
+}
+
+static inline bool swp_tb_is_folio(unsigned long swp_tb)
+{
+ return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb);
+}
+
+static inline bool swp_tb_is_shadow(unsigned long swp_tb)
+{
+ return xa_is_value((void *)swp_tb);
+}
+
+/*
+ * Helpers for retrieving info from swap table.
+ */
+static inline struct folio *swp_tb_to_folio(unsigned long swp_tb)
+{
+ VM_WARN_ON(!swp_tb_is_folio(swp_tb));
+ return (void *)swp_tb;
+}
+
+static inline void *swp_tb_to_shadow(unsigned long swp_tb)
+{
+ VM_WARN_ON(!swp_tb_is_shadow(swp_tb));
+ return (void *)swp_tb;
+}
+
+/*
+ * Helpers for accessing or modifying the swap table of a cluster;
+ * the swap cluster must be locked.
+ */
+static inline void __swap_table_set(struct swap_cluster_info *ci,
+ unsigned int off, unsigned long swp_tb)
+{
+ atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+ atomic_long_set(&table[off], swp_tb);
+}
+
+static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci,
+ unsigned int off, unsigned long swp_tb)
+{
+ atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+	/* Ordering is guaranteed by the cluster lock, so relaxed is enough */
+ return atomic_long_xchg_relaxed(&table[off], swp_tb);
+}
+
+static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
+ unsigned int off)
+{
+ atomic_long_t *table;
+
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+ table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock));
+
+ return atomic_long_read(&table[off]);
+}
+
+static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
+ unsigned int off)
+{
+ atomic_long_t *table;
+ unsigned long swp_tb;
+
+ rcu_read_lock();
+ table = rcu_dereference(ci->table);
+ swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
+ rcu_read_unlock();
+
+ return swp_tb;
+}
+#endif
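
As a rough illustration of the encoding described above, using only the helpers declared in this header, a reader classifies a raw table word as below; swp_tb_dump_example() is a hypothetical name.

/* Hypothetical reader, sketching how one swap table word is interpreted. */
static void swp_tb_dump_example(struct swap_cluster_info *ci, unsigned int off)
{
	unsigned long swp_tb = swap_table_get(ci, off);	/* lockless, RCU-protected */

	if (swp_tb_is_null(swp_tb))
		return;					/* free slot: no folio, no shadow */
	if (swp_tb_is_folio(swp_tb))
		pr_info("folio %p\n", swp_tb_to_folio(swp_tb));
	else if (swp_tb_is_shadow(swp_tb))
		pr_info("shadow %p\n", swp_tb_to_shadow(swp_tb));
}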
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b4f3cc712580..10760240a3a2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -46,6 +46,7 @@
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "swap_table.h"
#include "internal.h"
#include "swap.h"
@@ -58,9 +59,9 @@ static void swap_entries_free(struct swap_info_struct *si,
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
-static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
- unsigned long offset);
-static inline void unlock_cluster(struct swap_cluster_info *ci);
+static void move_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, struct list_head *list,
+ enum swap_cluster_flags new_flags);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -105,7 +106,9 @@ static PLIST_HEAD(swap_active_head);
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
+
+static struct kmem_cache *swap_table_cachep;
static DEFINE_MUTEX(swapon_mutex);
@@ -127,14 +130,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
.lock = INIT_LOCAL_LOCK(),
};
-static struct swap_info_struct *swap_type_to_swap_info(int type)
+/* May return NULL on invalid type, caller must check for NULL return */
+static struct swap_info_struct *swap_type_to_info(int type)
{
if (type >= MAX_SWAPFILES)
return NULL;
-
return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}
+/* May return NULL on invalid entry, caller must check for NULL return */
+static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry)
+{
+ return swap_type_to_info(swp_type(entry));
+}
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
@@ -212,16 +221,15 @@ static bool swap_is_last_map(struct swap_info_struct *si,
static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
- swp_entry_t entry = swp_entry(si->type, offset);
- struct address_space *address_space = swap_address_space(entry);
+ const swp_entry_t entry = swp_entry(si->type, offset);
struct swap_cluster_info *ci;
struct folio *folio;
int ret, nr_pages;
bool need_reclaim;
again:
- folio = filemap_get_folio(address_space, swap_cache_index(entry));
- if (IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (!folio)
return 0;
nr_pages = folio_nr_pages(folio);
@@ -241,13 +249,12 @@ again:
* Offset could point to the middle of a large folio, or folio
* may no longer point to the expected offset before it's locked.
*/
- entry = folio->swap;
- if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
+ if (!folio_matches_swap_entry(folio, entry)) {
folio_unlock(folio);
folio_put(folio);
goto again;
}
- offset = swp_offset(entry);
+ offset = swp_offset(folio->swap);
need_reclaim = ((flags & TTRS_ANYWAY) ||
((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
@@ -260,13 +267,13 @@ again:
* swap_map is HAS_CACHE only, which means the slots have no page table
* reference or pending writeback, and can't be allocated to others.
*/
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
need_reclaim = swap_only_has_cache(si, offset, nr_pages);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (!need_reclaim)
goto out_unlock;
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
folio_set_dirty(folio);
ret = nr_pages;
out_unlock:
@@ -347,7 +354,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
sector_t swap_folio_sector(struct folio *folio)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
struct swap_extent *se;
sector_t sector;
pgoff_t offset;
@@ -387,19 +394,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
}
}
-#ifdef CONFIG_THP_SWAP
-#define SWAPFILE_CLUSTER HPAGE_PMD_NR
-
-#define swap_entry_order(order) (order)
-#else
-#define SWAPFILE_CLUSTER 256
-
-/*
- * Define swap_entry_order() as constant to let compiler to optimize
- * out some code if !CONFIG_THP_SWAP
- */
-#define swap_entry_order(order) 0
-#endif
#define LATENCY_LIMIT 256
static inline bool cluster_is_empty(struct swap_cluster_info *info)
@@ -412,10 +406,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info)
return info->flags == CLUSTER_FLAG_DISCARD;
}
+static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
+{
+ return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
+}
+
static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
return false;
+ if (!cluster_table_is_alloced(ci))
+ return false;
if (!order)
return true;
return cluster_is_empty(ci) || order == ci->order;
@@ -427,32 +428,126 @@ static inline unsigned int cluster_index(struct swap_info_struct *si,
return ci - si->cluster_info;
}
-static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
- unsigned long offset)
-{
- return &si->cluster_info[offset / SWAPFILE_CLUSTER];
-}
-
static inline unsigned int cluster_offset(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}
-static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
- unsigned long offset)
+static struct swap_table *swap_table_alloc(gfp_t gfp)
{
- struct swap_cluster_info *ci;
+ struct folio *folio;
- ci = offset_to_cluster(si, offset);
- spin_lock(&ci->lock);
+ if (!SWP_TABLE_USE_PAGE)
+ return kmem_cache_zalloc(swap_table_cachep, gfp);
- return ci;
+ folio = folio_alloc(gfp | __GFP_ZERO, 0);
+ if (folio)
+ return folio_address(folio);
+ return NULL;
+}
+
+static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
+{
+ struct folio *folio;
+
+ folio = page_folio(container_of(head, struct page, rcu_head));
+ folio_put(folio);
+}
+
+static void swap_table_free(struct swap_table *table)
+{
+ if (!SWP_TABLE_USE_PAGE) {
+ kmem_cache_free(swap_table_cachep, table);
+ return;
+ }
+
+ call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
+ swap_table_free_folio_rcu_cb);
+}
+
+static void swap_cluster_free_table(struct swap_cluster_info *ci)
+{
+ unsigned int ci_off;
+ struct swap_table *table;
+
+	/* Only an empty cluster's table is allowed to be freed */
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(!cluster_is_empty(ci));
+ for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
+ VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
+ table = (void *)rcu_dereference_protected(ci->table, true);
+ rcu_assign_pointer(ci->table, NULL);
+
+ swap_table_free(table);
}
-static inline void unlock_cluster(struct swap_cluster_info *ci)
+/*
+ * Allocate the swap table for one cluster. Attempt an atomic allocation
+ * first, then fall back to a sleeping allocation.
+ */
+static struct swap_cluster_info *
+swap_cluster_alloc_table(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
{
+ struct swap_table *table;
+
+ /*
+	 * Only cluster isolation from the allocator triggers table allocation.
+	 * The swap allocator uses percpu clusters and holds the local lock.
+ */
+ lockdep_assert_held(&ci->lock);
+ lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+
+ /* The cluster must be free and was just isolated from the free list. */
+ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
+
+ table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (table) {
+ rcu_assign_pointer(ci->table, table);
+ return ci;
+ }
+
+ /*
+	 * Try a sleeping allocation. Each isolated free cluster may cause
+	 * one sleeping allocation, but there is a limited number of them,
+	 * so the potential recursive allocation is bounded.
+ */
spin_unlock(&ci->lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_unlock(&si->global_cluster_lock);
+ local_unlock(&percpu_swap_cluster.lock);
+
+ table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
+
+ /*
+	 * Back to atomic context. We might have migrated to a new CPU with a
+	 * usable percpu cluster, but just keep using the isolated cluster to
+	 * make things easier. Migration indicates a slight change of workload,
+	 * so using a new free cluster might not be a bad idea. The worst that
+	 * can happen from ignoring the percpu cluster is fragmentation, which
+	 * is acceptable since this fallback and race are rare.
+ */
+ local_lock(&percpu_swap_cluster.lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_lock(&si->global_cluster_lock);
+ spin_lock(&ci->lock);
+
+ /* Nothing except this helper should touch a dangling empty cluster. */
+ if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
+ if (table)
+ swap_table_free(table);
+ return ci;
+ }
+
+ if (!table) {
+ move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
+ spin_unlock(&ci->lock);
+ return NULL;
+ }
+
+ rcu_assign_pointer(ci->table, table);
+ return ci;
}
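
swap_cluster_alloc_table() above follows a common kernel shape: try an atomic allocation while holding the locks, and only on failure drop every lock, perform a sleeping allocation, reacquire, and re-check for races. A generic, hedged sketch of that skeleton (the names are illustrative, not taken from this file):

/* Generic sketch of an atomic-then-sleeping allocation fallback. */
static void *alloc_with_fallback(spinlock_t *lock, void **slot, size_t size)
{
	void *p = kmalloc(size, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);

	if (p)
		return p;			/* atomic attempt succeeded */

	spin_unlock(lock);			/* drop the lock before sleeping */
	p = kmalloc(size, GFP_KERNEL);
	spin_lock(lock);			/* reacquire, then re-check state */

	if (*slot) {				/* raced: someone else filled the slot */
		kfree(p);			/* kfree(NULL) is a no-op */
		return *slot;
	}
	return p;				/* may still be NULL on failure */
}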
static void move_cluster(struct swap_info_struct *si,
@@ -470,11 +565,6 @@ static void move_cluster(struct swap_info_struct *si,
else
list_move_tail(&ci->list, list);
spin_unlock(&si->lock);
-
- if (ci->flags == CLUSTER_FLAG_FRAG)
- atomic_long_dec(&si->frag_cluster_nr[ci->order]);
- else if (new_flags == CLUSTER_FLAG_FRAG)
- atomic_long_inc(&si->frag_cluster_nr[ci->order]);
ci->flags = new_flags;
}
@@ -489,7 +579,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- lockdep_assert_held(&ci->lock);
+ swap_cluster_free_table(ci);
move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
ci->order = 0;
}
@@ -504,15 +594,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
 * this returns NULL for a non-empty list.
*/
static struct swap_cluster_info *isolate_lock_cluster(
- struct swap_info_struct *si, struct list_head *list)
+ struct swap_info_struct *si, struct list_head *list, int order)
{
- struct swap_cluster_info *ci, *ret = NULL;
+ struct swap_cluster_info *ci, *found = NULL;
spin_lock(&si->lock);
-
- if (unlikely(!(si->flags & SWP_WRITEOK)))
- goto out;
-
list_for_each_entry(ci, list, list) {
if (!spin_trylock(&ci->lock))
continue;
@@ -524,13 +610,19 @@ static struct swap_cluster_info *isolate_lock_cluster(
list_del(&ci->list);
ci->flags = CLUSTER_FLAG_NONE;
- ret = ci;
+ found = ci;
break;
}
-out:
spin_unlock(&si->lock);
- return ret;
+ if (found && !cluster_table_is_alloced(found)) {
+ /* Only an empty free cluster's swap table can be freed. */
+ VM_WARN_ON_ONCE(list != &si->free_clusters);
+ VM_WARN_ON_ONCE(!cluster_is_empty(found));
+ return swap_cluster_alloc_table(si, found);
+ }
+
+ return found;
}
/*
@@ -663,17 +755,27 @@ static void relocate_cluster(struct swap_info_struct *si,
* added to free cluster list and its usage counter will be increased by 1.
* Only used for initialization.
*/
-static void inc_cluster_info_page(struct swap_info_struct *si,
+static int inc_cluster_info_page(struct swap_info_struct *si,
struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+ struct swap_table *table;
struct swap_cluster_info *ci;
ci = cluster_info + idx;
+ if (!ci->table) {
+ table = swap_table_alloc(GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ rcu_assign_pointer(ci->table, table);
+ }
+
ci->count++;
VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
VM_BUG_ON(ci->flags);
+
+ return 0;
}
static bool cluster_reclaim_range(struct swap_info_struct *si,
@@ -742,6 +844,26 @@ static bool cluster_scan_range(struct swap_info_struct *si,
return true;
}
+/*
+ * Currently, the swap table is not used for count tracking, so it should
+ * contain no leftover entries here; do a sanity check to ensure nothing
+ * has leaked.
+ */
+static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci,
+ unsigned int start, unsigned int nr)
+{
+ unsigned int ci_off = start % SWAPFILE_CLUSTER;
+ unsigned int ci_end = ci_off + nr;
+ unsigned long swp_tb;
+
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
+ do {
+ swp_tb = __swap_table_get(ci, ci_off);
+ VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+ } while (++ci_off < ci_end);
+ }
+}
+
static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
unsigned int start, unsigned char usage,
unsigned int order)
@@ -761,6 +883,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
ci->order = order;
memset(si->swap_map + start, usage, nr_pages);
+ swap_cluster_assert_table_empty(ci, start, nr_pages);
swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
@@ -815,7 +938,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
}
out:
relocate_cluster(si, ci);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (si->flags & SWP_SOLIDSTATE) {
this_cpu_write(percpu_swap_cluster.offset[order], next);
this_cpu_write(percpu_swap_cluster.si[order], si);
@@ -825,6 +948,29 @@ out:
return found;
}
+static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
+ struct list_head *list,
+ unsigned int order,
+ unsigned char usage,
+ bool scan_all)
+{
+ unsigned int found = SWAP_ENTRY_INVALID;
+
+ do {
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
+ unsigned long offset;
+
+ if (!ci)
+ break;
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ if (found)
+ break;
+ } while (scan_all);
+
+ return found;
+}
+
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
@@ -836,7 +982,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (force)
to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
@@ -859,7 +1005,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (ci->flags == CLUSTER_FLAG_NONE)
relocate_cluster(si, ci);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (to_scan <= 0)
break;
}
@@ -898,7 +1044,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
if (offset == SWAP_ENTRY_INVALID)
goto new_cluster;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
/* Cluster could have been used by another order */
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
@@ -906,53 +1052,53 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
found = alloc_swap_scan_cluster(si, ci, offset,
order, usage);
} else {
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
}
if (found)
goto done;
}
new_cluster:
- ci = isolate_lock_cluster(si, &si->free_clusters);
- if (ci) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ /*
+	 * If the device needs discard, prefer a new cluster over a nonfull
+	 * one to spread out the writes.
+ */
+ if (si->flags & SWP_PAGE_DISCARD) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
+ if (found)
+ goto done;
+ }
+
+ if (order < PMD_ORDER) {
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[order],
+ order, usage, true);
+ if (found)
+ goto done;
+ }
+
+ if (!(si->flags & SWP_PAGE_DISCARD)) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
if (found)
goto done;
}
- /* Try reclaim from full clusters if free clusters list is drained */
+	/* Try reclaiming full clusters if the free and nonfull lists are drained */
if (vm_swap_full())
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) {
- unsigned int frags = 0, frags_existing;
-
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- /* Clusters failed to allocate are moved to frag_clusters */
- frags++;
- }
-
- frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
- while (frags < frags_existing &&
- (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
- atomic_long_dec(&si->frag_cluster_nr[order]);
- /*
- * Rotate the frag list to iterate, they were all
- * failing high order allocation or moved here due to
- * per-CPU usage, but they could contain newly released
- * reclaimable (eg. lazy-freed swap cache) slots.
- */
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- frags++;
- }
+ /*
+		 * Scanning only one fragment cluster is good enough. Order 0
+		 * allocation will surely succeed, and a large allocation
+		 * failure is not critical. Scanning one cluster still
+		 * keeps the list rotated and reclaimed (for HAS_CACHE).
+ */
+ found = alloc_swap_scan_list(si, &si->frag_clusters[order], order,
+ usage, false);
+ if (found)
+ goto done;
}
/*
@@ -971,24 +1117,20 @@ new_cluster:
 * Clusters here have at least one usable slot and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
- atomic_long_dec(&si->frag_cluster_nr[o]);
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->frag_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
}
done:
if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
+
return found;
}
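
Condensing the branches above, the per-order scan priority of cluster_alloc_swap_entry() after this change can be summarized as follows (a simplification, not the literal control flow):

/*
 * 1. free_clusters            - tried first only when SWP_PAGE_DISCARD,
 *                               to spread writes across the device
 * 2. nonfull_clusters[order]  - scanned exhaustively for order < PMD_ORDER
 * 3. free_clusters            - tried here when the device does not discard
 * 4. full-cluster reclaim     - only when vm_swap_full()
 * 5. frag_clusters[order]     - a single cluster, enough to keep the list
 *                               rotated and reclaimed (for HAS_CACHE)
 * 6. order-0 fallback         - frag and nonfull lists of other orders
 */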
@@ -1145,7 +1287,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify(si->bdev, offset);
offset++;
}
- clear_shadow_from_swap_cache(si->type, begin, end);
+ __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries);
/*
* Make sure that try_to_unuse() observes si->inuse_pages reaching 0
@@ -1192,7 +1334,7 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (!si || !offset || !get_swap_device_info(si))
return false;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
@@ -1200,7 +1342,7 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (found)
*entry = swp_entry(si->type, found);
} else {
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
}
put_swap_device(si);
@@ -1302,16 +1444,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
if (!entry.val)
return -ENOMEM;
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
- goto out_free;
+ swap_cache_add_folio(folio, entry, NULL);
return 0;
@@ -1327,7 +1460,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
if (!entry.val)
goto out;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (!si)
goto bad_nofile;
if (data_race(!(si->flags & SWP_USED)))
@@ -1442,7 +1575,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
if (!entry.val)
goto out;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (!si)
goto bad_nofile;
if (!get_swap_device_info(si))
@@ -1468,14 +1601,14 @@ static void swap_entries_put_cache(struct swap_info_struct *si,
unsigned long offset = swp_offset(entry);
struct swap_cluster_info *ci;
- ci = lock_cluster(si, offset);
- if (swap_only_has_cache(si, offset, nr))
+ ci = swap_cluster_lock(si, offset);
+ if (swap_only_has_cache(si, offset, nr)) {
swap_entries_free(si, ci, entry, nr);
- else {
+ } else {
for (int i = 0; i < nr; i++, entry.val++)
swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
}
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
}
static bool swap_entries_put_map(struct swap_info_struct *si,
@@ -1493,7 +1626,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si,
if (count != 1 && count != SWAP_MAP_SHMEM)
goto fallback;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
goto locked_fallback;
}
@@ -1502,21 +1635,20 @@ static bool swap_entries_put_map(struct swap_info_struct *si,
else
for (i = 0; i < nr; i++)
WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return has_cache;
fallback:
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
locked_fallback:
for (i = 0; i < nr; i++, entry.val++) {
count = swap_entry_put_locked(si, ci, entry, 1);
if (count == SWAP_HAS_CACHE)
has_cache = true;
}
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return has_cache;
-
}
/*
@@ -1566,7 +1698,7 @@ static void swap_entries_free(struct swap_info_struct *si,
unsigned char *map_end = map + nr_pages;
/* It should never free entries across different clusters */
- VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
+ VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
VM_BUG_ON(cluster_is_empty(ci));
VM_BUG_ON(ci->count < nr_pages);
@@ -1578,6 +1710,7 @@ static void swap_entries_free(struct swap_info_struct *si,
mem_cgroup_uncharge_swap(entry, nr_pages);
swap_range_free(si, offset, nr_pages);
+ swap_cluster_assert_table_empty(ci, offset, nr_pages);
if (!ci->count)
free_cluster(si, ci);
@@ -1624,7 +1757,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
int __swap_count(swp_entry_t entry)
{
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
pgoff_t offset = swp_offset(entry);
return swap_count(si->swap_map[offset]);
@@ -1641,9 +1774,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
struct swap_cluster_info *ci;
int count;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return !!count;
}
@@ -1666,7 +1799,7 @@ int swp_swapcount(swp_entry_t entry)
offset = swp_offset(entry);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
@@ -1689,7 +1822,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return count;
}
@@ -1704,7 +1837,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
int i;
bool ret = false;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (nr_pages == 1) {
if (swap_count(map[roffset]))
ret = true;
@@ -1717,7 +1850,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
}
}
unlock_out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return ret;
}
@@ -1781,7 +1914,7 @@ bool folio_free_swap(struct folio *folio)
if (folio_swapped(folio))
return false;
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
folio_set_dirty(folio);
return true;
}
@@ -1855,7 +1988,7 @@ out:
swp_entry_t get_swap_page_of_type(int type)
{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_info_struct *si = swap_type_to_info(type);
unsigned long offset;
swp_entry_t entry = {0};
@@ -1865,7 +1998,13 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
if (si->flags & SWP_WRITEOK) {
+ /*
+			 * Grab the local lock to be compliant
+			 * with swap table allocation.
+ */
+ local_lock(&percpu_swap_cluster.lock);
offset = cluster_alloc_swap_entry(si, 0, 1);
+ local_unlock(&percpu_swap_cluster.lock);
if (offset) {
entry = swp_entry(si->type, offset);
atomic_long_dec(&nr_swap_pages);
@@ -1936,7 +2075,7 @@ int find_first_swap(dev_t *device)
*/
sector_t swapdev_block(int type, pgoff_t offset)
{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_info_struct *si = swap_type_to_info(type);
struct swap_extent *se;
if (!si || !(si->flags & SWP_WRITEOK))
@@ -1992,6 +2131,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
bool hwpoisoned = false;
int ret = 1;
+ /*
+ * If the folio is removed from swap cache by others, continue to
+ * unuse other PTEs. try_to_unuse may try again if we missed this one.
+ */
+ if (!folio_matches_swap_entry(folio, entry))
+ return 0;
+
swapcache = folio;
folio = ksm_might_need_to_copy(folio, vma, addr);
if (unlikely(!folio))
@@ -2118,7 +2264,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap(pte);
pte = NULL;
- folio = swap_cache_get_folio(entry, vma, addr);
+ folio = swap_cache_get_folio(entry);
if (!folio) {
struct vm_fault vmf = {
.vma = vma,
@@ -2243,6 +2389,8 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
+ if (check_stable_address_space(mm))
+ goto unlock;
for_each_vma(vmi, vma) {
if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
ret = unuse_vma(vma, type);
@@ -2252,6 +2400,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
cond_resched();
}
+unlock:
mmap_read_unlock(mm);
return ret;
}
@@ -2344,8 +2493,8 @@ retry:
(i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
- folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
- if (IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (!folio)
continue;
/*
@@ -2644,9 +2793,30 @@ static void wait_for_allocation(struct swap_info_struct *si)
BUG_ON(si->flags & SWP_WRITEOK);
for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
- ci = lock_cluster(si, offset);
- unlock_cluster(ci);
+ ci = swap_cluster_lock(si, offset);
+ swap_cluster_unlock(ci);
+ }
+}
+
+static void free_cluster_info(struct swap_cluster_info *cluster_info,
+ unsigned long maxpages)
+{
+ struct swap_cluster_info *ci;
+ int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+
+ if (!cluster_info)
+ return;
+ for (i = 0; i < nr_clusters; i++) {
+ ci = cluster_info + i;
+		/* Clusters whose count includes bad page marks still have a table */
+ spin_lock(&ci->lock);
+ if (rcu_dereference_protected(ci->table, true)) {
+ ci->count = 0;
+ swap_cluster_free_table(ci);
+ }
+ spin_unlock(&ci->lock);
}
+ kvfree(cluster_info);
}
/*
@@ -2681,6 +2851,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
+ unsigned int maxpages;
int err, found = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -2783,12 +2954,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_file = p->swap_file;
p->swap_file = NULL;
- p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
zeromap = p->zeromap;
p->zeromap = NULL;
+ maxpages = p->max;
cluster_info = p->cluster_info;
+ p->max = 0;
p->cluster_info = NULL;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
@@ -2799,10 +2971,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
- kvfree(cluster_info);
+ free_cluster_info(cluster_info, maxpages);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
- exit_swap_address_space(p->type);
inode = mapping->host;
@@ -2858,7 +3029,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
if (!l)
return SEQ_START_TOKEN;
- for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
+ for (type = 0; (si = swap_type_to_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
@@ -2879,7 +3050,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
type = si->type + 1;
++(*pos);
- for (; (si = swap_type_to_swap_info(type)); type++) {
+ for (; (si = swap_type_to_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
return si;
@@ -3166,21 +3337,14 @@ static int setup_swap_map(struct swap_info_struct *si,
return 0;
}
-#define SWAP_CLUSTER_INFO_COLS \
- DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
-#define SWAP_CLUSTER_SPACE_COLS \
- DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
-#define SWAP_CLUSTER_COLS \
- max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
-
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned long maxpages)
{
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
struct swap_cluster_info *cluster_info;
- unsigned long i, j, idx;
int err = -ENOMEM;
+ unsigned long i;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
@@ -3206,16 +3370,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
- inc_cluster_info_page(si, cluster_info, 0);
+ err = inc_cluster_info_page(si, cluster_info, 0);
+ if (err)
+ goto err;
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr >= maxpages)
continue;
- inc_cluster_info_page(si, cluster_info, page_nr);
+ err = inc_cluster_info_page(si, cluster_info, page_nr);
+ if (err)
+ goto err;
+ }
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
+ err = inc_cluster_info_page(si, cluster_info, i);
+ if (err)
+ goto err;
}
- for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
- inc_cluster_info_page(si, cluster_info, i);
INIT_LIST_HEAD(&si->free_clusters);
INIT_LIST_HEAD(&si->full_clusters);
@@ -3224,34 +3395,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&si->nonfull_clusters[i]);
INIT_LIST_HEAD(&si->frag_clusters[i]);
- atomic_long_set(&si->frag_cluster_nr[i], 0);
}
- /*
- * Reduce false cache line sharing between cluster_info and
- * sharing same address space.
- */
- for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
- for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
- struct swap_cluster_info *ci;
- idx = i * SWAP_CLUSTER_COLS + j;
- ci = cluster_info + idx;
- if (idx >= nr_clusters)
- continue;
- if (ci->count) {
- ci->flags = CLUSTER_FLAG_NONFULL;
- list_add_tail(&ci->list, &si->nonfull_clusters[0]);
- continue;
- }
+ for (i = 0; i < nr_clusters; i++) {
+ struct swap_cluster_info *ci = &cluster_info[i];
+
+ if (ci->count) {
+ ci->flags = CLUSTER_FLAG_NONFULL;
+ list_add_tail(&ci->list, &si->nonfull_clusters[0]);
+ } else {
ci->flags = CLUSTER_FLAG_FREE;
list_add_tail(&ci->list, &si->free_clusters);
}
}
return cluster_info;
-
err_free:
- kvfree(cluster_info);
+ free_cluster_info(cluster_info, maxpages);
err:
return ERR_PTR(err);
}
@@ -3445,13 +3605,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
}
- error = init_swap_address_space(si->type, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
error = zswap_swapon(si->type, maxpages);
if (error)
- goto free_swap_address_space;
+ goto bad_swap_unlock_inode;
/*
* Flush any pending IO and dirty mappings before we start using this
@@ -3486,8 +3642,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto out;
free_swap_zswap:
zswap_swapoff(si->type);
-free_swap_address_space:
- exit_swap_address_space(si->type);
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
@@ -3502,7 +3656,8 @@ bad_swap:
spin_unlock(&swap_lock);
vfree(swap_map);
kvfree(zeromap);
- kvfree(cluster_info);
+ if (cluster_info)
+ free_cluster_info(cluster_info, maxpages);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file)
@@ -3553,7 +3708,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
unsigned char has_cache;
int err, i;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (WARN_ON_ONCE(!si)) {
pr_err("%s%08lx\n", Bad_file, entry.val);
return -EINVAL;
@@ -3562,7 +3717,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
VM_WARN_ON(usage == 1 && nr > 1);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
err = 0;
for (i = 0; i < nr; i++) {
@@ -3617,7 +3772,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
}
unlock_out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return err;
}
@@ -3668,11 +3823,6 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
swap_entries_put_cache(si, entry, nr);
}
-struct swap_info_struct *swp_swap_info(swp_entry_t entry)
-{
- return swap_type_to_swap_info(swp_type(entry));
-}
-
/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
@@ -3716,7 +3866,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
offset = swp_offset(entry);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
@@ -3776,7 +3926,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
out_unlock_cont:
spin_unlock(&si->cont_lock);
out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
put_swap_device(si);
outer:
if (page)
@@ -3950,6 +4100,16 @@ static int __init swapfile_init(void)
swapfile_maximum_size = arch_max_swapfile_size();
+ /*
+	 * Once a cluster is freed, its swap table content is read-only,
+	 * and all swap cache readers (swap_cache_*) verify the content
+	 * before use. So it's safe to use an RCU slab here.
+ */
+ if (!SWP_TABLE_USE_PAGE)
+ swap_table_cachep = kmem_cache_create("swap_table",
+ sizeof(struct swap_table),
+ 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
+
#ifdef CONFIG_MIGRATION
if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
swap_migration_ad_supported = true;
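
The SLAB_TYPESAFE_BY_RCU choice above relies on readers re-validating whatever they find, since a freed swap table may be recycled (but never unmapped) under RCU. A generic, hedged sketch of that reader discipline; struct thing, lookup_slot() and thing_matches() are hypothetical:

/* Generic sketch of the reader discipline SLAB_TYPESAFE_BY_RCU requires. */
static struct thing *typesafe_reader_example(unsigned long key)
{
	struct thing *t;

	rcu_read_lock();
	t = rcu_dereference(lookup_slot(key));	/* memory stays type-stable */
	if (t && !thing_matches(t, key))	/* object may have been recycled */
		t = NULL;
	rcu_read_unlock();

	return t;
}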
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 45e6290e2e8b..af61b95c89e4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1026,18 +1026,64 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}
-static int move_present_pte(struct mm_struct *mm,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- pte_t *dst_pte, pte_t *src_pte,
- pte_t orig_dst_pte, pte_t orig_src_pte,
- pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+/*
+ * Check whether the two PTEs and the corresponding folio are eligible for a
+ * batched move. If so, return a pointer to the locked folio; otherwise return NULL.
+ *
+ * NOTE: a folio reference is not required as the whole operation stays within
+ * the PTL's critical section.
+ */
+static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
+ unsigned long src_addr,
+ pte_t *src_pte, pte_t *dst_pte,
+ struct anon_vma *src_anon_vma)
+{
+ pte_t orig_dst_pte, orig_src_pte;
+ struct folio *folio;
+
+ orig_dst_pte = ptep_get(dst_pte);
+ if (!pte_none(orig_dst_pte))
+ return NULL;
+
+ orig_src_pte = ptep_get(src_pte);
+ if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
+ return NULL;
+
+ folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+ if (!folio || !folio_trylock(folio))
+ return NULL;
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
+ folio_anon_vma(folio) != src_anon_vma) {
+ folio_unlock(folio);
+ return NULL;
+ }
+ return folio;
+}
+
+/*
+ * Moves src folios to dst in a batch as long as they share the same
+ * anon_vma as the first folio, are not large, and can successfully
+ * take the lock via folio_trylock().
+ */
+static long move_present_ptes(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio **first_src_folio, unsigned long len,
+ struct anon_vma *src_anon_vma)
{
int err = 0;
+ struct folio *src_folio = *first_src_folio;
+ unsigned long src_start = src_addr;
+ unsigned long src_end;
+ len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
+ src_end = pmd_addr_end(src_addr, src_addr + len);
+ flush_cache_range(src_vma, src_addr, src_end);
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1051,31 +1097,56 @@ static int move_present_pte(struct mm_struct *mm,
err = -EBUSY;
goto out;
}
+ /* It's safe to drop the reference now as the page-table is holding one. */
+ folio_put(*first_src_folio);
+ *first_src_folio = NULL;
+ arch_enter_lazy_mmu_mode();
+
+ while (true) {
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+ err = -EBUSY;
+ break;
+ }
- orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pte_at(mm, src_addr, src_pte, orig_src_pte);
- err = -EBUSY;
- goto out;
- }
-
- folio_move_anon_rmap(src_folio, dst_vma);
- src_folio->index = linear_page_index(dst_vma, dst_addr);
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
- /* Set soft dirty bit so userspace can notice the pte was moved */
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
#endif
- if (pte_dirty(orig_src_pte))
- orig_dst_pte = pte_mkdirty(orig_dst_pte);
- orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+
+ src_addr += PAGE_SIZE;
+ if (src_addr == src_end)
+ break;
+ dst_addr += PAGE_SIZE;
+ dst_pte++;
+ src_pte++;
- set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+ folio_unlock(src_folio);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
+ dst_pte, src_anon_vma);
+ if (!src_folio)
+ break;
+ }
+
+ arch_leave_lazy_mmu_mode();
+ if (src_addr > src_start)
+ flush_tlb_range(src_vma, src_start, src_addr);
+
+ if (src_folio)
+ folio_unlock(src_folio);
out:
double_pt_unlock(dst_ptl, src_ptl);
- return err;
+ return src_addr > src_start ? src_addr - src_start : err;
}
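
move_present_ptes() now returns the number of bytes moved on success or a negative errno, and the same convention propagates up through move_pages_ptes(). A minimal sketch (a fragment, mirroring the move_pages() loop later in this diff) of how a caller consumes it:

	long ret = move_pages_ptes(mm, dst_pmd, src_pmd, dst_vma, src_vma,
				   dst_addr, src_addr, src_end - src_addr, mode);
	if (ret < 0)
		err = ret;		/* propagate the failure */
	else
		step_size = ret;	/* advance by however many bytes were moved */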
static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
@@ -1140,7 +1211,7 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
static int move_zeropage_pte(struct mm_struct *mm,
@@ -1167,20 +1238,20 @@ static int move_zeropage_pte(struct mm_struct *mm,
set_pte_at(mm, dst_addr, dst_pte, zero_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
/*
- * The mmap_lock for reading is held by the caller. Just move the page
- * from src_pmd to dst_pmd if possible, and return true if succeeded
- * in moving the page.
+ * The mmap_lock for reading is held by the caller. Just move the page(s)
+ * from src_pmd to dst_pmd if possible, and return number of bytes moved.
+ * On failure, an error code is returned.
*/
-static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- __u64 mode)
+static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ unsigned long len, __u64 mode)
{
swp_entry_t entry;
struct swap_info_struct *si = NULL;
@@ -1194,11 +1265,10 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
- int err = 0;
+ long ret = 0;
- flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- src_addr, src_addr + PAGE_SIZE);
+ src_addr, src_addr + len);
mmu_notifier_invalidate_range_start(&range);
retry:
/*
@@ -1212,7 +1282,7 @@ retry:
/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
@@ -1231,14 +1301,14 @@ retry:
* transparent huge pages under us.
*/
if (unlikely(!src_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/* Sanity checks before the operation */
if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -1246,7 +1316,7 @@ retry:
orig_dst_pte = ptep_get(dst_pte);
spin_unlock(dst_ptl);
if (!pte_none(orig_dst_pte)) {
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1255,21 +1325,21 @@ retry:
spin_unlock(src_ptl);
if (pte_none(orig_src_pte)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
- err = -ENOENT;
+ ret = -ENOENT;
else /* nothing to do to move a hole */
- err = 0;
+ ret = PAGE_SIZE;
goto out;
}
	/* If the PTE changed after we locked the folio then start over */
if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (pte_present(orig_src_pte)) {
if (is_zero_pfn(pte_pfn(orig_src_pte))) {
- err = move_zeropage_pte(mm, dst_vma, src_vma,
+ ret = move_zeropage_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte,
dst_pmd, dst_pmdval, dst_ptl, src_ptl);
@@ -1292,14 +1362,14 @@ retry:
spin_lock(src_ptl);
if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
if (!folio || !PageAnonExclusive(&folio->page)) {
spin_unlock(src_ptl);
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
@@ -1313,7 +1383,7 @@ retry:
*/
if (!locked && folio_test_large(folio)) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
@@ -1332,7 +1402,7 @@ retry:
}
if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
}
@@ -1343,8 +1413,8 @@ retry:
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
- err = split_folio(src_folio);
- if (err)
+ ret = split_folio(src_folio);
+ if (ret)
goto out;
/* have to reacquire the folio after it got split */
folio_unlock(src_folio);
@@ -1362,7 +1432,7 @@ retry:
src_anon_vma = folio_get_anon_vma(src_folio);
if (!src_anon_vma) {
/* page was unmapped from under us */
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
@@ -1375,10 +1445,11 @@ retry:
}
}
- err = move_present_pte(mm, dst_vma, src_vma,
- dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl, src_folio);
+ ret = move_present_ptes(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, &src_folio,
+ len, src_anon_vma);
} else {
struct folio *folio = NULL;
@@ -1389,20 +1460,20 @@ retry:
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
- err = -EAGAIN;
+ ret = -EAGAIN;
} else
- err = -EFAULT;
+ ret = -EFAULT;
goto out;
}
if (!pte_swp_exclusive(orig_src_pte)) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
si = get_swap_device(entry);
if (unlikely(!si)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
@@ -1418,11 +1489,10 @@ retry:
* separately to allow proper handling.
*/
if (!src_folio)
- folio = filemap_get_folio(swap_address_space(entry),
- swap_cache_index(entry));
- if (!IS_ERR_OR_NULL(folio)) {
+ folio = swap_cache_get_folio(entry);
+ if (folio) {
if (folio_test_large(folio)) {
- err = -EBUSY;
+ ret = -EBUSY;
folio_put(folio);
goto out;
}
@@ -1439,7 +1509,7 @@ retry:
goto retry;
}
}
- err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
dst_ptl, src_ptl, src_folio, si, entry);
}
@@ -1453,15 +1523,20 @@ out:
folio_unlock(src_folio);
folio_put(src_folio);
}
- if (dst_pte)
- pte_unmap(dst_pte);
+ /*
+ * Unmap in reverse order (LIFO) to maintain proper kmap_local
+ * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
+ * first, then src_pte, so we must unmap src_pte first, then dst_pte.
+ */
if (src_pte)
pte_unmap(src_pte);
+ if (dst_pte)
+ pte_unmap(dst_pte);
mmu_notifier_invalidate_range_end(&range);
if (si)
put_swap_device(si);
- return err;
+ return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1732,7 +1807,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
- unsigned long src_addr, dst_addr;
+ unsigned long src_addr, dst_addr, src_end;
pmd_t *src_pmd, *dst_pmd;
long err = -EINVAL;
ssize_t moved = 0;
@@ -1775,8 +1850,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
if (err)
goto out_unlock;
- for (src_addr = src_start, dst_addr = dst_start;
- src_addr < src_start + len;) {
+ for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
+ src_addr < src_end;) {
spinlock_t *ptl;
pmd_t dst_pmdval;
unsigned long step_size;
@@ -1844,6 +1919,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
dst_addr, src_addr);
step_size = HPAGE_PMD_SIZE;
} else {
+ long ret;
+
if (pmd_none(*src_pmd)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
err = -ENOENT;
@@ -1860,10 +1937,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
break;
}
- err = move_pages_pte(mm, dst_pmd, src_pmd,
- dst_vma, src_vma,
- dst_addr, src_addr, mode);
- step_size = PAGE_SIZE;
+ ret = move_pages_ptes(mm, dst_pmd, src_pmd,
+ dst_vma, src_vma, dst_addr,
+ src_addr, src_end - src_addr, mode);
+ if (ret < 0)
+ err = ret;
+ else
+ step_size = ret;
}
cond_resched();
diff --git a/mm/util.c b/mm/util.c
index f814e6a59ab1..6c1d64ed0221 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -315,7 +315,7 @@ void *memdup_user_nul(const void __user *src, size_t len)
EXPORT_SYMBOL(memdup_user_nul);
/* Check if the vma is being used as a stack by this task */
-int vma_is_stack_for_current(struct vm_area_struct *vma)
+int vma_is_stack_for_current(const struct vm_area_struct *vma)
{
struct task_struct * __maybe_unused t = current;
@@ -410,7 +410,7 @@ unsigned long arch_mmap_rnd(void)
return rnd << PAGE_SHIFT;
}
-static int mmap_is_legacy(struct rlimit *rlim_stack)
+static int mmap_is_legacy(const struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
@@ -431,7 +431,7 @@ static int mmap_is_legacy(struct rlimit *rlim_stack)
#define MIN_GAP (SZ_128M)
#define MAX_GAP (STACK_TOP / 6 * 5)
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
/*
@@ -462,7 +462,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
#endif
}
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
@@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
- set_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_set(MMF_TOPDOWN, mm);
}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
}
#endif
#ifdef CONFIG_MMU
@@ -504,7 +504,7 @@ EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
- struct task_struct *task, bool bypass_rlim)
+ const struct task_struct *task, bool bypass_rlim)
{
unsigned long locked_vm, limit;
int ret = 0;
@@ -688,7 +688,7 @@ struct anon_vma *folio_anon_vma(const struct folio *folio)
* You can call this for folios which aren't in the swap cache or page
* cache and it will return NULL.
*/
-struct address_space *folio_mapping(struct folio *folio)
+struct address_space *folio_mapping(const struct folio *folio)
{
struct address_space *mapping;
@@ -926,7 +926,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin)
{
long allowed;
unsigned long bytes_failed;
@@ -1134,16 +1134,50 @@ EXPORT_SYMBOL(flush_dcache_folio);
#endif
/**
+ * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare()
+ * for details. This is the same operation, only with a specific file operations
+ * struct which may or may not be the same as vma->vm_file->f_op.
+ * @f_op: The file operations whose .mmap_prepare() hook is specified.
+ * @file: The file which backs or will back the mapping.
+ * @vma: The VMA to apply the .mmap_prepare() hook to.
+ * Returns: 0 on success, or an error code on failure.
+ */
+int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+ struct file *file, struct vm_area_struct *vma)
+{
+ struct vm_area_desc desc = {
+ .mm = vma->vm_mm,
+ .file = file,
+ .start = vma->vm_start,
+ .end = vma->vm_end,
+
+ .pgoff = vma->vm_pgoff,
+ .vm_file = vma->vm_file,
+ .vm_flags = vma->vm_flags,
+ .page_prot = vma->vm_page_prot,
+ };
+ int err;
+
+ err = f_op->mmap_prepare(&desc);
+ if (err)
+ return err;
+ set_vma_from_desc(vma, &desc);
+
+ return 0;
+}
+EXPORT_SYMBOL(__compat_vma_mmap_prepare);
+
+/**
* compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
- * existing VMA
- * @file: The file which possesss an f_op->mmap_prepare() hook
+ * existing VMA.
+ * @file: The file which possesses an f_op->mmap_prepare() hook.
* @vma: The VMA to apply the .mmap_prepare() hook to.
*
* Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
- * 'wrapper' file systems invoke a nested mmap hook of an underlying file.
+ * stacked filesystems invoke a nested mmap hook of an underlying file.
*
* Until all filesystems are converted to use .mmap_prepare(), we must be
- * conservative and continue to invoke these 'wrapper' filesystems using the
+ * conservative and continue to invoke these stacked filesystems using the
* deprecated .mmap() hook.
*
* However we have a problem if the underlying file system possesses an
@@ -1161,15 +1195,7 @@ EXPORT_SYMBOL(flush_dcache_folio);
*/
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
{
- struct vm_area_desc desc;
- int err;
-
- err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
- if (err)
- return err;
- set_vma_from_desc(vma, &desc);
-
- return 0;
+ return __compat_vma_mmap_prepare(file->f_op, file, vma);
}
EXPORT_SYMBOL(compat_vma_mmap_prepare);
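
A hedged illustration of the intended call site: a stacked filesystem's legacy .mmap handler forwarding to an underlying file that may itself implement .mmap_prepare(). stackedfs_lower_file() is a hypothetical helper; call_mmap() is the existing f_op->mmap wrapper.

/* Hypothetical stacked-fs .mmap handler, a minimal sketch. */
static int stackedfs_mmap_example(struct file *file, struct vm_area_struct *vma)
{
	struct file *lower = stackedfs_lower_file(file);	/* hypothetical */

	if (lower->f_op->mmap_prepare)
		return compat_vma_mmap_prepare(lower, vma);	/* new-style hook */
	return call_mmap(lower, vma);				/* legacy .mmap hook */
}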
@@ -1281,3 +1307,39 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
}
#endif /* CONFIG_MMU */
+
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+/**
+ * page_range_contiguous - test whether the page range is contiguous
+ * @page: the start of the page range.
+ * @nr_pages: the number of pages in the range.
+ *
+ * Test whether the page range is contiguous, such that it can be iterated
+ * naively, corresponding to iterating a contiguous PFN range.
+ *
+ * This function should primarily only be used for debug checks, or when
+ * working with page ranges that are not naturally contiguous (unlike, e.g.,
+ * pages within a folio, which are).
+ *
+ * Returns true if contiguous, otherwise false.
+ */
+bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
+{
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+
+ /*
+ * The memmap is allocated per memory section, so there is no need to
+ * check within the first section. However, each of the other spanned
+ * memory sections must be checked once, making sure the first page of
+ * each such section can likewise be reached by simply iterating pages.
+ */
+ for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
+ pfn < end_pfn; pfn += PAGES_PER_SECTION)
+ if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
+ return false;
+ return true;
+}
+EXPORT_SYMBOL(page_range_contiguous);
+#endif
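As a usage illustration only (hypothetical caller, not from this patch), page_range_contiguous() lets code choose between naive 'page + i' iteration and per-PFN translation:

static void hypothetical_for_each_page(struct page *page, unsigned long nr_pages,
				       void (*fn)(struct page *))
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long i;

	if (page_range_contiguous(page, nr_pages)) {
		/* The memmap is contiguous here, so 'page + i' is valid. */
		for (i = 0; i < nr_pages; i++)
			fn(page + i);
	} else {
		/* Translate each PFN individually; always correct, just slower. */
		for (i = 0; i < nr_pages; i++)
			fn(pfn_to_page(pfn + i));
	}
}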
diff --git a/mm/vma.c b/mm/vma.c
index 3b12c7579831..abe0da33c844 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2572,11 +2572,12 @@ static int call_mmap_prepare(struct mmap_state *map)
int err;
struct vm_area_desc desc = {
.mm = map->mm,
+ .file = map->file,
.start = map->addr,
.end = map->end,
.pgoff = map->pgoff,
- .file = map->file,
+ .vm_file = map->file,
.vm_flags = map->vm_flags,
.page_prot = map->page_prot,
};
@@ -2588,7 +2589,7 @@ static int call_mmap_prepare(struct mmap_state *map)
/* Update fields permitted to be changed. */
map->pgoff = desc.pgoff;
- map->file = desc.file;
+ map->file = desc.vm_file;
map->vm_flags = desc.vm_flags;
map->page_prot = desc.page_prot;
/* User-defined fields. */
diff --git a/mm/vma.h b/mm/vma.h
index b123a9cdedb0..9183fe549009 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -145,7 +145,7 @@ struct vma_merge_struct {
*/
bool __remove_middle :1;
/*
- * Internal flag used during the merge operationr to indicate we will
+ * Internal flag used during the merge operation to indicate we will
* remove vmg->next.
*/
bool __remove_next :1;
@@ -222,31 +222,11 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
return 0;
}
-
/*
- * Temporary helper functions for file systems which wrap an invocation of
+ * Temporary helper function for stacked mmap handlers which implement
 * f_op->mmap() but whose underlying file system might implement
* f_op->mmap_prepare().
*/
-
-static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
- struct vm_area_desc *desc)
-{
- desc->mm = vma->vm_mm;
- desc->start = vma->vm_start;
- desc->end = vma->vm_end;
-
- desc->pgoff = vma->vm_pgoff;
- desc->file = vma->vm_file;
- desc->vm_flags = vma->vm_flags;
- desc->page_prot = vma->vm_page_prot;
-
- desc->vm_ops = NULL;
- desc->private_data = NULL;
-
- return desc;
-}
-
static inline void set_vma_from_desc(struct vm_area_struct *vma,
struct vm_area_desc *desc)
{
@@ -258,9 +238,9 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma,
/* Mutable fields. Populated with initial state. */
vma->vm_pgoff = desc->pgoff;
- if (vma->vm_file != desc->file)
- vma_set_file(vma, desc->file);
- if (vma->vm_flags != desc->vm_flags)
+ if (desc->vm_file != vma->vm_file)
+ vma_set_file(vma, desc->vm_file);
+ if (desc->vm_flags != vma->vm_flags)
vm_flags_set(vma, desc->vm_flags);
vma->vm_page_prot = desc->page_prot;
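For context, a minimal sketch (hypothetical driver, not part of this patch) of an .mmap_prepare() hook operating on the vm_area_desc fields above; which fields count as mutable follows the set_vma_from_desc() hunk, everything else is an assumption of the example:

static int hypothetical_mmap_prepare(struct vm_area_desc *desc)
{
	/* desc->file is the file whose hook is running; treated as read-only. */
	if (desc->end - desc->start > SZ_16M)
		return -EINVAL;

	/* Mutable fields, copied back into the VMA by set_vma_from_desc(). */
	desc->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	desc->page_prot = vm_get_page_prot(desc->vm_flags);

	return 0;
}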
diff --git a/mm/vma_init.c b/mm/vma_init.c
index 8e53c7943561..3c0b65950510 100644
--- a/mm/vma_init.c
+++ b/mm/vma_init.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared
+ * Functions for initializing, allocating, freeing and duplicating VMAs. Shared
* between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
*/
@@ -16,6 +16,7 @@ void __init vma_state_init(void)
struct kmem_cache_args args = {
.use_freeptr_offset = true,
.freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ .sheaf_capacity = 32,
};
vm_area_cachep = kmem_cache_create("vm_area_struct",
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6dbcdceecae1..798b2ed21e46 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2026,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
+ /* Only reclaim behaviour flags are relevant. */
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
might_sleep();
/*
@@ -2038,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
*/
va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
if (!va) {
- gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
-
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -2057,6 +2057,12 @@ retry:
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
+
+ /*
+ * This is not a fast path. Check if yielding is needed. This
+ * is the only reschedule point in the vmalloc() path.
+ */
+ cond_resched();
}
trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
@@ -2089,7 +2095,7 @@ retry:
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
- ret = kasan_populate_vmalloc(addr, size);
+ ret = kasan_populate_vmalloc(addr, size, gfp_mask);
if (ret) {
free_vmap_area(va);
return ERR_PTR(ret);
@@ -3622,7 +3628,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
pages + nr_allocated);
nr_allocated += nr;
- cond_resched();
/*
* If zero or pages were obtained partly,
@@ -3664,7 +3669,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
for (i = 0; i < (1U << order); i++)
pages[nr_allocated + i] = page + i;
- cond_resched();
nr_allocated += 1U << order;
}
@@ -4089,19 +4093,29 @@ void *vzalloc_node_noprof(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node_noprof);
/**
- * vrealloc - reallocate virtually contiguous memory; contents remain unchanged
+ * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
+ * remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
+ * @align: requested alignment
* @flags: the flags for the page level allocator
+ * @nid: node number of the target node
+ *
+ * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
- * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and
- * @p is not a %NULL pointer, the object pointed to is freed.
+ * If the caller wants the new memory to be placed on a specific node *only*,
+ * the __GFP_THISNODE flag should be set; otherwise the function will try to
+ * avoid reallocation and may disregard the specified @nid.
*
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
* __GFP_ZERO is not fully honored by this API.
*
+ * Requesting an alignment that is bigger than the alignment of the existing
+ * allocation will fail.
+ *
* In any case, the contents of the object pointed to are preserved up to the
* lesser of the new and old sizes.
*
@@ -4111,7 +4125,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
* Return: pointer to the allocated memory; %NULL if @size is zero or in case of
* failure
*/
-void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
struct vm_struct *vm = NULL;
size_t alloced_size = 0;
@@ -4135,6 +4150,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
if (WARN(alloced_size < old_size,
"vrealloc() has mismatched area vs requested sizes (%p)\n", p))
return NULL;
+ if (WARN(!IS_ALIGNED((unsigned long)p, align),
+ "will not reallocate with a bigger alignment (0x%lx)\n", align))
+ return NULL;
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(vmalloc_to_page(p)))
+ goto need_realloc;
}
/*
@@ -4165,8 +4186,10 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
return (void *)p;
}
+need_realloc:
/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
- n = __vmalloc_noprof(size, flags);
+ n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0));
+
if (!n)
return NULL;
@@ -4826,7 +4849,7 @@ retry:
/* populate the kasan shadow space */
for (area = 0; area < nr_vms; area++) {
- if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
goto err_free_shadow;
}
@@ -5177,7 +5200,7 @@ static void vmap_init_nodes(void)
int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
- vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
+ vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
if (vn) {
/* Node partition is 16 pages. */
vmap_zone_size = (1 << 4) * PAGE_SIZE;
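As a hedged usage sketch of the node- and alignment-aware reallocation above: the vrealloc_node_align() wrapper name is assumed to front vrealloc_node_align_noprof() in the usual alloc_hooks() fashion, and the helper itself is purely illustrative.

static void *hypothetical_grow(void *buf, size_t new_size, int nid)
{
	/*
	 * Ask for page alignment (never larger than what the original
	 * vmalloc allocation already provides, so the alignment check
	 * above cannot trip) and prefer, but do not require, node @nid.
	 */
	void *p = vrealloc_node_align(buf, new_size, PAGE_SIZE, GFP_KERNEL, nid);

	/* On failure the old buffer remains valid, so hand it back unchanged. */
	return p ? p : buf;
}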
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a48aec8bfd92..b2fc8b626d3d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -398,14 +398,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
- /*
- * If there are no reclaimable file-backed or anonymous pages,
- * ensure zones with sufficient free pages are not skipped.
- * This prevents zones like DMA32 from being ignored in reclaim
- * scenarios where they can still help alleviate memory pressure.
- */
- if (nr == 0)
- nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
return nr;
}
@@ -525,7 +518,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
*/
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
/*
@@ -737,13 +730,18 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
{
int refcount;
void *shadow = NULL;
+ struct swap_cluster_info *ci;
BUG_ON(!folio_test_locked(folio));
BUG_ON(mapping != folio_mapping(folio));
- if (!folio_test_swapcache(folio))
+ if (folio_test_swapcache(folio)) {
+ ci = swap_cluster_get_and_lock_irq(folio);
+ } else {
spin_lock(&mapping->host->i_lock);
- xa_lock_irq(&mapping->i_pages);
+ xa_lock_irq(&mapping->i_pages);
+ }
+
/*
* The non racy check for a busy folio.
*
@@ -783,9 +781,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __delete_from_swap_cache(folio, swap, shadow);
+ __swap_cache_del_folio(ci, folio, swap, shadow);
memcg1_swapout(folio, swap);
- xa_unlock_irq(&mapping->i_pages);
+ swap_cluster_unlock_irq(ci);
put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
@@ -823,9 +821,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
return 1;
cannot_free:
- xa_unlock_irq(&mapping->i_pages);
- if (!folio_test_swapcache(folio))
+ if (folio_test_swapcache(folio)) {
+ swap_cluster_unlock_irq(ci);
+ } else {
+ xa_unlock_irq(&mapping->i_pages);
spin_unlock(&mapping->host->i_lock);
+ }
return 0;
}
@@ -888,11 +889,11 @@ static bool lru_gen_set_refs(struct folio *folio)
{
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return false;
}
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
return true;
}
#else
@@ -3257,13 +3258,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
/* promote pages accessed through page tables */
static int folio_update_gen(struct folio *folio, int gen)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return -1;
}
@@ -3274,7 +3275,7 @@ static int folio_update_gen(struct folio *folio, int gen)
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
@@ -3285,7 +3286,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
int type = folio_is_file_lru(folio);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
@@ -3302,7 +3303,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
/* for folio_end_writeback() */
if (reclaiming)
new_flags |= BIT(PG_reclaim);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
lru_gen_update_size(lruvec, folio, old_gen, new_gen);
@@ -4507,7 +4508,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* ineligible */
- if (!folio_test_lru(folio) || zone > sc->reclaim_idx) {
+ if (zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4553,7 +4554,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
@@ -4766,7 +4767,7 @@ retry:
/* don't add rejected folios to the oldest generation */
if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -5100,7 +5101,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
blk_finish_plug(&plug);
done:
if (sc->nr_reclaimed > reclaimed)
- pgdat->kswapd_failures = 0;
+ atomic_set(&pgdat->kswapd_failures, 0);
}
/******************************************************************************
@@ -5561,6 +5562,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (memcg_id != mem_cgroup_id(memcg))
goto done;
+ sc->target_mem_cgroup = memcg;
lruvec = get_lruvec(memcg, nid);
if (swappiness < MIN_SWAPPINESS)
@@ -5597,6 +5599,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
.may_swap = true,
.reclaim_idx = MAX_NR_ZONES - 1,
.gfp_mask = GFP_KERNEL,
+ .proactive = true,
};
buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -6177,7 +6180,7 @@ again:
* successful direct reclaim run will revive a dormant kswapd.
*/
if (reclaimable)
- pgdat->kswapd_failures = 0;
+ atomic_set(&pgdat->kswapd_failures, 0);
else if (sc->cache_trim_mode)
sc->cache_trim_mode_failed = 1;
}
@@ -6489,11 +6492,11 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
int i;
bool wmark_ok;
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
- if (!zone_reclaimable_pages(zone))
+ if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -6899,7 +6902,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
wake_up_all(&pgdat->pfmemalloc_wait);
/* Hopeless node, leave it to direct reclaim */
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7167,7 +7170,7 @@ restart:
}
if (!sc.nr_reclaimed)
- pgdat->kswapd_failures++;
+ atomic_inc(&pgdat->kswapd_failures);
out:
clear_reclaim_active(pgdat, highest_zoneidx);
@@ -7426,7 +7429,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
return;
/* Hopeless node, leave it to direct reclaim if possible */
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
(pgdat_balanced(pgdat, order, highest_zoneidx) &&
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 71cd1ceba191..bb09c032eecf 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1280,6 +1280,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA_BALANCING
[I(PGPROMOTE_SUCCESS)] = "pgpromote_success",
[I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate",
+ [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl",
#endif
[I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd",
[I(PGDEMOTE_DIRECT)] = "pgdemote_direct",
@@ -1289,6 +1290,7 @@ const char * const vmstat_text[] = {
[I(NR_HUGETLB)] = "nr_hugetlb",
#endif
[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
+ [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
#undef I
/* system-wide enum vm_stat_item counters */
@@ -1846,7 +1848,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m,
"\n node_unreclaimable: %u"
"\n start_pfn: %lu",
- pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
+ atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
zone->zone_start_pfn);
seq_putc(m, '\n');
}
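For context only, a restating sketch of the pattern the kswapd_failures hunks above (in vmscan.c and vmstat.c) converge on once pgdat->kswapd_failures is an atomic_t; the helpers are illustrative, not new kernel code:

static inline bool hypothetical_node_hopeless(pg_data_t *pgdat)
{
	/* Lockless read; safe against concurrent increments and resets. */
	return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES;
}

static inline void hypothetical_note_reclaim(pg_data_t *pgdat, bool progress)
{
	if (progress)
		atomic_set(&pgdat->kswapd_failures, 0);	/* revive the node */
	else
		atomic_inc(&pgdat->kswapd_failures);
}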
diff --git a/mm/workingset.c b/mm/workingset.c
index 6e7f4cb1b9a7..68a76a91111f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
} else
- set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
index 25bf5ea0beb8..b8258dc78548 100644
--- a/mm/zpdesc.h
+++ b/mm/zpdesc.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* zpdesc.h: zswap.zpool memory descriptor
+/* zpdesc.h: zsmalloc pool memory descriptor
*
* Written by Alex Shi <alexs@kernel.org>
* Hyeonggon Yoo <42.hyeyoo@gmail.com>
@@ -11,14 +11,14 @@
#include <linux/pagemap.h>
/*
- * struct zpdesc - Memory descriptor for zpool memory.
+ * struct zpdesc - Memory descriptor for zsmalloc pool memory.
* @flags: Page flags, mostly unused by zsmalloc.
* @lru: Indirectly used by page migration.
* @movable_ops: Used by page migration.
- * @next: Next zpdesc in a zspage in zsmalloc zpool.
- * @handle: For huge zspage in zsmalloc zpool.
+ * @next: Next zpdesc in a zspage in zsmalloc pool.
+ * @handle: For huge zspage in zsmalloc pool.
* @zspage: Points to the zspage this zpdesc is a part of.
- * @first_obj_offset: First object offset in zsmalloc zpool.
+ * @first_obj_offset: First object offset in zsmalloc pool.
* @_refcount: The number of references to this zpdesc.
*
* This struct overlays struct page for now. Do not modify without a good
@@ -79,8 +79,8 @@ static_assert(sizeof(struct zpdesc) <= sizeof(struct page));
* zpdesc_folio - The folio allocated for a zpdesc
* @zp: The zpdesc.
*
- * Zpdescs are descriptors for zpool memory. The zpool memory itself is
- * allocated as folios that contain the zpool objects, and zpdesc uses specific
+ * Zpdescs are descriptors for zsmalloc memory. The memory itself is allocated
+ * as folios that contain the zsmalloc objects, and zpdesc uses specific
* fields in the first struct page of the folio - those fields are now accessed
* by struct zpdesc.
*
diff --git a/mm/zpool.c b/mm/zpool.c
deleted file mode 100644
index 0a71d03369f1..000000000000
--- a/mm/zpool.c
+++ /dev/null
@@ -1,328 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * zpool memory storage api
- *
- * Copyright (C) 2014 Dan Streetman
- *
- * This is a common frontend for memory storage pool implementations.
- * Typically, this is used to store compressed memory.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/module.h>
-#include <linux/zpool.h>
-
-struct zpool {
- struct zpool_driver *driver;
- void *pool;
-};
-
-static LIST_HEAD(drivers_head);
-static DEFINE_SPINLOCK(drivers_lock);
-
-/**
- * zpool_register_driver() - register a zpool implementation.
- * @driver: driver to register
- */
-void zpool_register_driver(struct zpool_driver *driver)
-{
- spin_lock(&drivers_lock);
- atomic_set(&driver->refcount, 0);
- list_add(&driver->list, &drivers_head);
- spin_unlock(&drivers_lock);
-}
-EXPORT_SYMBOL(zpool_register_driver);
-
-/**
- * zpool_unregister_driver() - unregister a zpool implementation.
- * @driver: driver to unregister.
- *
- * Module usage counting is used to prevent using a driver
- * while/after unloading, so if this is called from module
- * exit function, this should never fail; if called from
- * other than the module exit function, and this returns
- * failure, the driver is in use and must remain available.
- */
-int zpool_unregister_driver(struct zpool_driver *driver)
-{
- int ret = 0, refcount;
-
- spin_lock(&drivers_lock);
- refcount = atomic_read(&driver->refcount);
- WARN_ON(refcount < 0);
- if (refcount > 0)
- ret = -EBUSY;
- else
- list_del(&driver->list);
- spin_unlock(&drivers_lock);
-
- return ret;
-}
-EXPORT_SYMBOL(zpool_unregister_driver);
-
-/* this assumes @type is null-terminated. */
-static struct zpool_driver *zpool_get_driver(const char *type)
-{
- struct zpool_driver *driver;
-
- spin_lock(&drivers_lock);
- list_for_each_entry(driver, &drivers_head, list) {
- if (!strcmp(driver->type, type)) {
- bool got = try_module_get(driver->owner);
-
- if (got)
- atomic_inc(&driver->refcount);
- spin_unlock(&drivers_lock);
- return got ? driver : NULL;
- }
- }
-
- spin_unlock(&drivers_lock);
- return NULL;
-}
-
-static void zpool_put_driver(struct zpool_driver *driver)
-{
- atomic_dec(&driver->refcount);
- module_put(driver->owner);
-}
-
-/**
- * zpool_has_pool() - Check if the pool driver is available
- * @type: The type of the zpool to check (e.g. zsmalloc)
- *
- * This checks if the @type pool driver is available. This will try to load
- * the requested module, if needed, but there is no guarantee the module will
- * still be loaded and available immediately after calling. If this returns
- * true, the caller should assume the pool is available, but must be prepared
- * to handle the @zpool_create_pool() returning failure. However if this
- * returns false, the caller should assume the requested pool type is not
- * available; either the requested pool type module does not exist, or could
- * not be loaded, and calling @zpool_create_pool() with the pool type will
- * fail.
- *
- * The @type string must be null-terminated.
- *
- * Returns: true if @type pool is available, false if not
- */
-bool zpool_has_pool(char *type)
-{
- struct zpool_driver *driver = zpool_get_driver(type);
-
- if (!driver) {
- request_module("zpool-%s", type);
- driver = zpool_get_driver(type);
- }
-
- if (!driver)
- return false;
-
- zpool_put_driver(driver);
- return true;
-}
-EXPORT_SYMBOL(zpool_has_pool);
-
-/**
- * zpool_create_pool() - Create a new zpool
- * @type: The type of the zpool to create (e.g. zsmalloc)
- * @name: The name of the zpool (e.g. zram0, zswap)
- * @gfp: The GFP flags to use when allocating the pool.
- *
- * This creates a new zpool of the specified type. The gfp flags will be
- * used when allocating memory, if the implementation supports it. If the
- * ops param is NULL, then the created zpool will not be evictable.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * The @type and @name strings must be null-terminated.
- *
- * Returns: New zpool on success, NULL on failure.
- */
-struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp)
-{
- struct zpool_driver *driver;
- struct zpool *zpool;
-
- pr_debug("creating pool type %s\n", type);
-
- driver = zpool_get_driver(type);
-
- if (!driver) {
- request_module("zpool-%s", type);
- driver = zpool_get_driver(type);
- }
-
- if (!driver) {
- pr_err("no driver for type %s\n", type);
- return NULL;
- }
-
- zpool = kmalloc(sizeof(*zpool), gfp);
- if (!zpool) {
- pr_err("couldn't create zpool - out of memory\n");
- zpool_put_driver(driver);
- return NULL;
- }
-
- zpool->driver = driver;
- zpool->pool = driver->create(name, gfp);
-
- if (!zpool->pool) {
- pr_err("couldn't create %s pool\n", type);
- zpool_put_driver(driver);
- kfree(zpool);
- return NULL;
- }
-
- pr_debug("created pool type %s\n", type);
-
- return zpool;
-}
-
-/**
- * zpool_destroy_pool() - Destroy a zpool
- * @zpool: The zpool to destroy.
- *
- * Implementations must guarantee this to be thread-safe,
- * however only when destroying different pools. The same
- * pool should only be destroyed once, and should not be used
- * after it is destroyed.
- *
- * This destroys an existing zpool. The zpool should not be in use.
- */
-void zpool_destroy_pool(struct zpool *zpool)
-{
- pr_debug("destroying pool type %s\n", zpool->driver->type);
-
- zpool->driver->destroy(zpool->pool);
- zpool_put_driver(zpool->driver);
- kfree(zpool);
-}
-
-/**
- * zpool_get_type() - Get the type of the zpool
- * @zpool: The zpool to check
- *
- * This returns the type of the pool.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: The type of zpool.
- */
-const char *zpool_get_type(struct zpool *zpool)
-{
- return zpool->driver->type;
-}
-
-/**
- * zpool_malloc() - Allocate memory
- * @zpool: The zpool to allocate from.
- * @size: The amount of memory to allocate.
- * @gfp: The GFP flags to use when allocating memory.
- * @handle: Pointer to the handle to set
- * @nid: The preferred node id.
- *
- * This allocates the requested amount of memory from the pool.
- * The gfp flags will be used when allocating memory, if the
- * implementation supports it. The provided @handle will be
- * set to the allocated object handle. The allocation will
- * prefer the NUMA node specified by @nid.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: 0 on success, negative value on error.
- */
-int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
- unsigned long *handle, const int nid)
-{
- return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid);
-}
-
-/**
- * zpool_free() - Free previously allocated memory
- * @zpool: The zpool that allocated the memory.
- * @handle: The handle to the memory to free.
- *
- * This frees previously allocated memory. This does not guarantee
- * that the pool will actually free memory, only that the memory
- * in the pool will become available for use by the pool.
- *
- * Implementations must guarantee this to be thread-safe,
- * however only when freeing different handles. The same
- * handle should only be freed once, and should not be used
- * after freeing.
- */
-void zpool_free(struct zpool *zpool, unsigned long handle)
-{
- zpool->driver->free(zpool->pool, handle);
-}
-
-/**
- * zpool_obj_read_begin() - Start reading from a previously allocated handle.
- * @zpool: The zpool that the handle was allocated from
- * @handle: The handle to read from
- * @local_copy: A local buffer to use if needed.
- *
- * This starts a read operation of a previously allocated handle. The passed
- * @local_copy buffer may be used if needed by copying the memory into.
- * zpool_obj_read_end() MUST be called after the read is completed to undo any
- * actions taken (e.g. release locks).
- *
- * Returns: A pointer to the handle memory to be read, if @local_copy is used,
- * the returned pointer is @local_copy.
- */
-void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
- void *local_copy)
-{
- return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy);
-}
-
-/**
- * zpool_obj_read_end() - Finish reading from a previously allocated handle.
- * @zpool: The zpool that the handle was allocated from
- * @handle: The handle to read from
- * @handle_mem: The pointer returned by zpool_obj_read_begin()
- *
- * Finishes a read operation previously started by zpool_obj_read_begin().
- */
-void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
- void *handle_mem)
-{
- zpool->driver->obj_read_end(zpool->pool, handle, handle_mem);
-}
-
-/**
- * zpool_obj_write() - Write to a previously allocated handle.
- * @zpool: The zpool that the handle was allocated from
- * @handle: The handle to read from
- * @handle_mem: The memory to copy from into the handle.
- * @mem_len: The length of memory to be written.
- *
- */
-void zpool_obj_write(struct zpool *zpool, unsigned long handle,
- void *handle_mem, size_t mem_len)
-{
- zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len);
-}
-
-/**
- * zpool_get_total_pages() - The total size of the pool
- * @zpool: The zpool to check
- *
- * This returns the total size in pages of the pool.
- *
- * Returns: Total size of the zpool in pages.
- */
-u64 zpool_get_total_pages(struct zpool *zpool)
-{
- return zpool->driver->total_pages(zpool->pool);
-}
-
-MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
-MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 805a10b41266..5bf832f9c05c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -36,7 +36,6 @@
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
-#include <linux/zpool.h>
#include <linux/fs.h>
#include <linux/workqueue.h>
#include "zpdesc.h"
@@ -433,78 +432,6 @@ static void record_obj(unsigned long handle, unsigned long obj)
*(unsigned long *)handle = obj;
}
-/* zpool driver */
-
-#ifdef CONFIG_ZPOOL
-
-static void *zs_zpool_create(const char *name, gfp_t gfp)
-{
- /*
- * Ignore global gfp flags: zs_malloc() may be invoked from
- * different contexts and its caller must provide a valid
- * gfp mask.
- */
- return zs_create_pool(name);
-}
-
-static void zs_zpool_destroy(void *pool)
-{
- zs_destroy_pool(pool);
-}
-
-static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle, const int nid)
-{
- *handle = zs_malloc(pool, size, gfp, nid);
-
- if (IS_ERR_VALUE(*handle))
- return PTR_ERR((void *)*handle);
- return 0;
-}
-static void zs_zpool_free(void *pool, unsigned long handle)
-{
- zs_free(pool, handle);
-}
-
-static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle,
- void *local_copy)
-{
- return zs_obj_read_begin(pool, handle, local_copy);
-}
-
-static void zs_zpool_obj_read_end(void *pool, unsigned long handle,
- void *handle_mem)
-{
- zs_obj_read_end(pool, handle, handle_mem);
-}
-
-static void zs_zpool_obj_write(void *pool, unsigned long handle,
- void *handle_mem, size_t mem_len)
-{
- zs_obj_write(pool, handle, handle_mem, mem_len);
-}
-
-static u64 zs_zpool_total_pages(void *pool)
-{
- return zs_get_total_pages(pool);
-}
-
-static struct zpool_driver zs_zpool_driver = {
- .type = "zsmalloc",
- .owner = THIS_MODULE,
- .create = zs_zpool_create,
- .destroy = zs_zpool_destroy,
- .malloc = zs_zpool_malloc,
- .free = zs_zpool_free,
- .obj_read_begin = zs_zpool_obj_read_begin,
- .obj_read_end = zs_zpool_obj_read_end,
- .obj_write = zs_zpool_obj_write,
- .total_pages = zs_zpool_total_pages,
-};
-
-MODULE_ALIAS("zpool-zsmalloc");
-#endif /* CONFIG_ZPOOL */
-
static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc)
{
return PagePrivate(zpdesc_page(zpdesc));
@@ -1746,7 +1673,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* instead.
*/
if (!zpdesc->zspage)
- return MIGRATEPAGE_SUCCESS;
+ return 0;
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
@@ -1813,7 +1740,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
reset_zpdesc(zpdesc);
zpdesc_put(zpdesc);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
static void zs_page_putback(struct page *page)
@@ -2248,9 +2175,6 @@ static int __init zs_init(void)
{
int rc __maybe_unused;
-#ifdef CONFIG_ZPOOL
- zpool_register_driver(&zs_zpool_driver);
-#endif
#ifdef CONFIG_COMPACTION
rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
if (rc)
@@ -2262,9 +2186,6 @@ static int __init zs_init(void)
static void __exit zs_exit(void)
{
-#ifdef CONFIG_ZPOOL
- zpool_unregister_driver(&zs_zpool_driver);
-#endif
#ifdef CONFIG_COMPACTION
set_movable_ops(NULL, PGTY_zsmalloc);
#endif
diff --git a/mm/zswap.c b/mm/zswap.c
index 3c0fd8a13718..c1af782e54ec 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -25,7 +25,6 @@
#include <linux/scatterlist.h>
#include <linux/mempolicy.h>
#include <linux/mempool.h>
-#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
@@ -35,6 +34,7 @@
#include <linux/pagemap.h>
#include <linux/workqueue.h>
#include <linux/list_lru.h>
+#include <linux/zsmalloc.h>
#include "swap.h"
#include "internal.h"
@@ -42,8 +42,10 @@
/*********************************
* statistics
**********************************/
-/* The number of compressed pages currently stored in zswap */
+/* The number of pages currently stored in zswap */
atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
+/* The number of incompressible pages currently stored in zswap */
+static atomic_long_t zswap_stored_incompressible_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -105,16 +107,6 @@ static const struct kernel_param_ops zswap_compressor_param_ops = {
module_param_cb(compressor, &zswap_compressor_param_ops,
&zswap_compressor, 0644);
-/* Compressed storage zpool to use */
-static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
-static int zswap_zpool_param_set(const char *, const struct kernel_param *);
-static const struct kernel_param_ops zswap_zpool_param_ops = {
- .set = zswap_zpool_param_set,
- .get = param_get_charp,
- .free = param_free_charp,
-};
-module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
-
/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
@@ -159,7 +151,7 @@ struct crypto_acomp_ctx {
* needs to be verified that it's still valid in the tree.
*/
struct zswap_pool {
- struct zpool *zpool;
+ struct zs_pool *zs_pool;
struct crypto_acomp_ctx __percpu *acomp_ctx;
struct percpu_ref ref;
struct list_head list;
@@ -191,7 +183,7 @@ static struct shrinker *zswap_shrinker;
* logic if referenced is unset. See comments in the shrinker
* section for context.
* pool - the zswap_pool the entry's data is in
- * handle - zpool allocation handle that stores the compressed page data
+ * handle - zsmalloc allocation handle that stores the compressed page data
* objcg - the obj_cgroup that the compressed memory is charged to
* lru - handle to the pool's lru used to evict pages.
*/
@@ -212,7 +204,7 @@ static unsigned int nr_zswap_trees[MAX_SWAPFILES];
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
-/* pool counter to provide unique names to zpool */
+/* pool counter to provide unique names to zsmalloc */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);
enum zswap_init_type {
@@ -233,38 +225,31 @@ static bool zswap_has_pool;
* helpers and fwd declarations
**********************************/
+/* One swap address space for each 64M swap space */
+#define ZSWAP_ADDRESS_SPACE_SHIFT 14
+#define ZSWAP_ADDRESS_SPACE_PAGES (1 << ZSWAP_ADDRESS_SPACE_SHIFT)
static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
{
return &zswap_trees[swp_type(swp)][swp_offset(swp)
- >> SWAP_ADDRESS_SPACE_SHIFT];
+ >> ZSWAP_ADDRESS_SPACE_SHIFT];
}
-#define zswap_pool_debug(msg, p) \
- pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
- zpool_get_type((p)->zpool))
+#define zswap_pool_debug(msg, p) \
+ pr_debug("%s pool %s\n", msg, (p)->tfm_name)
/*********************************
* pool functions
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);
-static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
+static struct zswap_pool *zswap_pool_create(char *compressor)
{
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
- gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
int ret, cpu;
- if (!zswap_has_pool) {
- /* if either are unset, pool initialization failed, and we
- * need both params to be set correctly before trying to
- * create a pool.
- */
- if (!strcmp(type, ZSWAP_PARAM_UNSET))
- return NULL;
- if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
- return NULL;
- }
+ if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET))
+ return NULL;
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
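For reference, the 64M figure in the ZSWAP_ADDRESS_SPACE_SHIFT comment earlier in this hunk works out as follows, assuming 4 KiB pages: 1 << 14 = 16384 swap slots per tree, and 16384 * 4 KiB = 64 MiB of swap space covered by each zswap xarray.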
@@ -272,12 +257,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
- pool->zpool = zpool_create_pool(type, name, gfp);
- if (!pool->zpool) {
- pr_err("%s zpool not available\n", type);
+ pool->zs_pool = zs_create_pool(name);
+ if (!pool->zs_pool)
goto error;
- }
- pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
@@ -313,52 +295,29 @@ ref_fail:
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
- if (pool->zpool)
- zpool_destroy_pool(pool->zpool);
+ if (pool->zs_pool)
+ zs_destroy_pool(pool->zs_pool);
kfree(pool);
return NULL;
}
static struct zswap_pool *__zswap_pool_create_fallback(void)
{
- bool has_comp, has_zpool;
-
- has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
- if (!has_comp && strcmp(zswap_compressor,
- CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
+ if (!crypto_has_acomp(zswap_compressor, 0, 0) &&
+ strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
pr_err("compressor %s not available, using default %s\n",
zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
param_free_charp(&zswap_compressor);
zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
- has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
- }
- if (!has_comp) {
- pr_err("default compressor %s not available\n",
- zswap_compressor);
- param_free_charp(&zswap_compressor);
- zswap_compressor = ZSWAP_PARAM_UNSET;
- }
-
- has_zpool = zpool_has_pool(zswap_zpool_type);
- if (!has_zpool && strcmp(zswap_zpool_type,
- CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
- pr_err("zpool %s not available, using default %s\n",
- zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
- param_free_charp(&zswap_zpool_type);
- zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
- has_zpool = zpool_has_pool(zswap_zpool_type);
- }
- if (!has_zpool) {
- pr_err("default zpool %s not available\n",
- zswap_zpool_type);
- param_free_charp(&zswap_zpool_type);
- zswap_zpool_type = ZSWAP_PARAM_UNSET;
}
- if (!has_comp || !has_zpool)
+ /* Default compressor should be available. Kconfig bug? */
+ if (WARN_ON_ONCE(!crypto_has_acomp(zswap_compressor, 0, 0))) {
+ zswap_compressor = ZSWAP_PARAM_UNSET;
return NULL;
+ }
- return zswap_pool_create(zswap_zpool_type, zswap_compressor);
+ return zswap_pool_create(zswap_compressor);
}
static void zswap_pool_destroy(struct zswap_pool *pool)
@@ -368,7 +327,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);
- zpool_destroy_pool(pool->zpool);
+ zs_destroy_pool(pool->zs_pool);
kfree(pool);
}
@@ -460,7 +419,7 @@ static struct zswap_pool *zswap_pool_current_get(void)
}
-/* type and compressor must be null-terminated */
+/* compressor must be null-terminated */
-static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+static struct zswap_pool *zswap_pool_find_get(char *compressor)
{
struct zswap_pool *pool;
@@ -469,8 +428,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
list_for_each_entry_rcu(pool, &zswap_pools, list) {
if (strcmp(pool->tfm_name, compressor))
continue;
- if (strcmp(zpool_get_type(pool->zpool), type))
- continue;
/* if we can't get it, it's about to be destroyed */
if (!zswap_pool_tryget(pool))
continue;
@@ -497,7 +454,7 @@ unsigned long zswap_total_pages(void)
rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list)
- total += zpool_get_total_pages(pool->zpool);
+ total += zs_get_total_pages(pool->zs_pool);
rcu_read_unlock();
return total;
@@ -522,33 +479,22 @@ static bool zswap_check_limits(void)
* param callbacks
**********************************/
-static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
-{
- /* no change required */
- if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
- return false;
- return true;
-}
-
-/* val must be a null-terminated string */
-static int __zswap_param_set(const char *val, const struct kernel_param *kp,
- char *type, char *compressor)
+static int zswap_compressor_param_set(const char *val, const struct kernel_param *kp)
{
struct zswap_pool *pool, *put_pool = NULL;
char *s = strstrip((char *)val);
+ bool create_pool = false;
int ret = 0;
- bool new_pool = false;
mutex_lock(&zswap_init_lock);
switch (zswap_init_state) {
case ZSWAP_UNINIT:
- /* if this is load-time (pre-init) param setting,
- * don't create a pool; that's done during init.
- */
+ /* Handled in zswap_setup() */
ret = param_set_charp(s, kp);
break;
case ZSWAP_INIT_SUCCEED:
- new_pool = zswap_pool_changed(s, kp);
+ if (!zswap_has_pool || strcmp(s, *(char **)kp->arg))
+ create_pool = true;
break;
case ZSWAP_INIT_FAILED:
pr_err("can't set param, initialization failed\n");
@@ -556,30 +502,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
}
mutex_unlock(&zswap_init_lock);
- /* no need to create a new pool, return directly */
- if (!new_pool)
+ if (!create_pool)
return ret;
- if (!type) {
- if (!zpool_has_pool(s)) {
- pr_err("zpool %s not available\n", s);
- return -ENOENT;
- }
- type = s;
- } else if (!compressor) {
- if (!crypto_has_acomp(s, 0, 0)) {
- pr_err("compressor %s not available\n", s);
- return -ENOENT;
- }
- compressor = s;
- } else {
- WARN_ON(1);
- return -EINVAL;
+ if (!crypto_has_acomp(s, 0, 0)) {
+ pr_err("compressor %s not available\n", s);
+ return -ENOENT;
}
spin_lock_bh(&zswap_pools_lock);
- pool = zswap_pool_find_get(type, compressor);
+ pool = zswap_pool_find_get(s);
if (pool) {
zswap_pool_debug("using existing", pool);
WARN_ON(pool == zswap_pool_current());
@@ -589,7 +522,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
spin_unlock_bh(&zswap_pools_lock);
if (!pool)
- pool = zswap_pool_create(type, compressor);
+ pool = zswap_pool_create(s);
else {
/*
* Restore the initial ref dropped by percpu_ref_kill()
@@ -614,7 +547,8 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
list_add_rcu(&pool->list, &zswap_pools);
zswap_has_pool = true;
} else if (pool) {
- /* add the possibly pre-existing pool to the end of the pools
+ /*
+ * Add the possibly pre-existing pool to the end of the pools
* list; if it's new (and empty) then it'll be removed and
* destroyed by the put after we drop the lock
*/
@@ -624,18 +558,8 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
spin_unlock_bh(&zswap_pools_lock);
- if (!zswap_has_pool && !pool) {
- /* if initial pool creation failed, and this pool creation also
- * failed, maybe both compressor and zpool params were bad.
- * Allow changing this param, so pool creation will succeed
- * when the other param is changed. We already verified this
- * param is ok in the zpool_has_pool() or crypto_has_acomp()
- * checks above.
- */
- ret = param_set_charp(s, kp);
- }
-
- /* drop the ref from either the old current pool,
+ /*
+ * Drop the ref from either the old current pool,
* or the new pool we failed to add
*/
if (put_pool)
@@ -644,18 +568,6 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
return ret;
}
-static int zswap_compressor_param_set(const char *val,
- const struct kernel_param *kp)
-{
- return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
-}
-
-static int zswap_zpool_param_set(const char *val,
- const struct kernel_param *kp)
-{
- return __zswap_param_set(val, kp, NULL, zswap_compressor);
-}
-
static int zswap_enabled_param_set(const char *val,
const struct kernel_param *kp)
{
@@ -799,18 +711,20 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
}
/*
- * Carries out the common pattern of freeing and entry's zpool allocation,
+ * Carries out the common pattern of freeing an entry's zsmalloc allocation,
* freeing the entry itself, and decrementing the number of stored pages.
*/
static void zswap_entry_free(struct zswap_entry *entry)
{
zswap_lru_del(&zswap_list_lru, entry);
- zpool_free(entry->pool->zpool, entry->handle);
+ zs_free(entry->pool->zs_pool, entry->handle);
zswap_pool_put(entry->pool);
if (entry->objcg) {
obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
obj_cgroup_put(entry->objcg);
}
+ if (entry->length == PAGE_SIZE)
+ atomic_long_dec(&zswap_stored_incompressible_pages);
zswap_entry_cache_free(entry);
atomic_long_dec(&zswap_stored_pages);
}
@@ -827,7 +741,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
u8 *buffer = NULL;
int ret;
- buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
if (!buffer) {
ret = -ENOMEM;
goto fail;
@@ -945,21 +859,16 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
int comp_ret = 0, alloc_ret = 0;
unsigned int dlen = PAGE_SIZE;
unsigned long handle;
- struct zpool *zpool;
gfp_t gfp;
u8 *dst;
+ bool mapped = false;
acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
- /*
- * We need PAGE_SIZE * 2 here since there maybe over-compression case,
- * and hardware-accelerators may won't check the dst buffer size, so
- * giving the dst buffer with enough length to avoid buffer overflow.
- */
- sg_init_one(&output, dst, PAGE_SIZE * 2);
+ sg_init_one(&output, dst, PAGE_SIZE);
acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
/*
@@ -976,20 +885,41 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
*/
comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- if (comp_ret)
- goto unlock;
- zpool = pool->zpool;
+ /*
+ * If a page cannot be compressed into a size smaller than PAGE_SIZE,
+ * save the content as-is, without compression, to preserve the LRU
+ * order of writebacks. If writeback is disabled, reject the page,
+ * since storing it would only add metadata overhead; swap_writeout()
+ * will put the page back on the active LRU list in that case.
+ */
+ if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
+ dlen = PAGE_SIZE;
+ if (!mem_cgroup_zswap_writeback_enabled(
+ folio_memcg(page_folio(page)))) {
+ comp_ret = comp_ret ? comp_ret : -EINVAL;
+ goto unlock;
+ }
+ comp_ret = 0;
+ dlen = PAGE_SIZE;
+ dst = kmap_local_page(page);
+ mapped = true;
+ }
+
gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
- alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page));
- if (alloc_ret)
+ handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page));
+ if (IS_ERR_VALUE(handle)) {
+ alloc_ret = PTR_ERR((void *)handle);
goto unlock;
+ }
- zpool_obj_write(zpool, handle, dst, dlen);
+ zs_obj_write(pool->zs_pool, handle, dst, dlen);
entry->handle = handle;
entry->length = dlen;
unlock:
+ if (mapped)
+ kunmap_local(dst);
if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
zswap_reject_compress_poor++;
else if (comp_ret)
@@ -1003,17 +933,23 @@ unlock:
static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
{
- struct zpool *zpool = entry->pool->zpool;
+ struct zswap_pool *pool = entry->pool;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- int decomp_ret, dlen;
+ int decomp_ret = 0, dlen = PAGE_SIZE;
u8 *src, *obj;
- acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
- obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+ acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+ obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer);
+
+ /* zswap entries of length PAGE_SIZE are not compressed. */
+ if (entry->length == PAGE_SIZE) {
+ memcpy_to_folio(folio, 0, obj, entry->length);
+ goto read_done;
+ }
/*
- * zpool_obj_read_begin() might return a kmap address of highmem when
+ * zs_obj_read_begin() might return a kmap address of highmem when
* acomp_ctx->buffer is not used. However, sg_init_one() does not
* handle highmem addresses, so copy the object to acomp_ctx->buffer.
*/
@@ -1032,7 +968,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- zpool_obj_read_end(zpool, entry->handle, obj);
+read_done:
+ zs_obj_read_end(pool->zs_pool, entry->handle, obj);
acomp_ctx_put_unlock(acomp_ctx);
if (!decomp_ret && dlen == PAGE_SIZE)
@@ -1135,7 +1072,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
out:
if (ret && ret != -EEXIST) {
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
folio_unlock(folio);
}
folio_put(folio);
@@ -1524,6 +1461,8 @@ static bool zswap_store_page(struct page *page,
obj_cgroup_charge_zswap(objcg, entry->length);
}
atomic_long_inc(&zswap_stored_pages);
+ if (entry->length == PAGE_SIZE)
+ atomic_long_inc(&zswap_stored_incompressible_pages);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1547,7 +1486,7 @@ static bool zswap_store_page(struct page *page,
return true;
store_failed:
- zpool_free(pool->zpool, entry->handle);
+ zs_free(pool->zs_pool, entry->handle);
compress_failed:
zswap_entry_cache_free(entry);
return false;
@@ -1738,7 +1677,7 @@ int zswap_swapon(int type, unsigned long nr_pages)
struct xarray *trees, *tree;
unsigned int nr, i;
- nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ nr = DIV_ROUND_UP(nr_pages, ZSWAP_ADDRESS_SPACE_PAGES);
trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
if (!trees) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
@@ -1792,6 +1731,14 @@ static int debugfs_get_stored_pages(void *data, u64 *val)
}
DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n");
+static int debugfs_get_stored_incompressible_pages(void *data, u64 *val)
+{
+ *val = atomic_long_read(&zswap_stored_incompressible_pages);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(stored_incompressible_pages_fops,
+ debugfs_get_stored_incompressible_pages, NULL, "%llu\n");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1819,6 +1766,9 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_file("stored_pages", 0444,
zswap_debugfs_root, NULL, &stored_pages_fops);
+ debugfs_create_file("stored_incompressible_pages", 0444,
+ zswap_debugfs_root, NULL,
+ &stored_incompressible_pages_fops);
return 0;
}
@@ -1866,8 +1816,7 @@ static int zswap_setup(void)
pool = __zswap_pool_create_fallback();
if (pool) {
- pr_info("loaded using pool %s/%s\n", pool->tfm_name,
- zpool_get_type(pool->zpool));
+ pr_info("loaded using pool %s\n", pool->tfm_name);
list_add(&pool->list, &zswap_pools);
zswap_has_pool = true;
static_branch_enable(&zswap_ever_enabled);