Diffstat (limited to 'mm')
101 files changed, 6511 insertions, 3074 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..0e26f4fc8717 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,9 +9,6 @@ menu "Memory Management options" config ARCH_NO_SWAP bool -config ZPOOL - bool - menuconfig SWAP bool "Support for paging of anonymous memory (swap)" depends on MMU && BLOCK && !ARCH_NO_SWAP @@ -26,7 +23,7 @@ config ZSWAP bool "Compressed cache for swap pages" depends on SWAP select CRYPTO - select ZPOOL + select ZSMALLOC help A lightweight compressed cache for swap pages. It takes pages that are in the process of being swapped out and attempts to @@ -125,45 +122,18 @@ config ZSWAP_COMPRESSOR_DEFAULT default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD default "" -choice - prompt "Default allocator" - depends on ZSWAP - default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU - help - Selects the default allocator for the compressed cache for - swap pages. - The default is 'zbud' for compatibility, however please do - read the description of each of the allocators below before - making a right choice. +config ZSMALLOC + tristate - The selection made here can be overridden by using the kernel - command line 'zswap.zpool=' option. +if ZSMALLOC -config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - bool "zsmalloc" - select ZSMALLOC - help - Use the zsmalloc allocator as the default allocator. -endchoice +menu "Zsmalloc allocator options" + depends on ZSMALLOC -config ZSWAP_ZPOOL_DEFAULT - string - depends on ZSWAP - default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - default "" - -config ZSMALLOC - tristate - prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM) - depends on MMU - help - zsmalloc is a slab-based memory allocator designed to store - pages of various compression levels efficiently. It achieves - the highest storage density with the least amount of fragmentation. +comment "Zsmalloc is a common backend allocator for zswap & zram" config ZSMALLOC_STAT bool "Export zsmalloc statistics" - depends on ZSMALLOC select DEBUG_FS help This option enables code in the zsmalloc to collect various @@ -175,7 +145,6 @@ config ZSMALLOC_CHAIN_SIZE int "Maximum number of physical pages per-zspage" default 8 range 4 16 - depends on ZSMALLOC help This option sets the upper limit on the number of physical pages that a zmalloc page (zspage) can consist of. The optimal zspage @@ -190,10 +159,15 @@ config ZSMALLOC_CHAIN_SIZE For more information, see zsmalloc documentation. +endmenu + +endif + menu "Slab allocator options" config SLUB def_bool y + select IRQ_WORK config KVFREE_RCU_BATCHED def_bool y @@ -439,9 +413,8 @@ config SPARSEMEM_VMEMMAP_ENABLE bool config SPARSEMEM_VMEMMAP - bool "Sparse Memory virtual memmap" + def_bool y depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE - default y help SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise pfn_to_page and page_to_pfn operations. This is the most @@ -776,7 +749,6 @@ config MEMORY_FAILURE depends on MMU depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" - select MEMORY_ISOLATION select RAS help Enables code to recover from some memory failures on systems @@ -823,6 +795,22 @@ config ARCH_WANT_GENERAL_HUGETLB config ARCH_WANTS_THP_SWAP def_bool n +config PERSISTENT_HUGE_ZERO_FOLIO + bool "Allocate a PMD sized folio for zeroing" + depends on TRANSPARENT_HUGEPAGE + help + Enable this option to reduce the runtime refcounting overhead + of the huge zero folio and expand the places in the kernel + that can use huge zero folios. 
For instance, block I/O benefits + from access to large folios for zeroing memory. + + With this option enabled, the huge zero folio is allocated + once and never freed. One full huge page's worth of memory shall + be used. + + Say Y if your system has lots of memory. Say N if you are + memory constrained. + config MM_ID def_bool n @@ -1381,6 +1369,8 @@ config PT_RECLAIM Note: now only empty user PTE page table pages will be reclaimed. +config FIND_NORMAL_PAGE + def_bool n source "mm/damon/Kconfig" diff --git a/mm/Makefile b/mm/Makefile index ef54aa615d9d..21abb3353550 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -115,7 +115,6 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o -obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..128b525b8811 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work) /* * Initial write bandwidth: 100 MB/s */ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) +#define INIT_BW MB_TO_PAGES(100) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) @@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work) wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); + WARN_ON_ONCE(work_pending(&wb->switch_work)); call_rcu(&wb->rcu, cgwb_free_rcu); } @@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); + INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); + init_llist_head(&wb->switch_wbs_ctxs); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); @@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; + INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); + init_llist_head(&bdi->wb.switch_wbs_ctxs); } return ret; } @@ -780,10 +780,8 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, unsigned long count, unsigned int align, struct page **pagep, gfp_t gfp) { - unsigned long mask, offset; - unsigned long pfn = -1; - unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; + unsigned long start, pfn, mask, offset; int ret = -EBUSY; struct page *page = NULL; @@ -795,7 +793,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, if (bitmap_count > bitmap_maxno) goto out; - for (;;) { + for (start = 0; ; start = bitmap_no + mask + 1) { spin_lock_irq(&cma->lock); /* * If the request is larger than the available number @@ -812,6 +810,22 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); break; } + + pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); + page = pfn_to_page(pfn); + + /* + * Do not hand out page ranges that are not contiguous, so + * callers can just iterate the pages without having to worry + * about these corner cases. 
+ */ + if (!page_range_contiguous(page, count)) { + spin_unlock_irq(&cma->lock); + pr_warn_ratelimited("%s: %s: skipping incompatible area [0x%lx-0x%lx]", + __func__, cma->name, pfn, pfn + count - 1); + continue; + } + bitmap_set(cmr->bitmap, bitmap_no, bitmap_count); cma->available_count -= count; /* @@ -821,29 +835,24 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, */ spin_unlock_irq(&cma->lock); - pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma->alloc_mutex); ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); - if (ret == 0) { - page = pfn_to_page(pfn); + if (!ret) break; - } cma_clear_bitmap(cma, cmr, pfn, count); if (ret != -EBUSY) break; pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n", - __func__, pfn, pfn_to_page(pfn)); + __func__, pfn, page); - trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), - count, align); - /* try again with a bit different memory target */ - start = bitmap_no + mask + 1; + trace_cma_alloc_busy_retry(cma->name, pfn, page, count, align); } out: - *pagep = page; + if (!ret) + *pagep = page; return ret; } @@ -864,7 +873,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, if (!count) return page; - trace_cma_alloc_start(name, count, align); + trace_cma_alloc_start(name, count, cma->available_count, cma->count, align); for (r = 0; r < cma->nranges; r++) { page = NULL; @@ -882,7 +891,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(nth_page(page, i)); + page_kasan_tag_reset(page + i); } if (ret && !(gfp & __GFP_NOWARN)) { diff --git a/mm/compaction.c b/mm/compaction.c index bf021b31c7ec..1e8f8eca318c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -989,7 +989,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - low_pfn += folio_nr_pages(folio) - 1; + low_pfn += folio_nr_pages(folio) - folio_page_idx(folio, page) - 1; goto isolate_success_no_list; } diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index b3171f9406c1..8c868f7035fc 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -104,7 +104,7 @@ config DAMON_STAT config DAMON_STAT_ENABLED_DEFAULT bool "Enable DAMON_STAT by default" - depends on DAMON_PADDR + depends on DAMON_STAT default DAMON_STAT help Whether to enable DAMON_STAT by default. Users can disable it in diff --git a/mm/damon/core.c b/mm/damon/core.c index 70eff5cbe6ee..93848b4c6944 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * @t: the given target. * @ranges: array of new monitoring target ranges. * @nr_ranges: length of @ranges. + * @min_sz_region: minimum region size. * * This function adds new regions to, or modify existing regions of a * monitoring target to fit in specific ranges. @@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * Return: 0 if success, or negative error code otherwise. 
*/ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges) + unsigned int nr_ranges, unsigned long min_sz_region) { struct damon_region *r, *next; unsigned int i; @@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, /* no region intersects with this range */ newr = damon_new_region( ALIGN_DOWN(range->start, - DAMON_MIN_REGION), - ALIGN(range->end, DAMON_MIN_REGION)); + min_sz_region), + ALIGN(range->end, min_sz_region)); if (!newr) return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, - DAMON_MIN_REGION); - last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + min_sz_region); + last->ar.end = ALIGN(range->end, min_sz_region); /* fill possible holes in the range */ err = damon_fill_regions_holes(first, last, t); @@ -544,6 +545,9 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; + ctx->addr_unit = 1; + ctx->min_sz_region = DAMON_MIN_REGION; + INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -570,6 +574,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +static bool damon_attrs_equals(const struct damon_attrs *attrs1, + const struct damon_attrs *attrs2) +{ + const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal; + const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal; + + return attrs1->sample_interval == attrs2->sample_interval && + attrs1->aggr_interval == attrs2->aggr_interval && + attrs1->ops_update_interval == attrs2->ops_update_interval && + attrs1->min_nr_regions == attrs2->min_nr_regions && + attrs1->max_nr_regions == attrs2->max_nr_regions && + ig1->access_bp == ig2->access_bp && + ig1->aggrs == ig2->aggrs && + ig1->min_sample_us == ig2->min_sample_us && + ig1->max_sample_us == ig2->max_sample_us; +} + static unsigned int damon_age_for_new_attrs(unsigned int age, struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) { @@ -1108,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx) * * If @src has no region, @dst keeps current regions. 
*/ -static int damon_commit_target_regions( - struct damon_target *dst, struct damon_target *src) +static int damon_commit_target_regions(struct damon_target *dst, + struct damon_target *src, unsigned long src_min_sz_region) { struct damon_region *src_region; struct damon_addr_range *ranges; @@ -1126,18 +1147,19 @@ static int damon_commit_target_regions( i = 0; damon_for_each_region(src_region, src) ranges[i++] = src_region->ar; - err = damon_set_regions(dst, ranges, i); + err = damon_set_regions(dst, ranges, i, src_min_sz_region); kfree(ranges); return err; } static int damon_commit_target( struct damon_target *dst, bool dst_has_pid, - struct damon_target *src, bool src_has_pid) + struct damon_target *src, bool src_has_pid, + unsigned long src_min_sz_region) { int err; - err = damon_commit_target_regions(dst, src); + err = damon_commit_target_regions(dst, src, src_min_sz_region); if (err) return err; if (dst_has_pid) @@ -1159,7 +1181,8 @@ static int damon_commit_targets( if (src_target) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) return err; } else { @@ -1182,7 +1205,8 @@ static int damon_commit_targets( if (!new_target) return -ENOMEM; err = damon_commit_target(new_target, false, - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) { damon_destroy_target(new_target, NULL); return err; @@ -1222,10 +1246,14 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) * 2. ops update should be done after pid handling is done (target * committing require putting pids). */ - err = damon_set_attrs(dst, &src->attrs); - if (err) - return err; + if (!damon_attrs_equals(&dst->attrs, &src->attrs)) { + err = damon_set_attrs(dst, &src->attrs); + if (err) + return err; + } dst->ops = src->ops; + dst->addr_unit = src->addr_unit; + dst->min_sz_region = src->min_sz_region; return 0; } @@ -1258,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION) - sz = DAMON_MIN_REGION; + if (sz < ctx->min_sz_region) + sz = ctx->min_sz_region; return sz; } @@ -1603,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * @t: The target of the region. * @rp: The pointer to the region. * @s: The scheme to be applied. + * @min_sz_region: minimum region size. * * If a quota of a scheme has exceeded in a quota charge window, the scheme's * action would applied to only a part of the target access pattern fulfilling @@ -1620,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * Return: true if the region should be entirely skipped, false otherwise. 
*/ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s) + struct damon_region **rp, struct damos *s, unsigned long min_sz_region) { struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; @@ -1642,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t, if (quota->charge_addr_from && r->ar.start < quota->charge_addr_from) { sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); + r->ar.start, min_sz_region); if (!sz_to_skip) { - if (damon_sz_region(r) <= DAMON_MIN_REGION) + if (damon_sz_region(r) <= min_sz_region) return true; - sz_to_skip = DAMON_MIN_REGION; + sz_to_skip = min_sz_region; } damon_split_region_at(t, r, sz_to_skip); r = damon_next_region(r); @@ -1671,7 +1700,8 @@ static void damos_update_stat(struct damos *s, } static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos_filter *filter) + struct damon_region *r, struct damos_filter *filter, + unsigned long min_sz_region) { bool matched = false; struct damon_target *ti; @@ -1688,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, matched = target_idx == filter->target_idx; break; case DAMOS_FILTER_TYPE_ADDR: - start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); - end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); + start = ALIGN_DOWN(filter->addr_range.start, min_sz_region); + end = ALIGN_DOWN(filter->addr_range.end, min_sz_region); /* inside the range */ if (start <= r->ar.start && r->ar.end <= end) { @@ -1725,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter)) { + if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { if (filter->allow) s->core_filters_allowed = true; return !filter->allow; @@ -1860,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); + c->min_sz_region); if (!sz) goto update_stat; damon_split_region_at(t, r, sz); @@ -1908,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s)) + if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; if (!damos_valid_target(c, t, r, s)) @@ -2073,8 +2103,8 @@ static void damos_set_effective_quota(struct damos_quota *quota) if (quota->ms) { if (quota->total_charged_ns) - throughput = quota->total_charged_sz * 1000000 / - quota->total_charged_ns; + throughput = mult_frac(quota->total_charged_sz, 1000000, + quota->total_charged_ns); else throughput = PAGE_SIZE * 1024; esz = min(throughput * quota->ms, esz); @@ -2111,6 +2141,12 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) if (!quota->ms && !quota->sz && list_empty("a->goals)) return; + /* First charge window */ + if (!quota->total_charged_sz && !quota->charged_from) { + quota->charged_from = jiffies; + damos_set_effective_quota(quota); + } + /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { @@ -2227,6 +2263,8 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, damon_for_each_region_safe(r, next, t) { 
if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; + else if ((r->nr_accesses == 0) != (r->last_nr_accesses == 0)) + r->age = 0; else r->age++; @@ -2302,7 +2340,8 @@ static void damon_split_region_at(struct damon_target *t, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_target *t, int nr_subs) +static void damon_split_regions_of(struct damon_target *t, int nr_subs, + unsigned long min_sz_region) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2312,13 +2351,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && - sz_region > 2 * DAMON_MIN_REGION; i++) { + sz_region > 2 * min_sz_region; i++) { /* * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ sz_sub = ALIGN_DOWN(damon_rand(1, 10) * - sz_region / 10, DAMON_MIN_REGION); + sz_region / 10, min_sz_region); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) continue; @@ -2358,7 +2397,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions); + damon_split_regions_of(t, nr_subregions, ctx->min_sz_region); last_nr_regions = nr_regions; } @@ -2475,10 +2514,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) mutex_lock(&ctx->call_controls_lock); list_del(&control->list); mutex_unlock(&ctx->call_controls_lock); - if (!control->repeat) + if (!control->repeat) { complete(&control->completion); - else + } else if (control->canceled && control->dealloc_on_cancel) { + kfree(control); + continue; + } else { list_add(&control->list, &repeat_controls); + } } control = list_first_entry_or_null(&repeat_controls, struct damon_call_control, list); @@ -2747,7 +2790,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1); + return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); } /* @@ -2820,6 +2863,16 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, r->nr_accesses++; } +/** + * damon_initialized() - Return if DAMON is ready to be used. + * + * Return: true if DAMON is ready to be used, false otherwise. + */ +bool damon_initialized(void) +{ + return damon_region_cache != NULL; +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 151a9de5ad8b..42b9a656f9de 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -112,6 +112,13 @@ static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); /* + * Scale factor for DAMON_LRU_SORT to ops address conversion. + * + * This parameter must not be set to 0. + */ +static unsigned long addr_unit __read_mostly = 1; + +/* * PID of the DAMON thread * * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. @@ -198,7 +205,21 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); + /* + * If monitor_region_start/end are unset, always silently + * reset addr_unit to 1. 
+ */ + if (!monitor_region_start && !monitor_region_end) + addr_unit = 1; + param_ctx->addr_unit = addr_unit; + param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1); + + if (!damon_lru_sort_mon_attrs.sample_interval) { + err = -EINVAL; + goto out; + } + + err = damon_set_attrs(param_ctx, &damon_lru_sort_mon_attrs); if (err) goto out; @@ -285,6 +306,30 @@ static int damon_lru_sort_turn(bool on) return damon_call(ctx, &call_control); } +static int damon_lru_sort_addr_unit_store(const char *val, + const struct kernel_param *kp) +{ + unsigned long input_addr_unit; + int err = kstrtoul(val, 0, &input_addr_unit); + + if (err) + return err; + if (!input_addr_unit) + return -EINVAL; + + addr_unit = input_addr_unit; + return 0; +} + +static const struct kernel_param_ops addr_unit_param_ops = { + .set = damon_lru_sort_addr_unit_store, + .get = param_get_ulong, +}; + +module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600); +MODULE_PARM_DESC(addr_unit, + "Scale factor for DAMON_LRU_SORT to ops address conversion (default: 1)"); + static int damon_lru_sort_enabled_store(const char *val, const struct kernel_param *kp) { @@ -300,7 +345,7 @@ static int damon_lru_sort_enabled_store(const char *val, return 0; /* Called before init function. The function will handle this. */ - if (!ctx) + if (!damon_initialized()) goto set_param_out; err = damon_lru_sort_turn(enable); @@ -323,8 +368,13 @@ MODULE_PARM_DESC(enabled, static int __init damon_lru_sort_init(void) { - int err = damon_modules_new_paddr_ctx_target(&ctx, &target); + int err; + if (!damon_initialized()) { + err = -ENOMEM; + goto out; + } + err = damon_modules_new_paddr_ctx_target(&ctx, &target); if (err) goto out; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 99321ff5cb92..998c5180a603 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -303,7 +303,7 @@ static unsigned int __damon_migrate_folio_list( * instead of migrated. */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | - __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, + __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, }; @@ -412,3 +412,12 @@ unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid) return nr_migrated; } + +bool damos_ops_has_filter(struct damos *s) +{ + struct damos_filter *f; + + damos_for_each_ops_filter(f, s) + return true; + return false; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 61ad54aaf256..5efa5b5970de 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -21,3 +21,5 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio); unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid); + +bool damos_ops_has_filter(struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 53a55c5114fb..07a8aead439e 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -18,7 +18,26 @@ #include "../internal.h" #include "ops-common.h" -static void damon_pa_mkold(unsigned long paddr) +static phys_addr_t damon_pa_phys_addr( + unsigned long addr, unsigned long addr_unit) +{ + return (phys_addr_t)addr * addr_unit; +} + +static unsigned long damon_pa_core_addr( + phys_addr_t pa, unsigned long addr_unit) +{ + /* + * Use div_u64() for avoiding linking errors related with __udivdi3, + * __aeabi_uldivmod, or similar problems. This should also improve the + * performance optimization (read div_u64() comment for the detail). 
+ */ + if (sizeof(pa) == 8 && sizeof(addr_unit) == 4) + return div_u64(pa, addr_unit); + return pa / addr_unit; +} + +static void damon_pa_mkold(phys_addr_t paddr) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); @@ -29,11 +48,12 @@ static void damon_pa_mkold(unsigned long paddr) folio_put(folio); } -static void __damon_pa_prepare_access_check(struct damon_region *r) +static void __damon_pa_prepare_access_check(struct damon_region *r, + unsigned long addr_unit) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); - damon_pa_mkold(r->sampling_addr); + damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit)); } static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) @@ -43,11 +63,11 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(r); + __damon_pa_prepare_access_check(r, ctx->addr_unit); } } -static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) +static bool damon_pa_young(phys_addr_t paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); bool accessed; @@ -62,23 +82,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) } static void __damon_pa_check_access(struct damon_region *r, - struct damon_attrs *attrs) + struct damon_attrs *attrs, unsigned long addr_unit) { - static unsigned long last_addr; + static phys_addr_t last_addr; static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; + phys_addr_t sampling_addr = damon_pa_phys_addr( + r->sampling_addr, addr_unit); /* If the region is in the last checked page, reuse the result */ if (ALIGN_DOWN(last_addr, last_folio_sz) == - ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { + ALIGN_DOWN(sampling_addr, last_folio_sz)) { damon_update_region_access_rate(r, last_accessed, attrs); return; } - last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); + last_accessed = damon_pa_young(sampling_addr, &last_folio_sz); damon_update_region_access_rate(r, last_accessed, attrs); - last_addr = r->sampling_addr; + last_addr = sampling_addr; } static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) @@ -89,7 +111,8 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(r, &ctx->attrs); + __damon_pa_check_access( + r, &ctx->attrs, ctx->addr_unit); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } @@ -125,10 +148,11 @@ static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s) return false; } -static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_pageout(struct damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr, applied; + phys_addr_t addr, applied; LIST_HEAD(folio_list); bool install_young_filter = true; struct damos_filter *filter; @@ -149,8 +173,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, damos_add_filter(s, filter); } - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -160,7 +184,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, if (damos_pa_filter_out(s, folio)) 
goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; folio_clear_referenced(folio); folio_test_clear_young(folio); @@ -179,18 +203,19 @@ put_folio: applied = reclaim_pages(&folio_list); cond_resched(); s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, struct damos *s, bool mark_accessed, + struct damon_region *r, unsigned long addr_unit, + struct damos *s, bool mark_accessed, unsigned long *sz_filter_passed) { - unsigned long addr, applied = 0; + phys_addr_t addr, applied = 0; struct folio *folio; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -200,7 +225,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (damos_pa_filter_out(s, folio)) goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; if (mark_accessed) folio_mark_accessed(folio); @@ -212,32 +237,35 @@ put_folio: folio_put(folio); } s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } static unsigned long damon_pa_mark_accessed(struct damon_region *r, - struct damos *s, unsigned long *sz_filter_passed) + unsigned long addr_unit, struct damos *s, + unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, true, + return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true, sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, - struct damos *s, unsigned long *sz_filter_passed) + unsigned long addr_unit, struct damos *s, + unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, false, + return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false, sz_filter_passed); } -static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_migrate(struct damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr, applied; + phys_addr_t addr, applied; LIST_HEAD(folio_list); struct folio *folio; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -247,7 +275,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, if (damos_pa_filter_out(s, folio)) goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; if (!folio_isolate_lru(folio)) goto put_folio; @@ -259,29 +287,21 @@ put_folio: applied = damon_migrate_pages(&folio_list, s->target_nid); cond_resched(); s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } -static bool damon_pa_scheme_has_filter(struct damos *s) -{ - struct damos_filter *f; - - damos_for_each_ops_filter(f, s) - return true; - return false; -} - -static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_stat(struct 
damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr; + phys_addr_t addr; struct folio *folio; - if (!damon_pa_scheme_has_filter(s)) + if (!damos_ops_has_filter(s)) return 0; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -289,7 +309,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, } if (!damos_pa_filter_out(s, folio)) - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; addr += folio_size(folio); folio_put(folio); } @@ -301,18 +321,22 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) { + unsigned long aunit = ctx->addr_unit; + switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r, scheme, sz_filter_passed); + return damon_pa_pageout(r, aunit, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, scheme, sz_filter_passed); + return damon_pa_mark_accessed(r, aunit, scheme, + sz_filter_passed); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r, scheme, sz_filter_passed); + return damon_pa_deactivate_pages(r, aunit, scheme, + sz_filter_passed); case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: - return damon_pa_migrate(r, scheme, sz_filter_passed); + return damon_pa_migrate(r, aunit, scheme, sz_filter_passed); case DAMOS_STAT: - return damon_pa_stat(r, scheme, sz_filter_passed); + return damon_pa_stat(r, aunit, scheme, sz_filter_passed); default: /* DAMOS actions that not yet supported by 'paddr'. */ break; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3c71b4596676..7ba3d0f9a19a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -129,6 +129,13 @@ static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); /* + * Scale factor for DAMON_RECLAIM to ops address conversion. + * + * This parameter must not be set to 0. + */ +static unsigned long addr_unit __read_mostly = 1; + +/* * Skip anonymous pages reclamation. * * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous @@ -194,6 +201,20 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; + /* + * If monitor_region_start/end are unset, always silently + * reset addr_unit to 1. 
+ */ + if (!monitor_region_start && !monitor_region_end) + addr_unit = 1; + param_ctx->addr_unit = addr_unit; + param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1); + + if (!damon_reclaim_mon_attrs.aggr_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); if (err) goto out; @@ -289,6 +310,30 @@ static int damon_reclaim_turn(bool on) return damon_call(ctx, &call_control); } +static int damon_reclaim_addr_unit_store(const char *val, + const struct kernel_param *kp) +{ + unsigned long input_addr_unit; + int err = kstrtoul(val, 0, &input_addr_unit); + + if (err) + return err; + if (!input_addr_unit) + return -EINVAL; + + addr_unit = input_addr_unit; + return 0; +} + +static const struct kernel_param_ops addr_unit_param_ops = { + .set = damon_reclaim_addr_unit_store, + .get = param_get_ulong, +}; + +module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600); +MODULE_PARM_DESC(addr_unit, + "Scale factor for DAMON_RECLAIM to ops address conversion (default: 1)"); + static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { @@ -304,7 +349,7 @@ static int damon_reclaim_enabled_store(const char *val, return 0; /* Called before init function. The function will handle this. */ - if (!ctx) + if (!damon_initialized()) goto set_param_out; err = damon_reclaim_turn(enable); @@ -327,8 +372,13 @@ MODULE_PARM_DESC(enabled, static int __init damon_reclaim_init(void) { - int err = damon_modules_new_paddr_ctx_target(&ctx, &target); + int err; + if (!damon_initialized()) { + err = -ENOMEM; + goto out; + } + err = damon_modules_new_paddr_ctx_target(&ctx, &target); if (err) goto out; diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 87bcd8866d4b..d8010968bbed 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -34,11 +34,16 @@ module_param(estimated_memory_bandwidth, ulong, 0400); MODULE_PARM_DESC(estimated_memory_bandwidth, "Estimated memory bandwidth usage in bytes per second"); -static unsigned long memory_idle_ms_percentiles[101] __read_mostly = {0,}; -module_param_array(memory_idle_ms_percentiles, ulong, NULL, 0400); +static long memory_idle_ms_percentiles[101] __read_mostly = {0,}; +module_param_array(memory_idle_ms_percentiles, long, NULL, 0400); MODULE_PARM_DESC(memory_idle_ms_percentiles, "Memory idle time percentiles in milliseconds"); +static unsigned long aggr_interval_us; +module_param(aggr_interval_us, ulong, 0400); +MODULE_PARM_DESC(aggr_interval_us, + "Current tuned aggregation interval in microseconds"); + static struct damon_ctx *damon_stat_context; static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) @@ -56,10 +61,10 @@ static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) MSEC_PER_SEC / c->attrs.aggr_interval; } -static unsigned int damon_stat_idletime(const struct damon_region *r) +static int damon_stat_idletime(const struct damon_region *r) { if (r->nr_accesses) - return 0; + return -1 * (r->age + 1); return r->age + 1; } @@ -117,7 +122,7 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c) while (next_percentile <= accounted_bytes * 100 / total_sz) memory_idle_ms_percentiles[next_percentile++] = damon_stat_idletime(region) * - c->attrs.aggr_interval / USEC_PER_MSEC; + (long)c->attrs.aggr_interval / USEC_PER_MSEC; } kfree(sorted_regions); } @@ -133,6 +138,7 @@ static int damon_stat_damon_call_fn(void *data) return 0; last_refresh_jiffies = jiffies; + aggr_interval_us = c->attrs.aggr_interval; 
damon_stat_set_estimated_memory_bandwidth(c); damon_stat_set_idletime_percentiles(c); return 0; @@ -214,8 +220,6 @@ static void damon_stat_stop(void) damon_destroy_ctx(damon_stat_context); } -static bool damon_stat_init_called; - static int damon_stat_enabled_store( const char *val, const struct kernel_param *kp) { @@ -229,7 +233,7 @@ static int damon_stat_enabled_store( if (is_enabled == enabled) return 0; - if (!damon_stat_init_called) + if (!damon_initialized()) /* * probably called from command line parsing (parse_args()). * Cannot call damon_new_ctx(). Let damon_stat_init() handle. @@ -250,12 +254,16 @@ static int __init damon_stat_init(void) { int err = 0; - damon_stat_init_called = true; + if (!damon_initialized()) { + err = -ENOMEM; + goto out; + } /* probably set via command line */ if (enabled) err = damon_stat_start(); +out: if (err && enabled) enabled = false; return err; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6d2b0dab50cb..2fc722f998f8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -834,6 +834,7 @@ static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = { struct damon_sysfs_context { struct kobject kobj; enum damon_ops_id ops_id; + unsigned long addr_unit; struct damon_sysfs_attrs *attrs; struct damon_sysfs_targets *targets; struct damon_sysfs_schemes *schemes; @@ -849,6 +850,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc( return NULL; context->kobj = (struct kobject){}; context->ops_id = ops_id; + context->addr_unit = 1; return context; } @@ -997,6 +999,32 @@ static ssize_t operations_store(struct kobject *kobj, return -EINVAL; } +static ssize_t addr_unit_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%lu\n", context->addr_unit); +} + +static ssize_t addr_unit_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + unsigned long input_addr_unit; + int err = kstrtoul(buf, 0, &input_addr_unit); + + if (err) + return err; + if (!input_addr_unit) + return -EINVAL; + + context->addr_unit = input_addr_unit; + return count; +} + static void damon_sysfs_context_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_context, kobj)); @@ -1008,9 +1036,13 @@ static struct kobj_attribute damon_sysfs_context_avail_operations_attr = static struct kobj_attribute damon_sysfs_context_operations_attr = __ATTR_RW_MODE(operations, 0600); +static struct kobj_attribute damon_sysfs_context_addr_unit_attr = + __ATTR_RW_MODE(addr_unit, 0600); + static struct attribute *damon_sysfs_context_attrs[] = { &damon_sysfs_context_avail_operations_attr.attr, &damon_sysfs_context_operations_attr.attr, + &damon_sysfs_context_addr_unit_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_context); @@ -1260,14 +1292,18 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); - struct damon_ctx *ctx = kdamond->damon_ctx; - bool running; + struct damon_ctx *ctx; + bool running = false; - if (!ctx) - running = false; - else + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + + ctx = kdamond->damon_ctx; + if (ctx) running = damon_is_running(ctx); + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%s\n", running ? 
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); @@ -1297,7 +1333,8 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, } static int damon_sysfs_set_regions(struct damon_target *t, - struct damon_sysfs_regions *sysfs_regions) + struct damon_sysfs_regions *sysfs_regions, + unsigned long min_sz_region) { struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); @@ -1319,7 +1356,7 @@ static int damon_sysfs_set_regions(struct damon_target *t, if (ranges[i - 1].end > ranges[i].start) goto out; } - err = damon_set_regions(t, ranges, sysfs_regions->nr); + err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region); out: kfree(ranges); return err; @@ -1340,7 +1377,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, /* caller will destroy targets */ return -EINVAL; } - return damon_sysfs_set_regions(t, sys_target->regions); + return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1397,6 +1434,11 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, err = damon_select_ops(ctx, sys_ctx->ops_id); if (err) return err; + ctx->addr_unit = sys_ctx->addr_unit; + /* addr_unit is respected by only DAMON_OPS_PADDR */ + if (sys_ctx->ops_id == DAMON_OPS_PADDR) + ctx->min_sz_region = max( + DAMON_MIN_REGION / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; @@ -1530,14 +1572,10 @@ static int damon_sysfs_repeat_call_fn(void *data) return 0; } -static struct damon_call_control damon_sysfs_repeat_call_control = { - .fn = damon_sysfs_repeat_call_fn, - .repeat = true, -}; - static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx; + struct damon_call_control *repeat_call_control; int err; if (damon_sysfs_kdamond_running(kdamond)) @@ -1550,18 +1588,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) damon_destroy_ctx(kdamond->damon_ctx); kdamond->damon_ctx = NULL; + repeat_call_control = kmalloc(sizeof(*repeat_call_control), + GFP_KERNEL); + if (!repeat_call_control) + return -ENOMEM; + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + kfree(repeat_call_control); return PTR_ERR(ctx); + } err = damon_start(&ctx, 1, false); if (err) { + kfree(repeat_call_control); damon_destroy_ctx(ctx); return err; } kdamond->damon_ctx = ctx; - damon_sysfs_repeat_call_control.data = kdamond; - damon_call(ctx, &damon_sysfs_repeat_call_control); + repeat_call_control->fn = damon_sysfs_repeat_call_fn; + repeat_call_control->data = kdamond; + repeat_call_control->repeat = true; + repeat_call_control->dealloc_on_cancel = true; + damon_call(ctx, repeat_call_control); return err; } @@ -1581,12 +1630,14 @@ static int damon_sysfs_damon_call(int (*fn)(void *data), struct damon_sysfs_kdamond *kdamond) { struct damon_call_control call_control = {}; + int err; if (!kdamond->damon_ctx) return -EINVAL; call_control.fn = fn; call_control.data = kdamond; - return damon_call(kdamond->damon_ctx, &call_control); + err = damon_call(kdamond->damon_ctx, &call_control); + return err ? 
err : call_control.return_code; } struct damon_sysfs_schemes_walk_data { diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index dfedfff19940..51369e35298b 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); - damon_split_regions_of(t, 2); + damon_split_regions_of(t, 2, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); - damon_split_regions_of(t, 4); + damon_split_regions_of(t, 4, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); @@ -303,7 +303,7 @@ static void damon_test_set_regions(struct kunit *test) damon_add_region(r1, t); damon_add_region(r2, t); - damon_set_regions(t, &range, 1); + damon_set_regions(t, &range, 1, DAMON_MIN_REGION); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); damon_for_each_region(r, t) { @@ -419,6 +419,22 @@ static void damos_test_new_filter(struct kunit *test) damos_destroy_filter(filter); } +static void damos_test_commit_filter(struct kunit *test) +{ + struct damos_filter *src_filter = damos_new_filter( + DAMOS_FILTER_TYPE_ANON, true, true); + struct damos_filter *dst_filter = damos_new_filter( + DAMOS_FILTER_TYPE_ACTIVE, false, false); + + damos_commit_filter(dst_filter, src_filter); + KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type); + KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching); + KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow); + + damos_destroy_filter(src_filter); + damos_destroy_filter(dst_filter); +} + static void damos_test_filter_out(struct kunit *test) { struct damon_target *t; @@ -434,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -465,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ 
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); @@ -594,6 +615,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_set_attrs), KUNIT_CASE(damon_test_moving_sum), KUNIT_CASE(damos_test_new_filter), + KUNIT_CASE(damos_test_commit_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), KUNIT_CASE(damon_test_set_filters_default_reject), diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index d2b37ccf2cc0..fce38dd53cf8 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 87e825349bdf..8c048f9b129e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); } } @@ -890,6 +890,107 @@ free_lists: return applied * PAGE_SIZE; } +struct damos_va_stat_private { + struct damos *scheme; + unsigned long *sz_filter_passed; +}; + +static inline bool damos_va_invalid_folio(struct folio *folio, + struct damos *s) +{ + return !folio || folio == s->last_applied; +} + +static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct damos_va_stat_private *priv = walk->private; + struct damos *s = priv->scheme; + unsigned long *sz_filter_passed = priv->sz_filter_passed; + struct vm_area_struct *vma = walk->vma; + struct folio *folio; + spinlock_t *ptl; + pte_t *start_pte, *pte, ptent; + int nr; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t pmde; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + pmde = pmdp_get(pmd); + if (!pmd_present(pmde)) + goto huge_unlock; + + folio = vm_normal_folio_pmd(vma, addr, pmde); + + if (damos_va_invalid_folio(folio, s)) + goto huge_unlock; + + if (!damos_va_filter_out(s, folio, vma, addr, NULL, pmd)) + *sz_filter_passed += folio_size(folio); + s->last_applied = folio; + +huge_unlock: + spin_unlock(ptl); + return 0; + } +#endif + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!start_pte) + return 0; + + for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; + ptent = ptep_get(pte); + + if (pte_none(ptent) || !pte_present(ptent)) + continue; + + folio = vm_normal_folio(vma, addr, ptent); + + if (damos_va_invalid_folio(folio, s)) + continue; + + if (!damos_va_filter_out(s, folio, vma, addr, pte, NULL)) + *sz_filter_passed += folio_size(folio); + nr = folio_nr_pages(folio); + s->last_applied = folio; + } + pte_unmap_unlock(start_pte, ptl); + return 0; +} + +static unsigned long damos_va_stat(struct damon_target *target, + struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + struct damos_va_stat_private priv; + struct mm_struct *mm; + struct mm_walk_ops walk_ops = { + .pmd_entry = damos_va_stat_pmd_entry, + .walk_lock = PGWALK_RDLOCK, + }; + + priv.scheme = s; + priv.sz_filter_passed = sz_filter_passed; + + if (!damos_ops_has_filter(s)) + return 0; + + mm = 
damon_get_mm(target); + if (!mm) + return 0; + + mmap_read_lock(mm); + walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); + mmap_read_unlock(mm); + mmput(mm); + return 0; +} + static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) @@ -916,7 +1017,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_MIGRATE_COLD: return damos_va_migrate(t, r, scheme, sz_filter_passed); case DAMOS_STAT: - return 0; + return damos_va_stat(t, r, scheme, sz_filter_passed); default: /* * DAMOS actions that are not yet supported by 'vaddr'. diff --git a/mm/debug.c b/mm/debug.c index b4388f4dcd4d..64ddb0c4b4be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm) "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %px flags %lx\n" + "binfmt %px flags %*pb\n" #ifdef CONFIG_AIO "ioctx_table %px\n" #endif @@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm) mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, - mm->binfmt, mm->flags, + mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm), #ifdef CONFIG_AIO mm->ioctx_table, #endif diff --git a/mm/execmem.c b/mm/execmem.c index 0822305413ec..810a4ba9c924 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -38,9 +38,6 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size, if (kasan) vm_flags |= VM_DEFER_KMEMLEAK; - if (vm_flags & VM_ALLOW_HUGE_VMAP) - align = PMD_SIZE; - p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..a52dd38d2b4a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } + if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, -nr); /* * At this point folio must be either written or cleaned by @@ -960,8 +963,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, { void *shadow = NULL; int ret; + struct mem_cgroup *tmp; + bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags); + if (kernel_file) + tmp = set_active_memcg(root_mem_cgroup); ret = mem_cgroup_charge(folio, NULL, gfp); + if (kernel_file) + set_active_memcg(tmp); if (ret) return ret; @@ -983,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); + if (kernel_file) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, + folio_nr_pages(folio)); } return ret; } @@ -1140,10 +1153,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { - if (test_bit(key->bit_nr, &key->folio->flags)) + if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { - if (test_and_set_bit(key->bit_nr, &key->folio->flags)) + if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } @@ -1226,9 +1239,9 @@ static 
inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(bit_nr, &folio->flags)) + if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; - } else if (test_bit(bit_nr, &folio->flags)) + } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; @@ -1961,7 +1974,7 @@ no_page: gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; - gfp |= GFP_NOWAIT | __GFP_NOWARN; + gfp |= GFP_NOWAIT; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; @@ -2447,6 +2460,9 @@ static bool filemap_range_uptodate(struct address_space *mapping, pos -= folio_pos(folio); } + if (pos == 0 && count >= folio_size(folio)) + return false; + return mapping->a_ops->is_partially_uptodate(folio, pos, count); } @@ -2584,8 +2600,9 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, unsigned int flags; int err = 0; - /* "last_index" is the index of the page beyond the end of the read */ - last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); + /* "last_index" is the index of the folio beyond the end of the read */ + last_index = round_up(iocb->ki_pos + count, + mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT; retry: if (fatal_signal_pending(current)) return -EINTR; @@ -2619,9 +2636,10 @@ retry: goto err; } if (!folio_test_uptodate(folio)) { - if ((iocb->ki_flags & IOCB_WAITQ) && - folio_batch_count(fbatch) > 1) - iocb->ki_flags |= IOCB_NOWAIT; + if (folio_batch_count(fbatch) > 1) { + err = -EAGAIN; + goto err; + } err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) @@ -3323,9 +3341,17 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; - mmap_miss = READ_ONCE(ra->mmap_miss); - if (mmap_miss) - WRITE_ONCE(ra->mmap_miss, --mmap_miss); + /* + * If the folio is locked, we're likely racing against another fault. + * Don't touch the mmap_miss counter to avoid decreasing it multiple + * times for a single folio and break the balance with mmap_miss + * increase in do_sync_mmap_readahead(). + */ + if (likely(!folio_test_locked(folio))) { + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss) + WRITE_ONCE(ra->mmap_miss, --mmap_miss); + } if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); @@ -3639,10 +3665,26 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, unsigned long addr, unsigned int nr_pages, unsigned long *rss, unsigned short *mmap_miss) { + unsigned int ref_from_caller = 1; vm_fault_t ret = 0; struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; + unsigned long addr0; + + /* + * Map the large folio fully where possible. + * + * The folio must not cross VMA or page table boundary. 
+ */ + addr0 = addr - start * PAGE_SIZE; + if (folio_within_vma(folio, vmf->vma) && + (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { + vmf->pte -= start; + page -= start; + addr = addr0; + nr_pages = folio_nr_pages(folio); + } do { if (PageHWPoison(page + count)) @@ -3672,7 +3714,8 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; - folio_ref_add(folio, count); + folio_ref_add(folio, count - ref_from_caller); + ref_from_caller = 0; if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } @@ -3687,12 +3730,16 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; - folio_ref_add(folio, count); + folio_ref_add(folio, count - ref_from_caller); + ref_from_caller = 0; if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } vmf->pte = old_ptep; + if (ref_from_caller) + /* Locked folios cannot get truncated. */ + folio_ref_dec(folio); return ret; } @@ -3705,7 +3752,7 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct page *page = &folio->page; if (PageHWPoison(page)) - return ret; + goto out; /* See comment of filemap_map_folio_range() */ if (!folio_test_workingset(folio)) @@ -3717,15 +3764,18 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, * the fault-around logic. */ if (!pte_none(ptep_get(vmf->pte))) - return ret; + goto out; if (vmf->address == addr) ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); (*rss)++; - folio_ref_inc(folio); + return ret; +out: + /* Locked folios cannot get truncated. */ + folio_ref_dec(folio); return ret; } @@ -3785,7 +3835,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, nr_pages, &rss, &mmap_miss); folio_unlock(folio); - folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -4491,7 +4540,7 @@ static void filemap_cachestat(struct address_space *mapping, * invalidation, so there might not be * a shadow in the swapcache (yet). */ - shadow = get_shadow_from_swap_cache(swp); + shadow = swap_cache_get_shadow(swp); if (!shadow) goto resched; } @@ -28,11 +28,6 @@ #include "internal.h" #include "swap.h" -struct follow_page_context { - struct dev_pagemap *pgmap; - unsigned int page_mask; -}; - static inline void sanity_check_pinned_pages(struct page **pages, unsigned long npages) { @@ -148,7 +143,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio))) return -EREMOTEIO; if (flags & FOLL_GET) @@ -237,7 +232,7 @@ void folio_add_pin(struct folio *folio) static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { - struct page *next = nth_page(start, i); + struct page *next = start + i; struct folio *folio = page_folio(next); unsigned int nr = 1; @@ -342,6 +337,10 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * + * The page range must be truly physically contiguous: the page range + * corresponds to a contiguous PFN range and all pages can be iterated + * naturally. + * * For the page ranges defined by [page .. 
page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. @@ -359,6 +358,8 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, struct folio *folio; unsigned int nr; + VM_WARN_ON_ONCE(!page_range_contiguous(page, npages)); + for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { @@ -475,29 +476,15 @@ EXPORT_SYMBOL_GPL(unpin_folios); * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ -static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +static inline void mm_set_has_pinned_flag(struct mm_struct *mm) { - if (!test_bit(MMF_HAS_PINNED, mm_flags)) - set_bit(MMF_HAS_PINNED, mm_flags); + if (!mm_flags_test(MMF_HAS_PINNED, mm)) + mm_flags_set(MMF_HAS_PINNED, mm); } #ifdef CONFIG_MMU #ifdef CONFIG_HAVE_GUP_FAST -static int record_subpages(struct page *page, unsigned long sz, - unsigned long addr, unsigned long end, - struct page **pages) -{ - struct page *start_page; - int nr; - - start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT); - for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) - pages[nr] = nth_page(start_page, nr); - - return nr; -} - /** * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. * @page: pointer to page to be grabbed @@ -661,7 +648,7 @@ static inline bool can_follow_write_pud(pud_t pud, struct page *page, static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, - int flags, struct follow_page_context *ctx) + int flags, unsigned long *page_mask) { struct mm_struct *mm = vma->vm_mm; struct page *page; @@ -688,7 +675,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, if (ret) page = ERR_PTR(ret); else - ctx->page_mask = HPAGE_PUD_NR - 1; + *page_mask = HPAGE_PUD_NR - 1; return page; } @@ -714,7 +701,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { struct mm_struct *mm = vma->vm_mm; pmd_t pmdval = *pmd; @@ -751,7 +738,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - ctx->page_mask = HPAGE_PMD_NR - 1; + *page_mask = HPAGE_PMD_NR - 1; return page; } @@ -759,7 +746,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, - int flags, struct follow_page_context *ctx) + int flags, unsigned long *page_mask) { return NULL; } @@ -767,7 +754,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { return NULL; } @@ -813,8 +800,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, } static struct page *follow_page_pte(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags, - struct dev_pagemap **pgmap) + unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; 
struct folio *folio; @@ -912,7 +898,7 @@ no_page: static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pmd_t *pmd, pmdval; spinlock_t *ptl; @@ -926,7 +912,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); if (likely(!pmd_leaf(pmdval))) - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + return follow_page_pte(vma, address, pmd, flags); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags, address); @@ -939,16 +925,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, } if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + return follow_page_pte(vma, address, pmd, flags); } if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : - follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + follow_page_pte(vma, address, pmd, flags); } - page = follow_huge_pmd(vma, address, pmd, flags, ctx); + page = follow_huge_pmd(vma, address, pmd, flags, page_mask); spin_unlock(ptl); return page; } @@ -956,7 +942,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pud_t *pudp, pud; spinlock_t *ptl; @@ -969,7 +955,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, return no_page_table(vma, flags, address); if (pud_leaf(pud)) { ptl = pud_lock(mm, pudp); - page = follow_huge_pud(vma, address, pudp, flags, ctx); + page = follow_huge_pud(vma, address, pudp, flags, page_mask); spin_unlock(ptl); if (page) return page; @@ -978,13 +964,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, if (unlikely(pud_bad(pud))) return no_page_table(vma, flags, address); - return follow_pmd_mask(vma, address, pudp, flags, ctx); + return follow_pmd_mask(vma, address, pudp, flags, page_mask); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { p4d_t *p4dp, p4d; @@ -995,7 +981,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, if (!p4d_present(p4d) || p4d_bad(p4d)) return no_page_table(vma, flags, address); - return follow_pud_mask(vma, address, p4dp, flags, ctx); + return follow_pud_mask(vma, address, p4dp, flags, page_mask); } /** @@ -1003,20 +989,16 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour - * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a - * pointer to output page_mask + * @page_mask: a pointer to output page_mask * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * - * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches - * the device's dev_pagemap metadata to avoid repeating expensive lookups. - * * When getting an anonymous page and the caller has to trigger unsharing * of a shared anonymous page first, -EMLINK is returned. 
The caller should * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only * relevant with FOLL_PIN and !FOLL_WRITE. * - * On output, the @ctx->page_mask is set according to the size of the page. + * On output, @page_mask is set according to the size of the page. * * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented @@ -1024,7 +1006,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, */ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; @@ -1032,13 +1014,13 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, vma_pgtable_walk_begin(vma); - ctx->page_mask = 0; + *page_mask = 0; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) page = no_page_table(vma, flags, address); else - page = follow_p4d_mask(vma, address, pgd, flags, ctx); + page = follow_p4d_mask(vma, address, pgd, flags, page_mask); vma_pgtable_walk_end(vma); @@ -1376,7 +1358,7 @@ static long __get_user_pages(struct mm_struct *mm, { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; - struct follow_page_context ctx = { NULL }; + unsigned long page_mask = 0; if (!nr_pages) return 0; @@ -1418,7 +1400,7 @@ static long __get_user_pages(struct mm_struct *mm, pages ? &page : NULL); if (ret) goto out; - ctx.page_mask = 0; + page_mask = 0; goto next_page; } @@ -1441,7 +1423,7 @@ retry: } cond_resched(); - page = follow_page_mask(vma, start, gup_flags, &ctx); + page = follow_page_mask(vma, start, gup_flags, &page_mask); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, gup_flags, PTR_ERR(page) == -EMLINK, locked); @@ -1474,7 +1456,7 @@ retry: goto out; } next_page: - page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); if (page_increm > nr_pages) page_increm = nr_pages; @@ -1512,7 +1494,7 @@ next_page: } for (j = 0; j < page_increm; j++) { - subpage = nth_page(page, j); + subpage = page + j; pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); @@ -1524,8 +1506,6 @@ next_page: nr_pages -= page_increm; } while (nr_pages); out: - if (ctx.pgmap) - put_dev_pagemap(ctx.pgmap); return i ? i : ret; } @@ -1693,7 +1673,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, mmap_assert_locked(mm); if (flags & FOLL_PIN) - mm_set_has_pinned_flag(&mm->flags); + mm_set_has_pinned_flag(mm); /* * FOLL_PIN and FOLL_GET are mutually exclusive. 
Traditional behavior @@ -2287,8 +2267,8 @@ static unsigned long collect_longterm_unpinnable_folios( struct pages_or_folios *pofs) { unsigned long collected = 0; - bool drain_allow = true; struct folio *folio; + int drained = 0; long i = 0; for (folio = pofs_get_folio(pofs, i); folio; @@ -2307,9 +2287,17 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (!folio_test_lru(folio) && drain_allow) { + if (drained == 0 && folio_may_be_lru_cached(folio) && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { + lru_add_drain(); + drained = 1; + } + if (drained == 1 && folio_may_be_lru_cached(folio) && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); - drain_allow = false; + drained = 2; } if (!folio_isolate_lru(folio)) @@ -2853,7 +2841,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { - struct dev_pagemap *pgmap = NULL; int ret = 0; pte_t *ptep, *ptem; @@ -2911,12 +2898,9 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, * see Documentation/core-api/pin_user_pages.rst for * details. */ - if (flags & FOLL_PIN) { - ret = arch_make_folio_accessible(folio); - if (ret) { - gup_put_folio(folio, 1, flags); - goto pte_unmap; - } + if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) { + gup_put_folio(folio, 1, flags); + goto pte_unmap; } folio_set_referenced(folio); pages[*nr] = page; @@ -2926,8 +2910,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, ret = 1; pte_unmap: - if (pgmap) - put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } @@ -2964,8 +2946,8 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (pmd_special(orig)) return 0; - page = pmd_page(orig); - refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); + refs = (end - addr) >> PAGE_SHIFT; + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) @@ -2985,7 +2967,10 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + pages += *nr; *nr += refs; + for (; refs; refs--) + *(pages++) = page++; folio_set_referenced(folio); return 1; } @@ -3004,8 +2989,8 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (pud_special(orig)) return 0; - page = pud_page(orig); - refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); + refs = (end - addr) >> PAGE_SHIFT; + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) @@ -3026,7 +3011,10 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } + pages += *nr; *nr += refs; + for (; refs; refs--) + *(pages++) = page++; folio_set_referenced(folio); return 1; } @@ -3210,7 +3198,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, return -EINVAL; if (gup_flags & FOLL_PIN) - mm_set_has_pinned_flag(&current->mm->flags); + mm_set_has_pinned_flag(current->mm); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); diff --git a/mm/highmem.c b/mm/highmem.c index ef3189b36cad..b5c8e4c2d5d4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -61,7 +61,7 @@ static inline int kmap_local_calc_idx(int idx) /* * Determine color of virtual address where the page should be mapped. 
*/ -static inline unsigned int get_pkmap_color(struct page *page) +static inline unsigned int get_pkmap_color(const struct page *page) { return 0; } @@ -334,7 +334,7 @@ EXPORT_SYMBOL(kmap_high); * * This can be called from any context. */ -void *kmap_high_get(struct page *page) +void *kmap_high_get(const struct page *page) { unsigned long vaddr, flags; @@ -356,7 +356,7 @@ void *kmap_high_get(struct page *page) * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called * only from user context. */ -void kunmap_high(struct page *page) +void kunmap_high(const struct page *page) { unsigned long vaddr; unsigned long nr; @@ -508,7 +508,7 @@ static inline void kmap_local_idx_pop(void) #endif #ifndef arch_kmap_local_high_get -static inline void *arch_kmap_local_high_get(struct page *page) +static inline void *arch_kmap_local_high_get(const struct page *page) { return NULL; } @@ -572,7 +572,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) } EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +void *__kmap_local_page_prot(const struct page *page, pgprot_t prot) { void *kmap; @@ -326,6 +326,68 @@ fault: return hmm_vma_fault(addr, end, required_fault, walk); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, + unsigned long end, unsigned long *hmm_pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long npages = (end - start) >> PAGE_SHIFT; + unsigned long addr = start; + swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned int required_fault; + + if (is_device_private_entry(entry) && + pfn_swap_entry_folio(entry)->pgmap->owner == + range->dev_private_owner) { + unsigned long cpu_flags = HMM_PFN_VALID | + hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); + unsigned long pfn = swp_offset_pfn(entry); + unsigned long i; + + if (is_writable_device_private_entry(entry)) + cpu_flags |= HMM_PFN_WRITE; + + /* + * Fully populate the PFN list though subsequent PFNs could be + * inferred, because drivers which are not yet aware of large + * folios probably do not support sparsely populated PFN lists. 
+ */ + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } + + return 0; + } + + required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, + npages, 0); + if (required_fault) { + if (is_device_private_entry(entry)) + return hmm_vma_fault(addr, end, required_fault, walk); + else + return -EFAULT; + } + + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); +} +#else +static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, + unsigned long end, unsigned long *hmm_pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long npages = (end - start) >> PAGE_SHIFT; + + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) + return -EFAULT; + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); +} +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ + static int hmm_vma_walk_pmd(pmd_t *pmdp, unsigned long start, unsigned long end, @@ -354,11 +416,9 @@ again: return hmm_pfns_fill(start, end, range, 0); } - if (!pmd_present(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) - return -EFAULT; - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); - } + if (!pmd_present(pmd)) + return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns, + pmd); if (pmd_trans_huge(pmd)) { /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c38a95e9f09..5acca24bbabb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - bool smaps = tva_flags & TVA_SMAPS; - bool in_pf = tva_flags & TVA_IN_PF; - bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + const bool smaps = type == TVA_SMAPS; + const bool in_pf = type == TVA_PAGEFAULT; + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ @@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ @@ -167,14 +167,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, 0, - !enforce_sysfs); + forced_collapse); if (!vma_is_anonymous(vma)) { /* - * Enforce sysfs THP requirements as necessary. Anonymous vmas + * Enforce THP collapse requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders(). */ - if (enforce_sysfs && + if (!forced_collapse && (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && !hugepage_global_always()))) return 0; @@ -207,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, return orders; } -static bool get_huge_zero_page(void) +static bool get_huge_zero_folio(void) { struct folio *zero_folio; retry: @@ -237,7 +237,7 @@ retry: return true; } -static void put_huge_zero_page(void) +static void put_huge_zero_folio(void) { /* * Counter should never go to zero here. 
Only shrinker can put @@ -248,33 +248,39 @@ static void put_huge_zero_page(void) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return huge_zero_folio; + + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) return READ_ONCE(huge_zero_folio); - if (!get_huge_zero_page()) + if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) - put_huge_zero_page(); + if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm)) + put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); } void mm_put_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) - put_huge_zero_page(); + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return; + + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) + put_huge_zero_folio(); } -static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink, + struct shrink_control *sc) { /* we can free zero page only if last reference remains */ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; } -static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink, + struct shrink_control *sc) { if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct folio *zero_folio = xchg(&huge_zero_folio, NULL); @@ -287,7 +293,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, return 0; } -static struct shrinker *huge_zero_page_shrinker; +static struct shrinker *huge_zero_folio_shrinker; #ifdef CONFIG_SYSFS static ssize_t enabled_show(struct kobject *kobj, @@ -849,33 +855,47 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) static int __init thp_shrinker_init(void) { - huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero"); - if (!huge_zero_page_shrinker) - return -ENOMEM; - deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB, "thp-deferred_split"); - if (!deferred_split_shrinker) { - shrinker_free(huge_zero_page_shrinker); + if (!deferred_split_shrinker) return -ENOMEM; - } - - huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count; - huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; - shrinker_register(huge_zero_page_shrinker); deferred_split_shrinker->count_objects = deferred_split_count; deferred_split_shrinker->scan_objects = deferred_split_scan; shrinker_register(deferred_split_shrinker); + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { + /* + * Bump the reference of the huge_zero_folio and do not + * initialize the shrinker. + * + * huge_zero_folio will always be NULL on failure. We assume + * that get_huge_zero_folio() will most likely not fail as + * thp_shrinker_init() is invoked early on during boot. 
+ */ + if (!get_huge_zero_folio()) + pr_warn("Allocating persistent huge zero folio failed\n"); + return 0; + } + + huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); + if (!huge_zero_folio_shrinker) { + shrinker_free(deferred_split_shrinker); + return -ENOMEM; + } + + huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count; + huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; + shrinker_register(huge_zero_folio_shrinker); + return 0; } static void __init thp_shrinker_exit(void) { - shrinker_free(huge_zero_page_shrinker); + shrinker_free(huge_zero_folio_shrinker); shrinker_free(deferred_split_shrinker); } @@ -911,7 +931,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < MB_TO_PAGES(512)) { transparent_hugepage_flags = 0; return 0; } @@ -1125,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (test_bit(MMF_TOPDOWN, ¤t->mm->flags) && !off_sub) + if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub) return ret + size; ret += off_sub; @@ -1309,6 +1329,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, { pmd_t entry; entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); + entry = pmd_mkspecial(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); @@ -1379,15 +1400,25 @@ struct folio_or_pfn { bool is_folio; }; -static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot, - bool write, pgtable_t pgtable) + bool write) { struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable = NULL; + spinlock_t *ptl; pmd_t entry; - lockdep_assert_held(pmd_lockptr(mm, pmd)); + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + ptl = pmd_lock(mm, pmd); if (!pmd_none(*pmd)) { const unsigned long pfn = fop.is_folio ? 
folio_pfn(fop.folio) : fop.pfn; @@ -1395,23 +1426,26 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, if (write) { if (pmd_pfn(*pmd) != pfn) { WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); - return -EEXIST; + goto out_unlock; } entry = pmd_mkyoung(*pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - - return -EEXIST; + goto out_unlock; } if (fop.is_folio) { entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); - folio_get(fop.folio); - folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); - add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); + if (is_huge_zero_folio(fop.folio)) { + entry = pmd_mkspecial(entry); + } else { + folio_get(fop.folio); + folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); + add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); + } } else { entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot)); entry = pmd_mkspecial(entry); @@ -1424,11 +1458,17 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); + pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); - return 0; + +out_unlock: + spin_unlock(ptl); + if (pgtable) + pte_free(mm, pgtable); + return VM_FAULT_NOPAGE; } /** @@ -1450,9 +1490,6 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, struct folio_or_pfn fop = { .pfn = pfn, }; - pgtable_t pgtable = NULL; - spinlock_t *ptl; - int error; /* * If we had pmd_special, we could avoid all these restrictions, @@ -1464,25 +1501,9 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; - - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } - pfnmap_setup_cachemode_pfn(pfn, &pgprot); - ptl = pmd_lock(vma->vm_mm, vmf->pmd); - error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write, - pgtable); - spin_unlock(ptl); - if (error && pgtable) - pte_free(vma->vm_mm, pgtable); - - return VM_FAULT_NOPAGE; + return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write); } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); @@ -1491,35 +1512,15 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PMD_MASK; - struct mm_struct *mm = vma->vm_mm; struct folio_or_pfn fop = { .folio = folio, .is_folio = true, }; - spinlock_t *ptl; - pgtable_t pgtable = NULL; - int error; - - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER)) return VM_FAULT_SIGBUS; - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } - - ptl = pmd_lock(mm, vmf->pmd); - error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, - write, pgtable); - spin_unlock(ptl); - if (error && pgtable) - pte_free(mm, pgtable); - - return VM_FAULT_NOPAGE; + return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write); } EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd); @@ -1531,25 +1532,30 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) return pud; } -static void insert_pud(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t 
insert_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pud_t entry; + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : fop.pfn; if (write) { if (WARN_ON_ONCE(pud_pfn(*pud) != pfn)) - return; + goto out_unlock; entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) update_mmu_cache_pud(vma, addr, pud); } - return; + goto out_unlock; } if (fop.is_folio) { @@ -1568,6 +1574,9 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr, } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); +out_unlock: + spin_unlock(ptl); + return VM_FAULT_NOPAGE; } /** @@ -1589,7 +1598,6 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, struct folio_or_pfn fop = { .pfn = pfn, }; - spinlock_t *ptl; /* * If we had pud_special, we could avoid all these restrictions, @@ -1601,16 +1609,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; - pfnmap_setup_cachemode_pfn(pfn, &pgprot); - ptl = pud_lock(vma->vm_mm, vmf->pud); - insert_pud(vma, addr, vmf->pud, fop, pgprot, write); - spin_unlock(ptl); - - return VM_FAULT_NOPAGE; + return insert_pud(vma, addr, vmf->pud, fop, pgprot, write); } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); @@ -1627,25 +1628,15 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PUD_MASK; - pud_t *pud = vmf->pud; - struct mm_struct *mm = vma->vm_mm; struct folio_or_pfn fop = { .folio = folio, .is_folio = true, }; - spinlock_t *ptl; - - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER)) return VM_FAULT_SIGBUS; - ptl = pud_lock(mm, pud); - insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); - spin_unlock(ptl); - - return VM_FAULT_NOPAGE; + return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); } EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -1675,7 +1666,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, int ret = -ENOMEM; pmd = pmdp_get_lockless(src_pmd); - if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { + if (unlikely(pmd_present(pmd) && pmd_special(pmd) && + !is_huge_zero_pmd(pmd))) { dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -3310,8 +3302,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here. 
*/ - new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - new_folio->flags |= (folio->flags & + new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; + new_folio->flags.f |= (folio->flags.f & ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_swapcache) | @@ -3728,7 +3720,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - struct address_space *swap_cache = NULL; + struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; int expected_refs; @@ -3772,8 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto fail; } - swap_cache = swap_address_space(folio->swap); - xa_lock(&swap_cache->i_pages); + ci = swap_cluster_get_and_lock(folio); } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ @@ -3805,10 +3796,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * Anonymous folio with swap cache. * NOTE: shmem in swap cache is not supported yet. */ - if (swap_cache) { - __xa_store(&swap_cache->i_pages, - swap_cache_index(new_folio->swap), - new_folio, 0); + if (ci) { + __swap_cache_replace_folio(ci, folio, new_folio); continue; } @@ -3843,8 +3832,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, unlock_page_lruvec(lruvec); - if (swap_cache) - xa_unlock(&swap_cache->i_pages); + if (ci) + swap_cluster_unlock(ci); } else { spin_unlock(&ds_queue->split_queue_lock); ret = -EAGAIN; @@ -4186,6 +4175,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, bool underused = false; if (!folio_test_partially_mapped(folio)) { + /* + * See try_to_map_unused_to_zeropage(): we cannot + * optimize zero-filled pages after splitting an + * mlocked folio. 
+ */ + if (folio_test_mlocked(folio)) + goto next; underused = thp_underused(folio); if (!underused) goto next; @@ -4327,8 +4323,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, goto out; } - pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", - pid, vaddr_start, vaddr_end); + pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n", + pid, vaddr_start, vaddr_end, new_order, in_folio_offset); mmap_read_lock(mm); /* @@ -4438,8 +4434,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, if (IS_ERR(candidate)) goto out; - pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", - file_path, off_start, off_end); + pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n", + file_path, off_start, off_end, new_order, in_folio_offset); mapping = candidate->f_mapping; min_order = mapping_min_folio_order(mapping); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 753f99b4c718..6cac826cb61f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1473,17 +1473,14 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #ifdef CONFIG_CONTIG_ALLOC -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; - int order = huge_page_order(h); bool retried = false; - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); retry: - folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask); + folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); if (!folio) { if (hugetlb_cma_exclusive_alloc()) return NULL; @@ -1506,16 +1503,16 @@ retry: } #else /* !CONFIG_CONTIG_ALLOC */ -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, + nodemask_t *nodemask) { return NULL; } #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, + nodemask_t *nodemask) { return NULL; } @@ -1890,14 +1887,14 @@ void free_huge_folio(struct folio *folio) /* * Must be called with the hugetlb lock held */ -static void __prep_account_new_huge_page(struct hstate *h, int nid) +static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio) { lockdep_assert_held(&hugetlb_lock); h->nr_huge_pages++; - h->nr_huge_pages_node[nid]++; + h->nr_huge_pages_node[folio_nid(folio)]++; } -static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) +static void init_new_hugetlb_folio(struct folio *folio) { __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); @@ -1906,20 +1903,6 @@ static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) set_hugetlb_cgroup_rsvd(folio, NULL); } -static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) -{ - init_new_hugetlb_folio(h, folio); - hugetlb_vmemmap_optimize_folio(h, folio); -} - -static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) -{ - __prep_new_hugetlb_folio(h, folio); - spin_lock_irq(&hugetlb_lock); - __prep_account_new_huge_page(h, nid); - spin_unlock_irq(&hugetlb_lock); -} - /* * Find and lock 
address space (mapping) in write mode. * @@ -1940,11 +1923,9 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) return NULL; } -static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) +static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { - int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; @@ -1959,8 +1940,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, alloc_try_hard = false; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); @@ -1994,36 +1973,36 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, nodemask_t *node_alloc_noretry) { struct folio *folio; + int order = huge_page_order(h); - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + + if (order_is_gigantic(order)) + folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask); else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); + folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask, + node_alloc_noretry); if (folio) - init_new_hugetlb_folio(h, folio); + init_new_hugetlb_folio(folio); return folio; } /* - * Common helper to allocate a fresh hugetlb page. All specific allocators - * should use this function to get new hugetlb pages + * Common helper to allocate a fresh hugetlb folio. All specific allocators + * should use this function to get new hugetlb folio * - * Note that returned page is 'frozen': ref count of head page and all tail - * pages is zero. + * Note that returned folio is 'frozen': ref count of head page and all tail + * pages is zero, and the accounting must be done in the caller. */ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); - else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); - if (!folio) - return NULL; - - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (folio) + hugetlb_vmemmap_optimize_folio(h, folio); return folio; } @@ -2039,7 +2018,7 @@ static void prep_and_add_allocated_folios(struct hstate *h, /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { - __prep_account_new_huge_page(h, folio_nid(folio)); + account_new_hugetlb_folio(h, folio); enqueue_hugetlb_folio(h, folio); } spin_unlock_irqrestore(&hugetlb_lock, flags); @@ -2241,19 +2220,17 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, goto out_unlock; spin_unlock_irq(&hugetlb_lock); - folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; - hugetlb_vmemmap_optimize_folio(h, folio); - spin_lock_irq(&hugetlb_lock); /* * nr_huge_pages needs to be adjusted within the same lock cycle * as surplus_pages, otherwise it might confuse * persistent_huge_pages() momentarily. 
*/ - __prep_account_new_huge_page(h, folio_nid(folio)); + account_new_hugetlb_folio(h, folio); /* * We could have raced with the pool size change. @@ -2290,6 +2267,10 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas if (!folio) return NULL; + spin_lock_irq(&hugetlb_lock); + account_new_hugetlb_folio(h, folio); + spin_unlock_irq(&hugetlb_lock); + /* fresh huge pages are frozen */ folio_ref_unfreeze(folio, 1); /* @@ -2836,18 +2817,17 @@ retry: if (!new_folio) { spin_unlock_irq(&hugetlb_lock); gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, - NULL, NULL); + new_folio = alloc_fresh_hugetlb_folio(h, gfp_mask, + nid, NULL); if (!new_folio) return -ENOMEM; - __prep_new_hugetlb_folio(h, new_folio); goto retry; } /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be - * incremented again when calling __prep_account_new_huge_page() + * incremented again when calling account_new_hugetlb_folio() * and enqueue_hugetlb_folio() for new_folio. The counters will * remain stable since this happens under the lock. */ @@ -2857,7 +2837,7 @@ retry: * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ - __prep_account_new_huge_page(h, nid); + account_new_hugetlb_folio(h, new_folio); enqueue_hugetlb_folio(h, new_folio); /* @@ -2890,7 +2870,7 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ - if (folio_order(folio) > MAX_PAGE_ORDER) + if (order_is_gigantic(folio_order(folio))) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) @@ -3237,17 +3217,18 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, { enum zone_type zone = zone_idx(folio_zone(folio)); int nid = folio_nid(folio); + struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; - int ret; - - for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { - struct page *page = pfn_to_page(pfn); + /* + * As we marked all tail pages with memblock_reserved_mark_noinit(), + * we must initialize them ourselves here. + */ + for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) { __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); - ret = page_ref_freeze(page, 1); - VM_BUG_ON(!ret); + set_page_count(page, 0); } } @@ -3257,12 +3238,15 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, { int ret; - /* Prepare folio head */ + /* + * This is an open-coded prep_compound_page() whereby we avoid + * walking pages twice by initializing/preparing+freezing them in the + * same go. 
+ */ __folio_clear_reserved(folio); __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); - /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); prep_compound_head((struct page *)folio, huge_page_order(h)); } @@ -3327,7 +3311,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, hugetlb_bootmem_init_migratetype(folio, h); /* Subdivide locks to achieve better parallel performance */ spin_lock_irqsave(&hugetlb_lock, flags); - __prep_account_new_huge_page(h, folio_nid(folio)); + account_new_hugetlb_folio(h, folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } @@ -3423,7 +3407,7 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid) hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - init_new_hugetlb_folio(h, folio); + init_new_hugetlb_folio(folio); if (hugetlb_bootmem_page_prehvo(m)) /* @@ -3554,7 +3538,14 @@ static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned l nodes_clear(node_alloc_noretry); for (i = 0; i < num; ++i) { - struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + struct folio *folio; + + if (hugetlb_vmemmap_optimizable_size(h) && + (si_mem_available() == 0) && !list_empty(&folio_list)) { + prep_and_add_allocated_folios(h, &folio_list); + INIT_LIST_HEAD(&folio_list); + } + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], &node_alloc_noretry, &next_node); if (!folio) break; @@ -3589,10 +3580,9 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) unsigned long jiffies_start; unsigned long jiffies_end; + unsigned long remaining; job.thread_fn = hugetlb_pages_alloc_boot_node; - job.start = 0; - job.size = h->max_huge_pages; /* * job.max_threads is 25% of the available cpu threads by default. @@ -3616,10 +3606,29 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) } job.max_threads = hugepage_allocation_threads; - job.min_chunk = h->max_huge_pages / hugepage_allocation_threads; jiffies_start = jiffies; - padata_do_multithreaded(&job); + do { + remaining = h->max_huge_pages - h->nr_huge_pages; + + job.start = h->nr_huge_pages; + job.size = remaining; + job.min_chunk = remaining / hugepage_allocation_threads; + padata_do_multithreaded(&job); + + if (h->nr_huge_pages == h->max_huge_pages) + break; + + /* + * Retry only if the vmemmap optimization might have been able to free + * some memory back to the system. 
+ */ + if (!hugetlb_vmemmap_optimizable(h)) + break; + + /* Continue if progress was made in last iteration */ + } while (remaining != (h->max_huge_pages - h->nr_huge_pages)); + jiffies_end = jiffies; pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n", @@ -3654,6 +3663,9 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; } + if (!h->max_huge_pages) + return; + /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; @@ -4035,7 +4047,7 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, prep_compound_page(page, dst->order); new_folio->mapping = NULL; - init_new_hugetlb_folio(dst, new_folio); + init_new_hugetlb_folio(new_folio); /* Copy the CMA flag so that it is freed correctly */ if (cma) folio_set_hugetlb_cma(new_folio); @@ -4654,6 +4666,7 @@ static int __init hugetlb_init(void) BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < __NR_HPAGEFLAGS); + BUILD_BUG_ON_INVALID(HUGETLB_PAGE_ORDER > MAX_FOLIO_ORDER); if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) @@ -4737,6 +4750,7 @@ void __init hugetlb_add_hstate(unsigned int order) } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); + WARN_ON(order > MAX_FOLIO_ORDER); h = &hstates[hugetlb_max_hstate++]; __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; @@ -5594,18 +5608,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } - /* - * If the pagetables are shared don't copy or take references. - * - * dst_pte == src_pte is the common case of src/dest sharing. - * However, src could have 'unshared' and dst shares with - * another vma. So page_count of ptep page is checked instead - * to reliably determine whether pte is shared. - */ - if (page_count(virt_to_page(dst_pte)) > 1) { +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + /* If the pagetables are shared, there is nothing to do */ + if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) { addr |= last_addr_mask; continue; } +#endif dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -5851,7 +5860,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, spinlock_t *ptl; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - bool adjust_reservation = false; + bool adjust_reservation; unsigned long last_addr_mask; bool force_flush = false; @@ -5944,6 +5953,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(folio); + spin_unlock(ptl); /* * Restore the reservation for anonymous page, otherwise the @@ -5951,14 +5961,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * If there we are freeing a surplus, do not set the restore * reservation bit. 
*/ + adjust_reservation = false; + + spin_lock_irq(&hugetlb_lock); if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(folio)) { folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } - - spin_unlock(ptl); + spin_unlock_irq(&hugetlb_lock); /* * Adjust the reservation for the region that will have the @@ -6929,6 +6941,11 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { + pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE); + if (actual_pte) { + ret = -EEXIST; + goto out; + } ret = -ENOMEM; goto out; } @@ -7599,7 +7616,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, hugetlb_vma_assert_locked(vma); if (sz != PMD_SIZE) return 0; - if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) + if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index f58ef4969e7a..e8e4dc7182d5 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -26,11 +26,10 @@ void hugetlb_cma_free_folio(struct folio *folio) } -struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, +struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { int node; - int order = huge_page_order(h); struct folio *folio = NULL; if (hugetlb_cma[nid]) diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h index f7d7fb9880a2..2c2ec8a7e134 100644 --- a/mm/hugetlb_cma.h +++ b/mm/hugetlb_cma.h @@ -4,7 +4,7 @@ #ifdef CONFIG_CMA void hugetlb_cma_free_folio(struct folio *folio); -struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, +struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask); struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact); @@ -18,8 +18,8 @@ static inline void hugetlb_cma_free_folio(struct folio *folio) { } -static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nodemask) +static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 7ecaa1900137..a11222572f97 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -7,8 +7,96 @@ #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/hugetlb.h> +#include <linux/page-flags.h> +#include <linux/memcontrol.h> #include "internal.h" +static u32 hwpoison_filter_enable; +static u32 hwpoison_filter_dev_major = ~0U; +static u32 hwpoison_filter_dev_minor = ~0U; +static u64 hwpoison_filter_flags_mask; +static u64 hwpoison_filter_flags_value; + +static int hwpoison_filter_dev(struct page *p) +{ + struct folio *folio = page_folio(p); + struct address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + mapping = folio_mapping(folio); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == 
+ hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + +/* + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. + */ +#ifdef CONFIG_MEMCG +static u64 hwpoison_filter_memcg; +static int hwpoison_filter_task(struct page *p) +{ + if (!hwpoison_filter_memcg) + return 0; + + if (page_cgroup_ino(p) != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + +static int hwpoison_filter(struct page *p) +{ + if (!hwpoison_filter_enable) + return 0; + + if (hwpoison_filter_dev(p)) + return -EINVAL; + + if (hwpoison_filter_flags(p)) + return -EINVAL; + + if (hwpoison_filter_task(p)) + return -EINVAL; + + return 0; +} + static struct dentry *hwpoison_dir; static int hwpoison_inject(void *data, u64 val) @@ -67,6 +155,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void __exit pfn_inject_exit(void) { hwpoison_filter_enable = 0; + hwpoison_filter_unregister(); debugfs_remove_recursive(hwpoison_dir); } @@ -105,6 +194,8 @@ static int __init pfn_inject_init(void) &hwpoison_filter_memcg); #endif + hwpoison_filter_register(hwpoison_filter); + return 0; } diff --git a/mm/internal.h b/mm/internal.h index 45b725c3dc03..1561fc2ff5b8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -755,6 +755,7 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; + VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER); folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef NR_PAGES_IN_LARGE_FOLIO @@ -842,6 +843,10 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) +struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); +#define alloc_frozen_pages_nolock(...) 
\ + alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) + extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); @@ -961,8 +966,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); -extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, - unsigned long bytes); +bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, + unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the @@ -1227,14 +1232,10 @@ static inline bool node_reclaim_enabled(void) #ifdef CONFIG_MEMORY_FAILURE int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill); void shake_folio(struct folio *folio); -extern int hwpoison_filter(struct page *p); - -extern u32 hwpoison_filter_dev_major; -extern u32 hwpoison_filter_dev_minor; -extern u64 hwpoison_filter_flags_mask; -extern u64 hwpoison_filter_flags_value; -extern u64 hwpoison_filter_memcg; -extern u32 hwpoison_filter_enable; +typedef int hwpoison_filter_func_t(struct page *p); +void hwpoison_filter_register(hwpoison_filter_func_t *filter); +void hwpoison_filter_unregister(void); + #define MAGIC_HWPOISON 0x48575053U /* HWPS */ void SetPageHWPoisonTakenOff(struct page *page); void ClearPageHWPoisonTakenOff(struct page *page); @@ -1333,11 +1334,6 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; -static inline bool is_migrate_highatomic(enum migratetype migratetype) -{ - return migratetype == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..d4c14359feaf 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -32,6 +32,15 @@ #include "kasan.h" #include "../slab.h" +#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS) +/* + * Definition of the unified static key declared in kasan-enabled.h. + * This provides consistent runtime enable/disable across KASAN modes. + */ +DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); +EXPORT_SYMBOL_GPL(kasan_flag_enabled); +#endif + struct slab *kasan_addr_to_slab(const void *addr) { if (virt_addr_valid(addr)) @@ -246,15 +255,15 @@ static inline void poison_slab_object(struct kmem_cache *cache, void *object, bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, unsigned long ip) { - if (!kasan_arch_is_ready() || is_kfence_address(object)) + if (is_kfence_address(object)) return false; return check_slab_allocation(cache, object, ip); } bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, - bool still_accessible) + bool still_accessible, bool no_quarantine) { - if (!kasan_arch_is_ready() || is_kfence_address(object)) + if (is_kfence_address(object)) return false; /* @@ -274,6 +283,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, poison_slab_object(cache, object, init); + if (no_quarantine) + return false; + /* * If the object is put into quarantine, do not let slab put the object * onto the freelist for now. 
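Aside on the hwpoison hunks above: the exported filter globals are gone from internal.h and the filter now lives in hwpoison-inject.c, handed to the core through hwpoison_filter_register()/hwpoison_filter_unregister(). A minimal userspace sketch of that registration shape follows; only the typedef and the register/unregister names come from the patch, while core_should_filter() and demo_filter() are invented harness code (non-zero simply means "filter this page out").

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct page;                                    /* opaque for the demo */
typedef int hwpoison_filter_func_t(struct page *p);

static hwpoison_filter_func_t *active_filter;   /* core-side slot (assumed) */

static void hwpoison_filter_register(hwpoison_filter_func_t *filter)
{
	active_filter = filter;
}

static void hwpoison_filter_unregister(void)
{
	active_filter = NULL;
}

/* Core-side query: with no filter registered, nothing is filtered out. */
static int core_should_filter(struct page *p)
{
	return active_filter ? active_filter(p) : 0;
}

/* Module-side filter for the demo: reject every page. */
static int demo_filter(struct page *p)
{
	(void)p;
	return -EINVAL;
}

int main(void)
{
	printf("before register:  %d\n", core_should_filter(NULL)); /* 0 */
	hwpoison_filter_register(demo_filter);
	printf("after register:   %d\n", core_should_filter(NULL)); /* -EINVAL */
	hwpoison_filter_unregister();
	printf("after unregister: %d\n", core_should_filter(NULL)); /* 0 */
	return 0;
}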
The object's metadata is kept until the @@ -293,7 +305,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return false; if (ptr != page_address(virt_to_head_page(ptr))) { @@ -522,7 +534,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) return true; } - if (is_kfence_address(ptr) || !kasan_arch_is_ready()) + if (is_kfence_address(ptr)) return true; slab = folio_slab(folio); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d54e89f8c3e7..b413c46b3e04 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -37,6 +37,17 @@ #include "../slab.h" /* + * Initialize Generic KASAN and enable runtime checks. + * This should be called from arch kasan_init() once shadow memory is ready. + */ +void __init kasan_init_generic(void) +{ + kasan_enable(); + + pr_info("KernelAddressSanitizer initialized (generic)\n"); +} + +/* * All functions below always inlined so compiler could * perform better optimizations in each of __asan_loadX/__assn_storeX * depending on memory access size X. @@ -165,7 +176,7 @@ static __always_inline bool check_region_inline(const void *addr, size_t size, bool write, unsigned long ret_ip) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return true; if (unlikely(size == 0)) @@ -193,7 +204,7 @@ bool kasan_byte_accessible(const void *addr) { s8 shadow_byte; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return true; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); @@ -495,7 +506,7 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; /* Check if free meta is valid. */ @@ -562,7 +573,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_save_track(&alloc_meta->alloc_track, flags); } -void kasan_save_free_info(struct kmem_cache *cache, void *object) +void __kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9a6927394b54..1c373cc4b3fa 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -46,13 +46,6 @@ static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; /* - * Whether KASAN is enabled at all. - * The value remains false until KASAN is initialized by kasan_init_hw_tags(). - */ -DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); -EXPORT_SYMBOL(kasan_flag_enabled); - -/* * Whether the selected mode is synchronous, asynchronous, or asymmetric. * Defaults to KASAN_MODE_SYNC. */ @@ -67,6 +60,9 @@ DEFINE_STATIC_KEY_FALSE(kasan_flag_vmalloc); #endif EXPORT_SYMBOL_GPL(kasan_flag_vmalloc); +/* Whether to check write accesses only. 
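On the kasan_arch_is_ready() to kasan_enabled() conversion above: every mode now flips one shared kasan_flag_enabled key from its init path, and the runtime hooks bail out until that happens. In the kernel this is a static key (a patched branch); the sketch below approximates it with a plain bool, and poison_demo() is a made-up stand-in for the real hooks.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Plain-bool stand-in for DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled). */
static bool kasan_flag_enabled;

static inline bool kasan_enabled(void) { return kasan_flag_enabled; }
static inline void kasan_enable(void)  { kasan_flag_enabled = true; }

/* Every runtime hook returns early until some mode's init flipped the key. */
static void poison_demo(const void *addr, size_t size)
{
	if (!kasan_enabled())
		return;
	printf("would poison %zu bytes at %p\n", size, (void *)addr);
}

int main(void)
{
	char buf[32];

	poison_demo(buf, sizeof(buf));   /* no-op: not enabled yet */
	kasan_enable();                  /* what kasan_init_generic() and friends do */
	poison_demo(buf, sizeof(buf));   /* now takes the real path */
	return 0;
}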
*/ +static bool kasan_flag_write_only = false; + #define PAGE_ALLOC_SAMPLE_DEFAULT 1 #define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3 @@ -141,6 +137,23 @@ static int __init early_kasan_flag_vmalloc(char *arg) } early_param("kasan.vmalloc", early_kasan_flag_vmalloc); +/* kasan.write_only=off/on */ +static int __init early_kasan_flag_write_only(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_flag_write_only = false; + else if (!strcmp(arg, "on")) + kasan_flag_write_only = true; + else + return -EINVAL; + + return 0; +} +early_param("kasan.write_only", early_kasan_flag_write_only); + static inline const char *kasan_mode_info(void) { if (kasan_mode == KASAN_MODE_ASYNC) @@ -260,12 +273,13 @@ void __init kasan_init_hw_tags(void) kasan_init_tags(); /* KASAN is now initialized, enable it. */ - static_branch_enable(&kasan_flag_enabled); + kasan_enable(); - pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s, write_only=%s)\n", kasan_mode_info(), str_on_off(kasan_vmalloc_enabled()), - str_on_off(kasan_stack_collection_enabled())); + str_on_off(kasan_stack_collection_enabled()), + str_on_off(kasan_flag_write_only)); } #ifdef CONFIG_KASAN_VMALLOC @@ -392,6 +406,20 @@ void kasan_enable_hw_tags(void) hw_enable_tag_checks_asymm(); else hw_enable_tag_checks_sync(); + + /* + * CPUs can only be in one of two states: + * - All CPUs support the write_only feature + * - No CPUs support the write_only feature + * + * If the first CPU attempts hw_enable_tag_checks_write_only() and + * finds the feature unsupported, kasan_flag_write_only is set to OFF + * to avoid further unnecessary calls on other CPUs. + */ + if (kasan_flag_write_only && hw_enable_tag_checks_write_only()) { + kasan_flag_write_only = false; + pr_err_once("write-only mode is not supported and thus not enabled\n"); + } } #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) @@ -404,4 +432,10 @@ VISIBLE_IF_KUNIT void kasan_force_async_fault(void) } EXPORT_SYMBOL_IF_KUNIT(kasan_force_async_fault); +VISIBLE_IF_KUNIT bool kasan_write_only_enabled(void) +{ + return kasan_flag_write_only; +} +EXPORT_SYMBOL_IF_KUNIT(kasan_write_only_enabled); + #endif diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ced6b29fcf76..f084e7a5df1e 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -13,9 +13,9 @@ #include <linux/mm.h> #include <linux/pfn.h> #include <linux/slab.h> +#include <linux/pgalloc.h> #include <asm/page.h> -#include <asm/pgalloc.h> #include "kasan.h" @@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, pud_t *pud; pmd_t *pmd; - p4d_populate(&init_mm, p4d, + p4d_populate_kernel(addr, p4d, lm_alias(kasan_early_shadow_pud)); pud = pud_offset(p4d, addr); pud_populate(&init_mm, pud, @@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, } else { p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); pud_init(p); - p4d_populate(&init_mm, p4d, p); + p4d_populate_kernel(addr, p4d, p); } } zero_pud_populate(p4d, addr, next); @@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, * puds,pmds, so pgd_populate(), pud_populate() * is noops. 
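The kasan.write_only handling above has two parts: an early_param that accepts exactly "on" or "off", and a first-CPU probe that clears the flag again if the hardware cannot do write-only tag checks. The parsing half is easy to show standalone; parse_onoff() below is a hypothetical userspace rendering of the same accept/reject logic.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Mirrors the early_param shape: only "on" and "off" are accepted. */
static int parse_onoff(const char *arg, bool *flag)
{
	if (!arg)
		return -EINVAL;
	if (!strcmp(arg, "off"))
		*flag = false;
	else if (!strcmp(arg, "on"))
		*flag = true;
	else
		return -EINVAL;
	return 0;
}

int main(void)
{
	bool write_only = false;
	const char *args[] = { "on", "off", "maybe" };

	for (int i = 0; i < 3; i++)
		printf("%-5s -> ret=%d flag=%d\n", args[i],
		       parse_onoff(args[i], &write_only), write_only);
	return 0;
}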
*/ - pgd_populate(&init_mm, pgd, + pgd_populate_kernel(addr, pgd, lm_alias(kasan_early_shadow_p4d)); p4d = p4d_offset(pgd, addr); - p4d_populate(&init_mm, p4d, + p4d_populate_kernel(addr, p4d, lm_alias(kasan_early_shadow_pud)); pud = pud_offset(p4d, addr); pud_populate(&init_mm, pud, @@ -266,14 +266,12 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, } if (pgd_none(*pgd)) { - p4d_t *p; if (slab_is_available()) { - p = p4d_alloc(&init_mm, pgd, addr); - if (!p) + if (!p4d_alloc(&init_mm, pgd, addr)) return -ENOMEM; } else { - pgd_populate(&init_mm, pgd, + pgd_populate_kernel(addr, pgd, early_alloc(PAGE_SIZE, NUMA_NO_NODE)); } } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 129178be5e64..07fa7375a848 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -398,7 +398,13 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack); void kasan_save_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void kasan_save_free_info(struct kmem_cache *cache, void *object); + +void __kasan_save_free_info(struct kmem_cache *cache, void *object); +static inline void kasan_save_free_info(struct kmem_cache *cache, void *object) +{ + if (kasan_enabled()) + __kasan_save_free_info(cache, object); +} #ifdef CONFIG_KASAN_GENERIC bool kasan_quarantine_put(struct kmem_cache *cache, void *object); @@ -431,6 +437,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_suppress_tag_checks_start() arch_suppress_tag_checks_start() #define hw_suppress_tag_checks_stop() arch_suppress_tag_checks_stop() #define hw_force_async_tag_fault() arch_force_async_tag_fault() +#define hw_enable_tag_checks_write_only() arch_enable_tag_checks_write_only() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) #define hw_set_mem_tag_range(addr, size, tag, init) \ @@ -451,11 +458,17 @@ void __init kasan_init_tags(void); #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_force_async_fault(void); +bool kasan_write_only_enabled(void); #else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ static inline void kasan_force_async_fault(void) { } +static inline bool kasan_write_only_enabled(void) +{ + return false; +} + #endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ #ifdef CONFIG_KASAN_SW_TAGS diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index e0968acc03aa..2cafca31b092 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -94,11 +94,14 @@ static void kasan_test_exit(struct kunit *test) } /** - * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a - * KASAN report; causes a KUnit test failure otherwise. + * KUNIT_EXPECT_KASAN_RESULT - checks whether the executed expression + * produces a KASAN report; causes a KUnit test failure when the result + * is different from @fail. * * @test: Currently executing KUnit test. - * @expression: Expression that must produce a KASAN report. + * @expr: Expression to be tested. + * @expr_str: Expression to be tested encoded as a string. + * @fail: Whether expression should produce a KASAN report. * * For hardware tag-based KASAN, when a synchronous tag fault happens, tag * checking is auto-disabled. 
When this happens, this test handler reenables @@ -110,25 +113,29 @@ static void kasan_test_exit(struct kunit *test) * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the * expression to prevent that. * - * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept + * In between KUNIT_EXPECT_KASAN_RESULT checks, test_status.report_found is kept * as false. This allows detecting KASAN reports that happen outside of the * checks by asserting !test_status.report_found at the start of - * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit. + * KUNIT_EXPECT_KASAN_RESULT and in kasan_test_exit. */ -#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ +#define KUNIT_EXPECT_KASAN_RESULT(test, expr, expr_str, fail) \ +do { \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ kasan_sync_fault_possible()) \ migrate_disable(); \ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \ barrier(); \ - expression; \ + expr; \ barrier(); \ if (kasan_async_fault_possible()) \ kasan_force_async_fault(); \ - if (!READ_ONCE(test_status.report_found)) { \ - KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \ - "expected in \"" #expression \ - "\", but none occurred"); \ + if (READ_ONCE(test_status.report_found) != fail) { \ + KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure" \ + "%sexpected in \"" expr_str \ + "\", but %soccurred", \ + (fail ? " " : " not "), \ + (test_status.report_found ? \ + "" : "none ")); \ } \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ kasan_sync_fault_possible()) { \ @@ -141,6 +148,34 @@ static void kasan_test_exit(struct kunit *test) WRITE_ONCE(test_status.async_fault, false); \ } while (0) +/* + * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a + * KASAN report; causes a KUnit test failure otherwise. + * + * @test: Currently executing KUnit test. + * @expr: Expression that must produce a KASAN report. + */ +#define KUNIT_EXPECT_KASAN_FAIL(test, expr) \ + KUNIT_EXPECT_KASAN_RESULT(test, expr, #expr, true) + +/* + * KUNIT_EXPECT_KASAN_FAIL_READ - check that the executed expression + * produces a KASAN report when the write-only mode is not enabled; + * causes a KUnit test failure otherwise. + * + * Note: At the moment, this macro does not check whether the produced + * KASAN report is a report about a bad read access. It is only intended + * for checking the write-only KASAN mode functionality without failing + * KASAN tests. + * + * @test: Currently executing KUnit test. + * @expr: Expression that must only produce a KASAN report + * when the write-only mode is not enabled. + */ +#define KUNIT_EXPECT_KASAN_FAIL_READ(test, expr) \ + KUNIT_EXPECT_KASAN_RESULT(test, expr, #expr, \ + !kasan_write_only_enabled()) \ + #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ if (!IS_ENABLED(config)) \ kunit_skip((test), "Test requires " #config "=y"); \ @@ -183,8 +218,8 @@ static void kmalloc_oob_right(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y'); /* Out-of-bounds access past the aligned kmalloc object. 
*/ - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = - ptr[size + KASAN_GRANULE_SIZE + 5]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = + ptr[size + KASAN_GRANULE_SIZE + 5]); kfree(ptr); } @@ -198,7 +233,7 @@ static void kmalloc_oob_left(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); OPTIMIZER_HIDE_VAR(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, *ptr = *(ptr - 1)); kfree(ptr); } @@ -211,7 +246,7 @@ static void kmalloc_node_oob_right(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); OPTIMIZER_HIDE_VAR(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = ptr[size]); kfree(ptr); } @@ -291,7 +326,7 @@ static void kmalloc_large_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]); } static void kmalloc_large_invalid_free(struct kunit *test) @@ -323,7 +358,7 @@ static void page_alloc_oob_right(struct kunit *test) ptr = page_address(pages); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = ptr[size]); free_pages((unsigned long)ptr, order); } @@ -338,7 +373,7 @@ static void page_alloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); free_pages((unsigned long)ptr, order); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]); } static void krealloc_more_oob_helper(struct kunit *test, @@ -458,7 +493,7 @@ static void krealloc_uaf(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL)); KUNIT_ASSERT_NULL(test, ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1); + KUNIT_EXPECT_KASAN_FAIL_READ(test, *(volatile char *)ptr1); } static void kmalloc_oob_16(struct kunit *test) @@ -501,7 +536,7 @@ static void kmalloc_uaf_16(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); kfree(ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2); + KUNIT_EXPECT_KASAN_FAIL_READ(test, *ptr1 = *ptr2); kfree(ptr1); } @@ -640,8 +675,8 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) memset((char *)ptr, 0, 64); OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(invalid_size); - KUNIT_EXPECT_KASAN_FAIL(test, - memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); } @@ -654,7 +689,7 @@ static void kmalloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[8]); } static void kmalloc_uaf_memset(struct kunit *test) @@ -701,7 +736,7 @@ again: goto again; } - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr1)[40]); KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2); kfree(ptr2); @@ -727,19 +762,19 @@ static void kmalloc_uaf3(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); kfree(ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr1)[8]); } static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) { int *i_unsafe = unsafe; - KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, 
READ_ONCE(*i_unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, smp_load_acquire(i_unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, atomic_read(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe)); @@ -752,18 +787,31 @@ static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42)); + /* + * The result of the test below may vary due to garbage values of + * unsafe in write-only mode. + * Therefore, skip this test when KASAN is configured in write-only mode. + */ + if (!kasan_write_only_enabled()) + KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe)); + /* + * The result of the test below may vary due to garbage values of + * unsafe in write-only mode. + * Therefore, skip this test when KASAN is configured in write-only mode. + */ + if (!kasan_write_only_enabled()) { + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe)); + } - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, atomic_long_read(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe)); @@ -776,16 +824,29 @@ static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42)); + /* + * The result of the test below may vary due to garbage values of + * unsafe in write-only mode. + * Therefore, skip this test when KASAN is configured in write-only mode. 
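Why the write-only skips above are needed: with only writes checked, the load of the unsafe value is silent, and for the cmpxchg-style helpers whether the checked store happens at all depends on whatever garbage that load returned, so the test outcome is not deterministic. The plain-C sketch below shows that data dependence; try_cmpxchg_demo() is invented for illustration, not the kernel atomic.

#include <stdbool.h>
#include <stdio.h>

/* Simplified try_cmpxchg(): write *ptr only if it still equals *expected. */
static bool try_cmpxchg_demo(int *ptr, int *expected, int desired)
{
	if (*ptr == *expected) {  /* this read is unchecked in write-only mode */
		*ptr = desired;
		return true;
	}
	*expected = *ptr;         /* the only write that touches 'expected' */
	return false;
}

int main(void)
{
	int safe = 21;
	int garbage = 12345;      /* stands in for freed/unsafe memory contents */

	/*
	 * Whether the write to 'garbage' (the unsafe buffer in the real test)
	 * happens at all depends on the garbage value itself.
	 */
	bool swapped = try_cmpxchg_demo(&safe, &garbage, 42);

	printf("swapped=%d safe=%d garbage=%d\n", swapped, safe, garbage);
	return 0;
}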
+ */ + if (!kasan_write_only_enabled()) + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe)); + /* + * The result of the test below may vary due to garbage values of + * unsafe in write-only mode. + * Therefore, skip this test when KASAN is configured in write-only mode. + */ + if (!kasan_write_only_enabled()) { + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe)); + } } static void kasan_atomics(struct kunit *test) @@ -842,8 +903,8 @@ static void ksize_unpoisons_memory(struct kunit *test) /* These must trigger a KASAN report. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size + 5]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[real_size - 1]); kfree(ptr); } @@ -863,8 +924,8 @@ static void ksize_uaf(struct kunit *test) OPTIMIZER_HIDE_VAR(ptr); KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr)); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size]); } /* @@ -899,9 +960,9 @@ static void rcu_uaf(struct kunit *test) global_rcu_ptr = rcu_dereference_protected( (struct kasan_rcu_info __rcu *)ptr, NULL); - KUNIT_EXPECT_KASAN_FAIL(test, - call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim); - rcu_barrier()); + KUNIT_EXPECT_KASAN_FAIL_READ(test, + call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim); + rcu_barrier()); } static void workqueue_uaf_work(struct work_struct *work) @@ -924,8 +985,8 @@ static void workqueue_uaf(struct kunit *test) queue_work(workqueue, work); destroy_workqueue(workqueue); - KUNIT_EXPECT_KASAN_FAIL(test, - ((volatile struct work_struct *)work)->data); + KUNIT_EXPECT_KASAN_FAIL_READ(test, + ((volatile struct work_struct *)work)->data); } static void kfree_via_page(struct kunit *test) @@ -972,7 +1033,7 @@ static void kmem_cache_oob(struct kunit *test) return; } - KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, *p = p[size + OOB_TAG_OFF]); kmem_cache_free(cache, p); kmem_cache_destroy(cache); @@ -1068,11 +1129,50 @@ static void kmem_cache_rcu_uaf(struct kunit *test) */ rcu_barrier(); - KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, READ_ONCE(*p)); kmem_cache_destroy(cache); } +/* + * Check that SLAB_TYPESAFE_BY_RCU 
objects are immediately reused when + * CONFIG_SLUB_RCU_DEBUG is off, and stay at the same address. + * Without this, KASAN builds would be unable to trigger bugs caused by + * SLAB_TYPESAFE_BY_RCU users handling reycled objects improperly. + */ +static void kmem_cache_rcu_reuse(struct kunit *test) +{ + char *p, *p2; + struct kmem_cache *cache; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_SLUB_RCU_DEBUG); + + cache = kmem_cache_create("test_cache", 16, 0, SLAB_TYPESAFE_BY_RCU, + NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + migrate_disable(); + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + goto out; + } + + kmem_cache_free(cache, p); + p2 = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p2) { + kunit_err(test, "Allocation failed: %s\n", __func__); + goto out; + } + KUNIT_EXPECT_PTR_EQ(test, p, p2); + + kmem_cache_free(cache, p2); + +out: + migrate_enable(); + kmem_cache_destroy(cache); +} + static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; @@ -1207,7 +1307,7 @@ static void mempool_oob_right_helper(struct kunit *test, mempool_t *pool, size_t KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)&elem[size])[0]); else - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)&elem[round_up(size, KASAN_GRANULE_SIZE)])[0]); mempool_free(elem, pool); @@ -1273,7 +1373,7 @@ static void mempool_uaf_helper(struct kunit *test, mempool_t *pool, bool page) mempool_free(elem, pool); ptr = page ? page_address((struct page *)elem) : elem; - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]); } static void mempool_kmalloc_uaf(struct kunit *test) @@ -1532,7 +1632,7 @@ static void kasan_memchr(struct kunit *test) OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = memchr(ptr, '1', size + 1)); kfree(ptr); @@ -1559,7 +1659,7 @@ static void kasan_memcmp(struct kunit *test) OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = memcmp(ptr, arr, size+1)); kfree(ptr); } @@ -1578,9 +1678,11 @@ static void kasan_strings(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO); strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE); + OPTIMIZER_HIDE_VAR(src); /* * Make sure that strscpy() does not trigger KASAN if it overreads into @@ -1594,7 +1696,7 @@ static void kasan_strings(struct kunit *test) strscpy(ptr, src + 1, KASAN_GRANULE_SIZE)); /* strscpy should fail if the first byte is unreadable. */ - KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE, + KUNIT_EXPECT_KASAN_FAIL_READ(test, strscpy(ptr, src + KASAN_GRANULE_SIZE, KASAN_GRANULE_SIZE)); kfree(src); @@ -1607,17 +1709,17 @@ static void kasan_strings(struct kunit *test) * will likely point to zeroed byte. 
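The kmem_cache_rcu_reuse test above asserts the property SLAB_TYPESAFE_BY_RCU callers depend on: with CONFIG_SLUB_RCU_DEBUG off, a freed object is handed straight back to the next allocation at the same address. The toy fixed-slot allocator below demonstrates the same free-then-realloc identity; pool_alloc()/pool_free() are made up and have nothing to do with the kernel allocator.

#include <stddef.h>
#include <stdio.h>

#define SLOTS    4
#define OBJ_SIZE 16

static char pool[SLOTS][OBJ_SIZE];
static int next_free;                    /* index of the next slot to hand out */
static int free_stack[SLOTS] = { 0, 1, 2, 3 };

static void *pool_alloc(void)
{
	if (next_free >= SLOTS)
		return NULL;
	return pool[free_stack[next_free++]];
}

static void pool_free(void *p)
{
	int idx = (int)(((char *)p - &pool[0][0]) / OBJ_SIZE);

	free_stack[--next_free] = idx;   /* LIFO: the next alloc reuses this slot */
}

int main(void)
{
	void *p = pool_alloc();

	pool_free(p);
	printf("same address reused: %s\n", pool_alloc() == p ? "yes" : "no");
	return 0;
}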
*/ ptr += 16; - KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1')); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = strchr(ptr, '1')); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1')); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = strrchr(ptr, '1')); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2")); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strcmp(ptr, "2")); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strncmp(ptr, "2", 1)); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strlen(ptr)); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strnlen(ptr, 1)); } static void kasan_bitops_modify(struct kunit *test, int nr, void *addr) @@ -1636,12 +1738,18 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) { KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr)); - KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr)); + /* + * When KASAN is running in write-only mode, + * a fault won't occur when the bit is set. + * Therefore, skip the test_and_set_bit_lock test in write-only mode. + */ + if (!kasan_write_only_enabled()) + KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = test_bit(nr, addr)); if (nr < 7) KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = xor_unlock_is_negative_byte(1 << nr, addr)); @@ -1765,7 +1873,7 @@ static void vmalloc_oob(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); /* An aligned access into the first out-of-bounds granule. */ - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size + 5]); /* Check that in-bounds accesses to the physical page are valid. 
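The OPTIMIZER_HIDE_VAR() calls added to kasan_strings() above keep the compiler from proving anything about the buffers and folding the string calls away before KASAN can intercept them. The usual idiom is an empty asm that launders the variable; HIDE() below is a userspace approximation (GCC/Clang inline asm), not necessarily the kernel's exact macro.

#include <stdio.h>
#include <string.h>

/*
 * Launder a value through an empty asm statement so the compiler treats it
 * as unknown and cannot constant-fold accesses through it.
 */
#define HIDE(var) __asm__("" : "=r"(var) : "0"(var))

int main(void)
{
	char buf[32] = "f0cacc1a0000000";
	char *p = buf;

	HIDE(p);                          /* compiler now treats p as opaque */
	printf("strlen = %zu\n", strlen(p));
	return 0;
}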
*/ page = vmalloc_to_page(v_ptr); @@ -2042,15 +2150,15 @@ static void copy_user_test_oob(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, unused = copy_from_user(kmem, usermem, size + 1)); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, unused = copy_to_user(usermem, kmem, size + 1)); KUNIT_EXPECT_KASAN_FAIL(test, unused = __copy_from_user(kmem, usermem, size + 1)); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, unused = __copy_to_user(usermem, kmem, size + 1)); KUNIT_EXPECT_KASAN_FAIL(test, unused = __copy_from_user_inatomic(kmem, usermem, size + 1)); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, unused = __copy_to_user_inatomic(usermem, kmem, size + 1)); /* @@ -2104,6 +2212,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), KUNIT_CASE(kmem_cache_rcu_uaf), + KUNIT_CASE(kmem_cache_rcu_reuse), KUNIT_CASE(kmem_cache_double_destroy), KUNIT_CASE(kmem_cache_accounted), KUNIT_CASE(kmem_cache_bulk), diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index d2c70cd2afb1..5d2a876035d6 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -125,7 +125,7 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; /* @@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(kasan_poison); #ifdef CONFIG_KASAN_GENERIC void kasan_poison_last_granule(const void *addr, size_t size) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; if (size & KASAN_GRANULE_MASK) { @@ -305,8 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; - if (likely(!pte_none(ptep_get(ptep)))) - return 0; + arch_leave_lazy_mmu_mode(); index = PFN_DOWN(addr - data->start); page = data->pages[index]; @@ -320,6 +319,8 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, } spin_unlock(&init_mm.page_table_lock); + arch_enter_lazy_mmu_mode(); + return 0; } @@ -335,13 +336,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages) } } -static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask) { unsigned long nr_populated, nr_total = nr_pages; struct page **page_array = pages; while (nr_pages) { - nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages); if (!nr_populated) { ___free_pages_bulk(page_array, nr_total - nr_pages); return -ENOMEM; @@ -353,25 +354,42 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages) return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; + unsigned int flags; int ret = 0; - data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO); if (!data.pages) return -ENOMEM; while (nr_total) { nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0])); - ret = ___alloc_pages_bulk(data.pages, nr_pages); + ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask); if (ret) break; data.start = start; + + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | 
__GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, kasan_populate_vmalloc_pte, &data); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + ___free_pages_bulk(data.pages, nr_pages); if (ret) break; @@ -385,12 +403,12 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size) +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return 0; if (!is_vmalloc_or_module_addr((void *)addr)) @@ -414,7 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); if (ret) return ret; @@ -461,18 +479,23 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { - unsigned long page; + pte_t pte; + int none; - page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT); + arch_leave_lazy_mmu_mode(); spin_lock(&init_mm.page_table_lock); - - if (likely(!pte_none(ptep_get(ptep)))) { + pte = ptep_get(ptep); + none = pte_none(pte); + if (likely(!none)) pte_clear(&init_mm, addr, ptep); - free_page(page); - } spin_unlock(&init_mm.page_table_lock); + if (likely(!none)) + __free_page(pfn_to_page(pte_pfn(pte))); + + arch_enter_lazy_mmu_mode(); + return 0; } @@ -560,7 +583,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -611,7 +634,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. 
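The memalloc_nofs_save()/memalloc_noio_save() dance above exists because nested page-table allocations ignore the caller's gfp mask, so the constraint has to ride along on the task instead. A sketch of that scope shape follows, with invented ctx_* names and a fake allocator standing in for the page-table path.

#include <stdio.h>

/* Per-task allocation-context flags, in the spirit of PF_MEMALLOC_NOFS/NOIO. */
#define CTX_NOFS 0x1
#define CTX_NOIO 0x2

static unsigned int task_ctx;            /* stands in for per-task flag bits */

static unsigned int ctx_nofs_save(void)
{
	unsigned int old = task_ctx;

	task_ctx |= CTX_NOFS;
	return old;
}

static void ctx_nofs_restore(unsigned int old)
{
	task_ctx = old;
}

/* A nested allocation that never sees the caller's gfp mask... */
static void page_table_alloc(void)
{
	/* ...but still honours the scope flags set on the task. */
	printf("allocating page table, NOFS=%d NOIO=%d\n",
	       !!(task_ctx & CTX_NOFS), !!(task_ctx & CTX_NOIO));
}

int main(void)
{
	unsigned int flags;

	page_table_alloc();              /* unconstrained */

	flags = ctx_nofs_save();         /* like memalloc_nofs_save() */
	page_table_alloc();              /* now implicitly filesystem-safe */
	ctx_nofs_restore(flags);         /* like memalloc_nofs_restore() */

	page_table_alloc();
	return 0;
}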
*/ - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return (void *)start; if (!is_vmalloc_or_module_addr(start)) @@ -636,7 +659,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; if (!is_vmalloc_or_module_addr(start)) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index b9382b5b6a37..c75741a74602 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -44,6 +44,7 @@ void __init kasan_init_sw_tags(void) per_cpu(prng_state, cpu) = (u32)get_cycles(); kasan_init_tags(); + kasan_enable(); pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", str_on_off(kasan_stack_collection_enabled())); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index d65d48b85f90..b9f31293622b 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) save_stack_info(cache, object, flags, false); } -void kasan_save_free_info(struct kmem_cache *cache, void *object) +void __kasan_save_free_info(struct kmem_cache *cache, void *object) { save_stack_info(cache, object, 0, true); } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 0ed3be100963..727c20c94ac5 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -594,15 +594,14 @@ static void rcu_guarded_free(struct rcu_head *h) */ static unsigned long kfence_init_pool(void) { - unsigned long addr; - struct page *pages; + unsigned long addr, start_pfn; int i; if (!arch_kfence_init_pool()) return (unsigned long)__kfence_pool; addr = (unsigned long)__kfence_pool; - pages = virt_to_page(__kfence_pool); + start_pfn = PHYS_PFN(virt_to_phys(__kfence_pool)); /* * Set up object pages: they must have PGTY_slab set to avoid freeing @@ -613,11 +612,12 @@ static unsigned long kfence_init_pool(void) * enters __slab_free() slow-path. 
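The kfence_init_pool() rework above stops walking struct pages with nth_page() and instead remembers the pool's first PFN, converting index to PFN to page per iteration. The arithmetic involved is only shifts and adds; the standalone illustration below redefines PAGE_SHIFT and phys_pfn() locally, so none of it is the kernel's.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ul << PAGE_SHIFT)

/* Physical address -> page frame number, as PHYS_PFN() does. */
static unsigned long phys_pfn(unsigned long phys)
{
	return phys >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long pool_phys = 0x40000000ul;       /* pretend pool base */
	unsigned long start_pfn = phys_pfn(pool_phys);

	for (unsigned long i = 0; i < 4; i++) {
		unsigned long pfn = start_pfn + i;

		printf("page %lu: pfn=%lu phys=0x%lx\n", i, pfn, pfn << PAGE_SHIFT);
	}
	return 0;
}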
*/ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab = page_slab(nth_page(pages, i)); + struct slab *slab; if (!i || (i % 2)) continue; + slab = page_slab(pfn_to_page(start_pfn + i)); __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | @@ -665,10 +665,12 @@ static unsigned long kfence_init_pool(void) reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab = page_slab(nth_page(pages, i)); + struct slab *slab; if (!i || (i % 2)) continue; + + slab = page_slab(pfn_to_page(start_pfn + i)); #ifdef CONFIG_MEMCG slab->obj_exts = 0; #endif diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b40bdfd224c..7ab2d1a42df3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -39,7 +39,6 @@ enum scan_result { SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, - SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, @@ -105,14 +104,6 @@ struct collapse_control { }; /** - * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned - * @slot: hash lookup from mm to mm_slot - */ -struct khugepaged_mm_slot { - struct mm_slot slot; -}; - -/** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan * @mm_slot: the current mm_slot we are scanning @@ -122,7 +113,7 @@ struct khugepaged_mm_slot { */ struct khugepaged_scan { struct list_head mm_head; - struct khugepaged_mm_slot *mm_slot; + struct mm_slot *mm_slot; unsigned long address; }; @@ -385,7 +376,10 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { - mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0); + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), + 0, NULL); if (!mm_slot_cache) return -ENOMEM; @@ -410,7 +404,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - test_bit(MMF_DISABLE_THP, &mm->flags); + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) @@ -439,21 +433,18 @@ static bool hugepage_pmd_enabled(void) void __khugepaged_enter(struct mm_struct *mm) { - struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; int wakeup; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; - mm_slot = mm_slot_alloc(mm_slot_cache); - if (!mm_slot) + slot = mm_slot_alloc(mm_slot_cache); + if (!slot) return; - slot = &mm_slot->slot; - spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* @@ -472,24 +463,21 @@ void __khugepaged_enter(struct mm_struct *mm) void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, - PMD_ORDER)) + if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { - struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); - mm_slot = mm_slot_entry(slot, struct 
khugepaged_mm_slot, slot); - if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + if (slot && khugepaged_scan.mm_slot != slot) { hash_del(&slot->hash); list_del(&slot->mm_node); free = 1; @@ -497,10 +485,10 @@ void __khugepaged_exit(struct mm_struct *mm) spin_unlock(&khugepaged_mm_lock); if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - mm_slot_free(mm_slot_cache, mm_slot); + mm_flags_clear(MMF_VM_HUGEPAGE, mm); + mm_slot_free(mm_slot_cache, slot); mmdrop(mm); - } else if (mm_slot) { + } else if (slot) { /* * This is required to serialize against * hpage_collapse_test_exit() (which is guaranteed to run @@ -549,19 +537,19 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long address, + unsigned long start_addr, pte_t *pte, struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; + unsigned long addr = start_addr; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; - bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { + _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { @@ -584,7 +572,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_UFFD_WP; goto out; } - page = vm_normal_page(vma, address, pteval); + page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; @@ -669,28 +657,23 @@ next: */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; - - if (pte_write(pteval)) - writable = true; } - if (unlikely(!writable)) { - result = SCAN_PAGE_RO; - } else if (unlikely(cc->is_khugepaged && !referenced)) { + if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(folio, none_or_zero, - referenced, writable, result); + referenced, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(folio, none_or_zero, - referenced, writable, result); + referenced, result); return result; } @@ -921,7 +904,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; - unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; + enum tva_type type = cc->is_khugepaged ? 
TVA_KHUGEPAGED : + TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -932,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1003,21 +987,21 @@ static int check_pmd_still_valid(struct mm_struct *mm, */ static int __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, + unsigned long start_addr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; - unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); int result; pte_t *pte = NULL; spinlock_t *ptl; - for (address = haddr; address < end; address += PAGE_SIZE) { + for (addr = start_addr; addr < end; addr += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, - .address = address, - .pgoff = linear_page_index(vma, address), + .address = addr, + .pgoff = linear_page_index(vma, addr), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; @@ -1027,7 +1011,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, * Here the ptl is only used to check pte_same() in * do_swap_page(), so readonly version is enough. */ - pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl); + pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_PMD_NULL; @@ -1270,7 +1254,7 @@ out_nolock: static int hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, bool *mmap_locked, + unsigned long start_addr, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; @@ -1279,27 +1263,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, int none_or_zero = 0, shared = 0; struct page *page = NULL; struct folio *folio = NULL; - unsigned long _address; + unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; - bool writable = false; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); - result = find_pmd_or_thp_or_none(mm, address, &pmd); + result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); if (result != SCAN_SUCCEED) goto out; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { result = SCAN_PMD_NULL; goto out; } - for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, _address += PAGE_SIZE) { + for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; @@ -1346,10 +1329,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, result = SCAN_PTE_UFFD_WP; goto out_unmap; } - if (pte_write(pteval)) - writable = true; - page = vm_normal_page(vma, _address, pteval); + page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; @@ -1417,13 +1398,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || 
mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } - if (!writable) { - result = SCAN_PAGE_RO; - } else if (cc->is_khugepaged && + if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; @@ -1433,20 +1412,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { - result = collapse_huge_page(mm, address, referenced, + result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); return result; } -static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) +static void collect_mm_slot(struct mm_slot *slot) { - struct mm_slot *slot = &mm_slot->slot; struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); @@ -1459,11 +1437,11 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Not strictly needed because the mm exited already. * - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ - mm_slot_free(mm_slot_cache, mm_slot); + mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } } @@ -1472,15 +1450,32 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio, struct page *page) { + struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, - .pmd = pmdp, }; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; mmap_assert_locked(vma->vm_mm); + if (!pmdp) { + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + return SCAN_FAIL; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + return SCAN_FAIL; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + return SCAN_FAIL; + } + + vmf.pmd = pmdp; if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; @@ -1533,9 +1528,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's - * analogously elide sysfs THP settings here. + * analogously elide sysfs THP settings here and force collapse. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -1556,6 +1551,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, switch (result) { case SCAN_SUCCEED: break; + case SCAN_PMD_NULL: case SCAN_PMD_NONE: /* * All pte entries have been removed and pmd cleared. 
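set_huge_pmd() above now builds the missing levels itself when no PMD exists yet: pgd_offset(), then p4d_alloc(), pud_alloc() and pmd_alloc(), failing the collapse if any allocation fails. The same allocate-intermediate-levels-on-demand walk, reduced to a two-level userspace table, is sketched below; every name in it is invented.

#include <stdio.h>
#include <stdlib.h>

#define L1_BITS 4
#define L2_BITS 4
#define L1_SIZE (1u << L1_BITS)
#define L2_SIZE (1u << L2_BITS)

struct table { long *leaves[L2_SIZE]; };
static struct table *root[L1_SIZE];

/* Walk top-down, allocating the intermediate level if it is missing. */
static long *slot_alloc(unsigned int idx)
{
	unsigned int i1 = (idx >> L2_BITS) & (L1_SIZE - 1);
	unsigned int i2 = idx & (L2_SIZE - 1);

	if (!root[i1]) {
		root[i1] = calloc(1, sizeof(*root[i1]));
		if (!root[i1])
			return NULL;          /* analogue of bailing out with SCAN_FAIL */
	}
	if (!root[i1]->leaves[i2]) {
		root[i1]->leaves[i2] = calloc(1, sizeof(long));
		if (!root[i1]->leaves[i2])
			return NULL;
	}
	return root[i1]->leaves[i2];
}

int main(void)
{
	long *slot = slot_alloc(0x37);

	if (slot) {
		*slot = 42;                   /* install the leaf entry */
		printf("installed value %ld\n", *slot);
	}
	return 0;
}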
@@ -2388,7 +2384,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; - struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; @@ -2399,14 +2394,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) { - mm_slot = khugepaged_scan.mm_slot; - slot = &mm_slot->slot; + slot = khugepaged_scan.mm_slot; } else { - slot = list_entry(khugepaged_scan.mm_head.next, + slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); - mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; - khugepaged_scan.mm_slot = mm_slot; + khugepaged_scan.mm_slot = slot; } spin_unlock(&khugepaged_mm_lock); @@ -2432,8 +2425,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, - TVA_ENFORCE_SYSFS, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; @@ -2505,7 +2497,7 @@ breakouterloop: breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); - VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. @@ -2516,18 +2508,15 @@ breakouterloop_mmap_lock: * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ - if (slot->mm_node.next != &khugepaged_scan.mm_head) { - slot = list_entry(slot->mm_node.next, - struct mm_slot, mm_node); - khugepaged_scan.mm_slot = - mm_slot_entry(slot, struct khugepaged_mm_slot, slot); + if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { + khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } - collect_mm_slot(mm_slot); + collect_mm_slot(slot); } return progress; @@ -2614,7 +2603,7 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { - struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; set_freezable(); set_user_nice(current, MAX_NICE); @@ -2625,10 +2614,10 @@ static int khugepaged(void *none) } spin_lock(&khugepaged_mm_lock); - mm_slot = khugepaged_scan.mm_slot; + slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; - if (mm_slot) - collect_mm_slot(mm_slot); + if (slot) + collect_mm_slot(slot); spin_unlock(&khugepaged_mm_lock); return 0; } @@ -2767,7 +2756,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); @@ -2832,7 +2821,6 @@ handle_result: case SCAN_PMD_NULL: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: - case SCAN_PAGE_RO: case SCAN_LACK_REFERENCED_PAGE: case SCAN_PAGE_NULL: case SCAN_PAGE_COUNT: diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 84265983f239..1ac56ceb29b6 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -437,9 +437,15 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias, else if (untagged_objp == untagged_ptr || alias) return object; else { + /* + * Printk deferring due to the kmemleak_lock held. 
+ * This is done to avoid deadlock. + */ + printk_deferred_enter(); kmemleak_warn("Found object by alias at 0x%08lx\n", ptr); dump_object_info(object); + printk_deferred_exit(); break; } } @@ -736,6 +742,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, else if (untagged_objp + parent->size <= untagged_ptr) link = &parent->rb_node.rb_right; else { + /* + * Printk deferring due to the kmemleak_lock held. + * This is done to avoid deadlock. + */ + printk_deferred_enter(); kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", ptr); /* @@ -743,6 +754,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, * be freed while the kmemleak_lock is held. */ dump_object_info(parent); + printk_deferred_exit(); return -EEXIST; } } @@ -856,13 +868,8 @@ static void delete_object_part(unsigned long ptr, size_t size, raw_spin_lock_irqsave(&kmemleak_lock, flags); object = __find_and_remove_object(ptr, 1, objflags); - if (!object) { -#ifdef DEBUG - kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", - ptr, size); -#endif + if (!object) goto unlock; - } /* * Create one or two objects that may result from the memory block @@ -882,8 +889,14 @@ static void delete_object_part(unsigned long ptr, size_t size, unlock: raw_spin_unlock_irqrestore(&kmemleak_lock, flags); - if (object) + if (object) { __delete_object(object); + } else { +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", + ptr, size); +#endif + } out: if (object_l) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 1ea711786c52..8bca7fece47f 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -195,7 +195,8 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, u32 origin, bool checked) { u64 address = (u64)addr; - u32 *shadow_start, *origin_start; + void *shadow_start; + u32 *aligned_shadow, *origin_start; size_t pad = 0; KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); @@ -214,9 +215,12 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, } __memset(shadow_start, b, size); - if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + if (IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + aligned_shadow = shadow_start; + } else { pad = address % KMSAN_ORIGIN_SIZE; address -= pad; + aligned_shadow = shadow_start - pad; size += pad; } size = ALIGN(size, KMSAN_ORIGIN_SIZE); @@ -230,7 +234,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, * corresponding shadow slot is zero. */ for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) { - if (origin || !shadow_start[i]) + if (origin || !aligned_shadow[i]) origin_start[i] = origin; } } diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index c6c5b2bbede0..902ec48b1e3e 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -556,6 +556,21 @@ DEFINE_TEST_MEMSETXX(16) DEFINE_TEST_MEMSETXX(32) DEFINE_TEST_MEMSETXX(64) +/* Test case: ensure that KMSAN does not access shadow memory out of bounds. 
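The kmsan_internal_set_shadow_origin() fix above keeps an aligned_shadow pointer that is rounded down together with the address, so the origin loop never indexes shadow before the start of the region. The rounding itself is small arithmetic; here is a worked example with the helpers reimplemented locally (align_up() stands in for the kernel's ALIGN()).

#include <stdio.h>

#define ORIGIN_SIZE 4ul

/* Round x up to a multiple of the power-of-two a. */
static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned long address = 0x1003;              /* not ORIGIN_SIZE aligned */
	unsigned long size = 5;
	unsigned long pad = address % ORIGIN_SIZE;   /* 3 */

	address -= pad;                      /* 0x1000: aligned start */
	size += pad;                         /* 8: cover the bytes stepped back over */
	size = align_up(size, ORIGIN_SIZE);  /* 8: whole origin slots */

	printf("start=0x%lx size=%lu slots=%lu\n", address, size, size / ORIGIN_SIZE);
	return 0;
}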
*/ +static void test_memset_on_guarded_buffer(struct kunit *test) +{ + void *buf = vmalloc(PAGE_SIZE); + + kunit_info(test, + "memset() on ends of guarded buffer should not crash\n"); + + for (size_t size = 0; size <= 128; size++) { + memset(buf, 0xff, size); + memset(buf + PAGE_SIZE - size, 0xff, size); + } + vfree(buf); +} + static noinline void fibonacci(int *array, int size, int start) { if (start < 2 || (start == size)) @@ -677,6 +692,7 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_memset16), KUNIT_CASE(test_memset32), KUNIT_CASE(test_memset64), + KUNIT_CASE(test_memset_on_guarded_buffer), KUNIT_CASE(test_long_origin_chain), KUNIT_CASE(test_stackdepot_roundtrip), KUNIT_CASE(test_unpoison_memory), @@ -1061,11 +1061,6 @@ struct ksm_stable_node *folio_stable_node(const struct folio *folio) return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } -static inline struct ksm_stable_node *page_stable_node(struct page *page) -{ - return folio_stable_node(page_folio(page)); -} - static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { @@ -1217,8 +1212,8 @@ mm_exiting: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); @@ -2225,6 +2220,7 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { + struct folio *folio = page_folio(page); struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; @@ -2233,7 +2229,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite int err; bool max_page_sharing_bypass = false; - stable_node = page_stable_node(page); + stable_node = folio_stable_node(folio); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != @@ -2272,7 +2268,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite /* Start by searching for the folio in the stable tree */ kfolio = stable_tree_search(page); - if (&kfolio->page == page && rmap_item->head == stable_node) { + if (kfolio == folio && rmap_item->head == stable_node) { folio_put(kfolio); return; } @@ -2353,10 +2349,11 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * the page is locked, it is better to skip it and * perhaps try again later. 
*/ - if (!trylock_page(page)) + if (!folio_trylock(folio)) return; split_huge_page(page); - unlock_page(page); + folio = page_folio(page); + folio_unlock(folio); } } } @@ -2620,8 +2617,8 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2742,7 +2739,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && + if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && __ksm_should_add_vma(file, vm_flags)) vm_flags |= VM_MERGEABLE; @@ -2784,16 +2781,16 @@ int ksm_enable_merge_any(struct mm_struct *mm) { int err; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } - set_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_set(MMF_VM_MERGE_ANY, mm); ksm_add_vmas(mm); return 0; @@ -2815,7 +2812,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) { int err; - if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; err = ksm_del_vmas(mm); @@ -2824,7 +2821,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) return err; } - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); return 0; } @@ -2832,9 +2829,9 @@ int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) return 0; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } @@ -2852,7 +2849,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (!vma_ksm_compatible(vma)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; @@ -2912,7 +2909,7 @@ int __ksm_enter(struct mm_struct *mm) list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - set_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_set(MMF_VM_MERGEABLE, mm); mmgrab(mm); if (needs_wakeup) @@ -2939,23 +2936,25 @@ void __ksm_exit(struct mm_struct *mm) spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); - mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); - if (mm_slot && ksm_scan.mm_slot != mm_slot) { - if (!mm_slot->rmap_list) { - hash_del(&slot->hash); - list_del(&slot->mm_node); - easy_to_free = 1; - } else { - list_move(&slot->mm_node, - &ksm_scan.mm_slot->slot.mm_node); + if (slot) { + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); + if (ksm_scan.mm_slot != mm_slot) { + if (!mm_slot->rmap_list) { + hash_del(&slot->hash); + list_del(&slot->mm_node); + easy_to_free = 1; + } else { + list_move(&slot->mm_node, + &ksm_scan.mm_slot->slot.mm_node); + } } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); + mm_flags_clear(MMF_VM_MERGEABLE, mm); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); diff --git a/mm/memblock.c 
b/mm/memblock.c index 154f1d73b61f..117d963e677c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -780,9 +780,9 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt } if ((nr_pages << PAGE_SHIFT) > threshold_bytes) { - mem_size_mb = memblock_phys_mem_size() >> 20; + mem_size_mb = memblock_phys_mem_size() / SZ_1M; pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n", - (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb); + (nr_pages << PAGE_SHIFT) / SZ_1M, mem_size_mb); return false; } @@ -1091,13 +1091,20 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) /** * memblock_reserved_mark_noinit - Mark a reserved memory region with flag - * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized - * for this region. + * MEMBLOCK_RSRV_NOINIT + * * @base: the base phys addr of the region * @size: the size of the region * - * struct pages will not be initialized for reserved memory regions marked with - * %MEMBLOCK_RSRV_NOINIT. + * The struct pages for the reserved regions marked %MEMBLOCK_RSRV_NOINIT will + * not be fully initialized to allow the caller optimize their initialization. + * + * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, setting this flag + * completely bypasses the initialization of struct pages for such region. + * + * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is disabled, struct pages in this + * region will be initialized with default values but won't be marked as + * reserved. * * Return: 0 on success, -errno on failure. */ diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 4b94731305b9..6eed14bff742 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -761,7 +761,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, size = thresholds->primary ? thresholds->primary->size + 1 : 1; /* Allocate memory for new array of thresholds */ - new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL_ACCOUNT); if (!new) { ret = -ENOMEM; goto unlock; @@ -924,7 +924,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, { struct mem_cgroup_eventfd_list *event; - event = kmalloc(sizeof(*event), GFP_KERNEL); + event = kmalloc(sizeof(*event), GFP_KERNEL_ACCOUNT); if (!event) return -ENOMEM; @@ -1087,7 +1087,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, CLASS(fd, cfile)(cfd); - event = kzalloc(sizeof(*event), GFP_KERNEL); + event = kzalloc(sizeof(*event), GFP_KERNEL_ACCOUNT); if (!event) return -ENOMEM; @@ -2053,7 +2053,7 @@ struct cftype mem_cgroup_legacy_files[] = { { .name = "cgroup.event_control", /* XXX: for compat */ .write = memcg_write_event_control, - .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, + .flags = CFTYPE_NO_PREFIX, }, { .name = "swappiness", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dd7fbed5a94..e090f29eb03b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -287,6 +287,7 @@ ino_t page_cgroup_ino(struct page *page) rcu_read_unlock(); return ino; } +EXPORT_SYMBOL_GPL(page_cgroup_ino); /* Subset of node_stat_item for memcg stats */ static const unsigned int memcg_node_stat_items[] = { @@ -2203,7 +2204,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. 
*/ -void mem_cgroup_handle_over_high(gfp_t gfp_mask) +void __mem_cgroup_handle_over_high(gfp_t gfp_mask) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2213,9 +2214,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) struct mem_cgroup *memcg; bool in_retry = false; - if (likely(!nr_pages)) - return; - memcg = get_mem_cgroup_from_mm(current->mm); current->memcg_nr_pages_over_high = 0; @@ -2486,7 +2484,7 @@ done_restock: if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && gfpflags_allow_blocking(gfp_mask)) - mem_cgroup_handle_over_high(gfp_mask); + __mem_cgroup_handle_over_high(gfp_mask); return 0; } @@ -5020,22 +5018,42 @@ out: void mem_cgroup_sk_free(struct sock *sk) { - if (sk->sk_memcg) - css_put(&sk->sk_memcg->css); + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + + if (memcg) + css_put(&memcg->css); +} + +void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) +{ + struct mem_cgroup *memcg; + + if (sk->sk_memcg == newsk->sk_memcg) + return; + + mem_cgroup_sk_free(newsk); + + memcg = mem_cgroup_from_sk(sk); + if (memcg) + css_get(&memcg->css); + + newsk->sk_memcg = sk->sk_memcg; } /** - * mem_cgroup_charge_skmem - charge socket memory - * @memcg: memcg to charge + * mem_cgroup_sk_charge - charge socket memory + * @sk: socket in memcg to charge * @nr_pages: number of pages to charge * @gfp_mask: reclaim mode * * Charges @nr_pages to @memcg. Returns %true if the charge fit within * @memcg's configured limit, %false if it doesn't. */ -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask) +bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, + gfp_t gfp_mask) { + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return memcg1_charge_skmem(memcg, nr_pages, gfp_mask); @@ -5048,12 +5066,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, } /** - * mem_cgroup_uncharge_skmem - uncharge socket memory - * @memcg: memcg to uncharge + * mem_cgroup_sk_uncharge - uncharge socket memory + * @sk: socket in memcg to uncharge * @nr_pages: number of pages to uncharge */ -void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages) { + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { memcg1_uncharge_skmem(memcg, nr_pages); return; diff --git a/mm/memfd.c b/mm/memfd.c index bbe679895ef6..1d109c1acf21 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -385,11 +385,11 @@ static int sanitize_flags(unsigned int *flags_ptr) unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) + if (flags & ~MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. 
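The renamed mem_cgroup_sk_charge()/mem_cgroup_sk_uncharge() helpers shown earlier in the memcontrol.c hunk take the socket directly instead of a memcg pointer. A rough caller-side sketch follows; the surrounding mem_cgroup_sockets_enabled and sk->sk_memcg checks are assumptions modelled on typical net-core usage, not part of this diff:

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_sk_charge(sk, nr_pages, gfp))
		return false;		/* charge did not fit the memcg limit */

	/* ... and on teardown ... */
	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_sk_uncharge(sk, nr_pages);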
*/ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + if (flags & ~(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fc30ca4804bf..3edebb0cda30 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -212,106 +212,34 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo return true; } -#if IS_ENABLED(CONFIG_HWPOISON_INJECT) +static hwpoison_filter_func_t __rcu *hwpoison_filter_func __read_mostly; -u32 hwpoison_filter_enable = 0; -u32 hwpoison_filter_dev_major = ~0U; -u32 hwpoison_filter_dev_minor = ~0U; -u64 hwpoison_filter_flags_mask; -u64 hwpoison_filter_flags_value; -EXPORT_SYMBOL_GPL(hwpoison_filter_enable); -EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); -EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); -EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); -EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); - -static int hwpoison_filter_dev(struct page *p) +void hwpoison_filter_register(hwpoison_filter_func_t *filter) { - struct folio *folio = page_folio(p); - struct address_space *mapping; - dev_t dev; - - if (hwpoison_filter_dev_major == ~0U && - hwpoison_filter_dev_minor == ~0U) - return 0; - - mapping = folio_mapping(folio); - if (mapping == NULL || mapping->host == NULL) - return -EINVAL; - - dev = mapping->host->i_sb->s_dev; - if (hwpoison_filter_dev_major != ~0U && - hwpoison_filter_dev_major != MAJOR(dev)) - return -EINVAL; - if (hwpoison_filter_dev_minor != ~0U && - hwpoison_filter_dev_minor != MINOR(dev)) - return -EINVAL; - - return 0; + rcu_assign_pointer(hwpoison_filter_func, filter); } +EXPORT_SYMBOL_GPL(hwpoison_filter_register); -static int hwpoison_filter_flags(struct page *p) +void hwpoison_filter_unregister(void) { - if (!hwpoison_filter_flags_mask) - return 0; - - if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == - hwpoison_filter_flags_value) - return 0; - else - return -EINVAL; + RCU_INIT_POINTER(hwpoison_filter_func, NULL); + synchronize_rcu(); } +EXPORT_SYMBOL_GPL(hwpoison_filter_unregister); -/* - * This allows stress tests to limit test scope to a collection of tasks - * by putting them under some memcg. This prevents killing unrelated/important - * processes such as /sbin/init. Note that the target task may share clean - * pages with init (eg. libc text), which is harmless. If the target task - * share _dirty_ pages with another task B, the test scheme must make sure B - * is also included in the memcg. At last, due to race conditions this filter - * can only guarantee that the page either belongs to the memcg tasks, or is - * a freed page. 
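The hwpoison_filter_register()/hwpoison_filter_unregister() interface added above replaces these build-time filter knobs with a single RCU-protected callback. A hypothetical test-module filter might look like the following sketch; every name below apart from the register/unregister functions and page_to_pfn() is made up for illustration:

	static unsigned long test_pfn_start, test_pfn_end;	/* hypothetical module params */

	static int test_hwpoison_filter(struct page *p)
	{
		unsigned long pfn = page_to_pfn(p);

		/* A non-zero return asks memory_failure() to skip this page. */
		if (pfn < test_pfn_start || pfn >= test_pfn_end)
			return -EINVAL;
		return 0;
	}

	/* on module init */
	hwpoison_filter_register(test_hwpoison_filter);
	/* on module exit */
	hwpoison_filter_unregister();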
- */ -#ifdef CONFIG_MEMCG -u64 hwpoison_filter_memcg; -EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); -static int hwpoison_filter_task(struct page *p) +static int hwpoison_filter(struct page *p) { - if (!hwpoison_filter_memcg) - return 0; - - if (page_cgroup_ino(p) != hwpoison_filter_memcg) - return -EINVAL; - - return 0; -} -#else -static int hwpoison_filter_task(struct page *p) { return 0; } -#endif - -int hwpoison_filter(struct page *p) -{ - if (!hwpoison_filter_enable) - return 0; - - if (hwpoison_filter_dev(p)) - return -EINVAL; - - if (hwpoison_filter_flags(p)) - return -EINVAL; + int ret = 0; + hwpoison_filter_func_t *filter; - if (hwpoison_filter_task(p)) - return -EINVAL; + rcu_read_lock(); + filter = rcu_dereference(hwpoison_filter_func); + if (filter) + ret = filter(p); + rcu_read_unlock(); - return 0; -} -EXPORT_SYMBOL_GPL(hwpoison_filter); -#else -int hwpoison_filter(struct page *p) -{ - return 0; + return ret; } -#endif /* * Kill all processes that have a poisoned page mapped and then isolate @@ -956,7 +884,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", - [MF_MSG_ALREADY_POISONED] = "already poisoned", + [MF_MSG_ALREADY_POISONED] = "already poisoned page", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1199,7 +1127,7 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p) struct folio *folio = page_folio(p); int ret; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED; folio_unlock(folio); @@ -1349,9 +1277,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(pfn); - - update_per_node_mf_stats(pfn, result); + if (type != MF_MSG_ALREADY_POISONED) { + num_poisoned_pages_inc(pfn); + update_per_node_mf_stats(pfn, result); + } pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -1707,10 +1636,10 @@ static int identify_page_state(unsigned long pfn, struct page *p, * carried out only if the first check can't determine the page status. 
*/ for (ps = error_states;; ps++) - if ((p->flags & ps->mask) == ps->res) + if ((p->flags.f & ps->mask) == ps->res) break; - page_flags |= (p->flags & (1UL << PG_dirty)); + page_flags |= (p->flags.f & (1UL << PG_dirty)); if (!ps->mask) for (ps = error_states;; ps++) @@ -2094,12 +2023,11 @@ retry: *hugetlb = 0; return 0; } else if (res == -EHWPOISON) { - pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { folio = page_folio(p); res = kill_accessing_process(current, folio_pfn(folio), flags); - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); } + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); return res; } else if (res == -EBUSY) { if (!(flags & MF_NO_RETRY)) { @@ -2137,7 +2065,7 @@ retry: return action_result(pfn, MF_MSG_FREE_HUGE, res); } - page_flags = folio->flags; + page_flags = folio->flags.f; if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); @@ -2266,7 +2194,7 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; if (pfn_valid(pfn)) { - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); if (pgmap) { res = memory_failure_dev_pagemap(pfn, flags, @@ -2285,7 +2213,6 @@ try_again: goto unlock_mutex; if (TestSetPageHWPoison(p)) { - pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); @@ -2398,7 +2325,7 @@ try_again: * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = folio->flags; + page_flags = folio->flags.f; /* * __munlock_folio() may clear a writeback folio's LRU flag without @@ -2569,10 +2496,9 @@ int unpoison_memory(unsigned long pfn) static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (!pfn_valid(pfn)) - return -ENXIO; - - p = pfn_to_page(pfn); + p = pfn_to_online_page(pfn); + if (!p) + return -EIO; folio = page_folio(p); mutex_lock(&mf_mutex); @@ -2744,13 +2670,13 @@ static int soft_offline_in_use_page(struct page *page) putback_movable_pages(&pagelist); pr_info("%#lx: %s migration failed %ld, type %pGp\n", - pfn, msg_page[huge], ret, &page->flags); + pfn, msg_page[huge], ret, &page->flags.f); if (ret > 0) ret = -EBUSY; } } else { pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n", - pfn, msg_page[huge], page_count(page), &page->flags); + pfn, msg_page[huge], page_count(page), &page->flags.f); ret = -EBUSY; } return ret; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0382b6942b8b..0ea5c13f10a2 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -942,11 +942,23 @@ static ssize_t demotion_enabled_store(struct kobject *kobj, const char *buf, size_t count) { ssize_t ret; + bool before = numa_demotion_enabled; ret = kstrtobool(buf, &numa_demotion_enabled); if (ret) return ret; + /* + * Reset kswapd_failures statistics. They may no longer be + * valid since the policy for kswapd has changed. + */ + if (before == false && numa_demotion_enabled == true) { + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + atomic_set(&pgdat->kswapd_failures, 0); + } + return count; } diff --git a/mm/memory.c b/mm/memory.c index 0ba4f6b71847..74b45e258323 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) add_mm_counter(mm, i, rss[i]); } -/* - * This function is called to print an error when a bad pte - * is found. 
For example, we might have a PFN-mapped pte in - * a region that doesn't allow it. - * - * The calling function must still handle the error. - */ -static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, struct page *page) +static bool is_bad_page_map_ratelimited(void) { - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud = pud_offset(p4d, addr); - pmd_t *pmd = pmd_offset(pud, addr); - struct address_space *mapping; - pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; @@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; - return; + return true; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", @@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; + return false; +} + +static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr) +{ + unsigned long long pgdv, p4dv, pudv, pmdv; + p4d_t p4d, *p4dp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pgd_t *pgdp; + + /* + * Although this looks like a fully lockless pgtable walk, it is not: + * see locking requirements for print_bad_page_map(). + */ + pgdp = pgd_offset(mm, addr); + pgdv = pgd_val(*pgdp); + + if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) { + pr_alert("pgd:%08llx\n", pgdv); + return; + } + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + p4dv = p4d_val(p4d); + + if (!p4d_present(p4d) || p4d_leaf(p4d)) { + pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv); + return; + } + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + pudv = pud_val(pud); + + if (!pud_present(pud) || pud_leaf(pud)) { + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv); + return; + } + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + pmdv = pmd_val(pmd); + + /* + * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE, + * because the table should already be mapped by the caller and + * doing another map would be bad. print_bad_page_map() should + * already take care of printing the PTE. + */ + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv, + p4dv, pudv, pmdv); +} + +/* + * This function is called to print an error when a bad page table entry (e.g., + * corrupted page table entry) is found. For example, we might have a + * PFN-mapped pte in a region that doesn't allow it. + * + * The calling function must still handle the error. + * + * This function must be called during a proper page table walk, as it will + * re-walk the page table to dump information: the caller MUST prevent page + * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf + * page table lock. + */ +static void print_bad_page_map(struct vm_area_struct *vma, + unsigned long addr, unsigned long long entry, struct page *page, + enum pgtable_level level) +{ + struct address_space *mapping; + pgoff_t index; + + if (is_bad_page_map_ratelimited()) + return; mapping = vma->vm_file ? 
vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, + pgtable_level_to_str(level), entry); + __print_bad_page_map_pgtable(vma->vm_mm, addr); if (page) - dump_page(page, "bad pte"); + dump_page(page, "bad page map"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", @@ -549,18 +611,39 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +#define print_bad_pte(vma, addr, pte, page) \ + print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) -/* - * vm_normal_page -- This function gets the "struct page" associated with a pte. +/** + * __vm_normal_page() - Get the "struct page" associated with a page table entry. + * @vma: The VMA mapping the page table entry. + * @addr: The address where the page table entry is mapped. + * @pfn: The PFN stored in the page table entry. + * @special: Whether the page table entry is marked "special". + * @level: The page table level for error reporting purposes only. + * @entry: The page table entry value for error reporting purposes only. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this - * case, NULL is returned here. "Normal" mappings do have a struct page. + * case, NULL is returned here. "Normal" mappings do have a struct page and + * are ordinarily refcounted. + * + * Page mappings of the shared zero folios are always considered "special", as + * they are not ordinarily refcounted: neither the refcount nor the mapcount + * of these folios is adjusted when mapping them into user page tables. + * Selected page table walkers (such as GUP) can still identify mappings of the + * shared zero folios and work with the underlying "struct page". * - * There are 2 broad cases. Firstly, an architecture may define a pte_special() - * pte bit, in which case this function is trivial. Secondly, an architecture - * may not have a spare pte bit, which requires a more complicated scheme, - * described below. + * There are 2 broad cases. Firstly, an architecture may define a "special" + * page table entry bit, such as pte_special(), in which case this function is + * trivial. Secondly, an architecture may not have a spare page table + * entry bit, which requires a more complicated scheme, described below. + * + * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on + * page table entries that actually map "normal" pages: however, that page + * cannot be looked up through the PFN stored in the page table entry, but + * instead will be looked up through vm_ops->find_normal_page(). So far, this + * only applies to PTEs. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). 
@@ -585,72 +668,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct - * page (that is, those where pfn_valid is true) are refcounted and considered - * normal pages by the VM. The only exception are zeropages, which are - * *never* refcounted. + * page (that is, those where pfn_valid is true, except the shared zero + * folios) are refcounted and considered normal pages by the VM. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. */ -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +static inline struct page *__vm_normal_page(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, bool special, + unsigned long long entry, enum pgtable_level level) { - unsigned long pfn = pte_pfn(pte); - if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { - if (likely(!pte_special(pte))) - goto check_pfn; - if (vma->vm_ops && vma->vm_ops->find_special_page) - return vma->vm_ops->find_special_page(vma, addr); - if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return NULL; - if (is_zero_pfn(pfn)) - return NULL; - - print_bad_pte(vma, addr, pte, NULL); - return NULL; - } - - /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ - - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - if (is_zero_pfn(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) + if (unlikely(special)) { +#ifdef CONFIG_FIND_NORMAL_PAGE + if (vma->vm_ops && vma->vm_ops->find_normal_page) + return vma->vm_ops->find_normal_page(vma, addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - if (!is_cow_mapping(vma->vm_flags)) + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) return NULL; + + print_bad_page_map(vma, addr, entry, NULL, level); + return NULL; } - } + /* + * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table + * mappings (incl. shared zero folios) are marked accordingly. + */ + } else { + if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + /* If it has a "struct page", it's "normal". */ + if (!pfn_valid(pfn)) + return NULL; + } else { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (is_zero_pfn(pfn)) - return NULL; + /* Only CoW'ed anon folios are "normal". */ + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) + return NULL; + } -check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { - print_bad_pte(vma, addr, pte, NULL); + /* Corrupted page table entry. */ + print_bad_page_map(vma, addr, entry, NULL, level); return NULL; } - /* * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. + * For example, VDSO mappings can cause them to exist. 
*/ -out: - VM_WARN_ON_ONCE(is_zero_pfn(pfn)); + VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)); return pfn_to_page(pfn); } +/** + * vm_normal_page() - Get the "struct page" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte), + pte_val(pte), PGTABLE_LEVEL_PTE); +} + +/** + * vm_normal_folio() - Get the "struct folio" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { @@ -662,43 +777,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/** + * vm_normal_page_pmd() - Get the "struct page" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { - unsigned long pfn = pmd_pfn(pmd); - - /* Currently it's only used for huge pfnmaps */ - if (unlikely(pmd_special(pmd))) - return NULL; - - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; - } - } - - if (is_huge_zero_pfn(pfn)) - return NULL; - if (unlikely(pfn > highest_memmap_pfn)) - return NULL; - - /* - * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. - */ -out: - return pfn_to_page(pfn); + return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd), + pmd_val(pmd), PGTABLE_LEVEL_PMD); } +/** + * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. 
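For context, a minimal sketch of how a page-table walker typically consumes vm_normal_page(); this is the generic calling pattern, not code taken from this patch:

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (ptep) {
		pte_t pte = ptep_get(ptep);

		if (pte_present(pte)) {
			struct page *page = vm_normal_page(vma, addr, pte);

			/* page != NULL: "normal", refcounted page. */
			/* page == NULL: "special" mapping (PFN-map, shared zero folio, ...). */
		}
		pte_unmap_unlock(ptep, ptl);
	}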
+ */ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { @@ -708,6 +817,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, return page_folio(page); return NULL; } + +/** + * vm_normal_page_pud() - Get the "struct page" associated with a PUD + * @vma: The VMA mapping the @pud. + * @addr: The address where the @pud is mapped. + * @pud: The PUD. + * + * Get the "struct page" associated with a PUD. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t pud) +{ + return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud), + pud_val(pud), PGTABLE_LEVEL_PUD); +} #endif /** @@ -2138,8 +2266,7 @@ static int validate_page_before_insert(struct vm_area_struct *vma, return -EINVAL; return 0; } - if (folio_test_anon(folio) || folio_test_slab(folio) || - page_has_type(page)) + if (folio_test_anon(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; @@ -4387,8 +4514,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); @@ -4532,9 +4659,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - folio = swap_cache_get_folio(entry, vma, vmf->address); + folio = swap_cache_get_folio(entry); if (folio) - page = folio_file_page(folio, swp_offset(entry)); + swap_update_readahead(folio, vma, vmf->address); swapcache = folio; if (!folio) { @@ -4571,7 +4698,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) memcg1_swapin(entry, nr_pages); - shadow = get_shadow_from_swap_cache(entry); + shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(folio, shadow); @@ -4605,20 +4732,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); - page = folio_file_page(folio, swp_offset(entry)); - } else if (PageHWPoison(page)) { - /* - * hwpoisoned dirty swapcache pages are kept for killing - * owner processes (which may be unknown at hwpoison time) - */ - ret = VM_FAULT_HWPOISON; - goto out_release; } ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; + page = folio_file_page(folio, swp_offset(entry)); if (swapcache) { /* * Make sure folio_free_swap() or swapoff did not release the @@ -4627,10 +4747,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache, we need to check that the page's swap has not * changed. 
*/ - if (unlikely(!folio_test_swapcache(folio) || - page_swap_entry(page).val != entry.val)) + if (unlikely(!folio_matches_swap_entry(folio, entry))) goto out_page; + if (unlikely(PageHWPoison(page))) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ + ret = VM_FAULT_HWPOISON; + goto out_page; + } + /* * KSM sometimes has to copy on read faults, for example, if * folio->index of non-ksm folios would be nonlinear inside the @@ -4935,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -5204,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any - * PMD mappings if THPs are disabled. + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. */ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) @@ -5386,13 +5516,8 @@ fallback: nr_pages = folio_nr_pages(folio); - /* - * Using per-page fault to maintain the uffd semantics, and same - * approach also applies to non shmem/tmpfs faults to avoid - * inflating the RSS of the process. - */ - if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) || - unlikely(needs_fallback)) { + /* Using per-page fault to maintain the uffd semantics */ + if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); @@ -6126,8 +6251,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6161,8 +6285,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f15af712bc3..e9f14de4a9c9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -375,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn) * the section may be 'offline' but 'valid'. Only * get_dev_pagemap() can determine sub-section online status. 
*/ - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_dev_pagemap(pgmap); /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ @@ -955,7 +955,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * effectively unused by the kernel, yet they account to "present pages". * Fortunately, these allocations are comparatively small in relevant setups * (e.g., fraction of system memory). - * b) Some hotplugged memory blocks in virtualized environments, esecially + * b) Some hotplugged memory blocks in virtualized environments, especially * hotplugged by virtio-mem, look like they are completely present, however, * only parts of the memory block are actually currently usable. * "present pages" is an upper limit that can get reached at runtime. As @@ -1815,8 +1815,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; if (folio_contain_hwpoisoned_page(folio)) { - if (WARN_ON(folio_test_lru(folio))) - folio_isolate_lru(folio); + /* + * unmap_poisoned_folio() cannot handle large folios + * in all cases yet. + */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + goto put_folio; + if (folio_test_lru(folio) && !folio_isolate_lru(folio)) + goto put_folio; if (folio_mapped(folio)) { folio_lock(folio); unmap_poisoned_folio(folio, pfn, false); diff --git a/mm/memremap.c b/mm/memremap.c index b0ce0d8254bd..46cb1b0b6f72 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -153,14 +153,14 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, "altmap not supported for multiple ranges\n")) return -EINVAL; - conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start)); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); return -ENOMEM; } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end)); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -275,6 +275,9 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) if (WARN_ONCE(!nr_range, "nr_range must be specified\n")) return ERR_PTR(-EINVAL); + if (WARN_ONCE(pgmap->vmemmap_shift > MAX_FOLIO_ORDER, + "requested folio size unsupported\n")) + return ERR_PTR(-EINVAL); switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: @@ -394,26 +397,12 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map - * @pgmap: optional known pgmap that already has a reference - * - * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap - * is non-NULL but does not cover @pfn the reference to it will be released. */ -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) +struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { + struct dev_pagemap *pgmap; resource_size_t phys = PFN_PHYS(pfn); - /* - * In the cached case we're already holding a live reference. 
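With the cached-@pgmap parameter dropped, the calling convention reduces to the pattern already visible in the callers updated above; a condensed sketch:

	struct dev_pagemap *pgmap;

	pgmap = get_dev_pagemap(pfn);
	if (pgmap) {
		/* pfn belongs to a ZONE_DEVICE range; inspect pgmap->type, pgmap->range, ... */
		put_dev_pagemap(pgmap);
	}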
- */ - if (pgmap) { - if (phys >= pgmap->range.start && phys <= pgmap->range.end) - return pgmap; - put_dev_pagemap(pgmap); - } - - /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref)) diff --git a/mm/migrate.c b/mm/migrate.c index 9e5ef39ce73a..aee61a980374 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -231,18 +231,17 @@ static void putback_movable_ops_page(struct page *page) * src and dst are also released by migration core. These pages will not be * folios in the future, so that must be reworked. * - * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error - * code. + * Returns 0 on success, otherwise a negative error code. */ static int migrate_movable_ops_page(struct page *dst, struct page *src, enum migrate_mode mode) { - int rc = MIGRATEPAGE_SUCCESS; + int rc; VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src); VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src); rc = page_movable_ops(src)->migrate_page(dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) ClearPageMovableOpsIsolated(src); return rc; } @@ -564,10 +563,10 @@ static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; long nr = folio_nr_pages(folio); - long entries, i; if (!mapping) { /* Take off deferred split queue while frozen and memcg set */ @@ -587,15 +586,22 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - return MIGRATEPAGE_SUCCESS; + return 0; } oldzone = folio_zone(folio); newzone = folio_zone(newfolio); - xas_lock_irq(&xas); + if (folio_test_swapcache(folio)) + ci = swap_cluster_get_and_lock_irq(folio); + else + xas_lock_irq(&xas); + if (!folio_ref_freeze(folio, expected_count)) { - xas_unlock_irq(&xas); + if (ci) + swap_cluster_unlock_irq(ci); + else + xas_unlock_irq(&xas); return -EAGAIN; } @@ -616,9 +622,6 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapcache(folio)) { folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); - entries = nr; - } else { - entries = 1; } /* Move dirty while folio refs frozen and newfolio not yet exposed */ @@ -628,11 +631,10 @@ static int __folio_migrate_mapping(struct address_space *mapping, folio_set_dirty(newfolio); } - /* Swap cache still stores N entries instead of a high-order entry */ - for (i = 0; i < entries; i++) { + if (folio_test_swapcache(folio)) + __swap_cache_replace_folio(ci, folio, newfolio); + else xas_store(&xas, newfolio); - xas_next(&xas); - } /* * Drop cache reference from old folio by unfreezing @@ -641,8 +643,11 @@ static int __folio_migrate_mapping(struct address_space *mapping, */ folio_ref_unfreeze(folio, expected_count - nr); - xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ + if (ci) + swap_cluster_unlock(ci); + else + xas_unlock(&xas); /* * If moved to a different zone then also account @@ -688,7 +693,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } local_irq_enable(); - return MIGRATEPAGE_SUCCESS; + return 0; } int folio_migrate_mapping(struct address_space *mapping, @@ -737,7 +742,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, xas_unlock_irq(&xas); - return MIGRATEPAGE_SUCCESS; 
+ return 0; } /* @@ -853,14 +858,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, return rc; rc = __folio_migrate_mapping(mapping, dst, src, expected_count); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (src_private) folio_attach_private(dst, folio_detach_private(src)); folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } /** @@ -967,7 +972,7 @@ recheck_buffers: } rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) goto unlock_buffers; bh = head; @@ -1071,7 +1076,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * * Return value: * < 0 - error code - * MIGRATEPAGE_SUCCESS - success + * 0 - success */ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1099,7 +1104,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, else rc = fallback_migrate_folio(mapping, dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { /* * For pagecache folios, src->mapping must be cleared before src * is freed. Anonymous folios must stay anonymous until freed. @@ -1189,7 +1194,7 @@ static void migrate_folio_done(struct folio *src, static int migrate_folio_unmap(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) + struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; @@ -1198,16 +1203,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, bool locked = false; bool dst_locked = false; - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. */ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. */ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - dst = get_new_folio(src, private); if (!dst) return -ENOMEM; @@ -1297,7 +1292,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (unlikely(page_has_movable_ops(&src->page))) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } /* @@ -1327,7 +1322,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (!folio_mapped(src)) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } out: @@ -1459,7 +1454,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_hugetlb(src); - return MIGRATEPAGE_SUCCESS; + return 0; } dst = get_new_folio(src, private); @@ -1522,8 +1517,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); + remove_migration_ptes(src, !rc ? 
dst : src, 0); unlock_put_anon: folio_unlock(dst); @@ -1532,7 +1526,7 @@ put_anon: if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { move_hugetlb_state(src, dst, reason); put_new_folio = NULL; } @@ -1540,7 +1534,7 @@ put_anon: out_unlock: folio_unlock(src); out: - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1650,7 +1644,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, reason, ret_folios); /* * The rules are: - * Success: hugetlb folio will be put back + * 0: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1667,7 +1661,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, retry++; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; break; default: @@ -1721,7 +1715,7 @@ static void migrate_folios_move(struct list_head *src_folios, reason, ret_folios); /* * The rules are: - * Success: folio will be freed + * 0: folio will be freed * -EAGAIN: stay on the unmap_folios list * Other errno: put on ret_folios list */ @@ -1731,7 +1725,7 @@ static void migrate_folios_move(struct list_head *src_folios, *thp_retry += is_thp; *nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; @@ -1870,14 +1864,27 @@ static int migrate_pages_batch(struct list_head *from, continue; } + /* + * If we are holding the last folio reference, the folio + * was freed from under us, so just drop our reference. + */ + if (likely(!page_has_movable_ops(&folio->page)) && + folio_ref_count(folio) == 1) { + folio_clear_active(folio); + folio_clear_unevictable(folio); + list_del(&folio->lru); + migrate_folio_done(folio, reason); + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + continue; + } + rc = migrate_folio_unmap(get_new_folio, put_new_folio, - private, folio, &dst, mode, reason, - ret_folios); + private, folio, &dst, mode, ret_folios); /* * The rules are: - * Success: folio will be freed - * Unmap: folio will be put on unmap_folios list, - * dst folio put on dst_folios list + * 0: folio will be put on unmap_folios list, + * dst folio put on dst_folios list * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1927,11 +1934,7 @@ static int migrate_pages_batch(struct list_head *from, thp_retry += is_thp; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: - stats->nr_succeeded += nr_pages; - stats->nr_thp_succeeded += is_thp; - break; - case MIGRATEPAGE_UNMAP: + case 0: list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e05e14d6eacd..abd9f6850db6 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) extra_cnt = 1; r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); - if (r != MIGRATEPAGE_SUCCESS) + if (r) src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; else folio_migrate_flags(newfolio, folio); diff --git a/mm/mincore.c b/mm/mincore.c index 10dabefc3acc..8ec4719370e1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -47,6 +47,47 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned 
long addr, return 0; } +static unsigned char mincore_swap(swp_entry_t entry, bool shmem) +{ + struct swap_info_struct *si; + struct folio *folio = NULL; + unsigned char present = 0; + + if (!IS_ENABLED(CONFIG_SWAP)) { + WARN_ON(1); + return 0; + } + + /* + * Shmem mapping may contain swapin error entries, which are + * absent. Page table may contain migration or hwpoison + * entries which are always uptodate. + */ + if (non_swap_entry(entry)) + return !shmem; + + /* + * Shmem mapping lookup is lockless, so we need to grab the swap + * device. mincore page table walk locks the PTL, and the swap + * device is stable, avoid touching the si for better performance. + */ + if (shmem) { + si = get_swap_device(entry); + if (!si) + return 0; + } + folio = swap_cache_get_folio(entry); + if (shmem) + put_swap_device(si); + /* The swap cache space contains either folio, shadow or NULL */ + if (folio && !xa_is_value(folio)) { + present = folio_test_uptodate(folio); + folio_put(folio); + } + + return present; +} + /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, @@ -64,8 +105,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - folio = filemap_get_incore_folio(mapping, index); - if (!IS_ERR(folio)) { + folio = filemap_get_entry(mapping, index); + if (folio) { + if (xa_is_value(folio)) { + if (shmem_mapping(mapping)) + return mincore_swap(radix_to_swp_entry(folio), + true); + else + return 0; + } present = folio_test_uptodate(folio); folio_put(folio); } @@ -143,23 +191,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (i = 0; i < step; i++) vec[i] = 1; } else { /* pte is a swap entry */ - swp_entry_t entry = pte_to_swp_entry(pte); - - if (non_swap_entry(entry)) { - /* - * migration or hwpoison entries are always - * uptodate - */ - *vec = 1; - } else { -#ifdef CONFIG_SWAP - *vec = mincore_page(swap_address_space(entry), - swap_cache_index(entry)); -#else - WARN_ON(1); - *vec = 1; -#endif - } + *vec = mincore_swap(pte_to_swp_entry(pte), false); } vec += step; } diff --git a/mm/mlock.c b/mm/mlock.c index a1d93ad33c6d..bb0776f5ef7c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio) */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } diff --git a/mm/mm_init.c b/mm/mm_init.c index 5c21b3af216b..df614556741a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1091,6 +1091,12 @@ static void __ref memmap_init_compound(struct page *head, unsigned long pfn, end_pfn = head_pfn + nr_pages; unsigned int order = 
pgmap->vmemmap_shift; + /* + * We have to initialize the pages, including setting up page links. + * prep_compound_page() does not take care of that, so instead we + * open-code prep_compound_page() so we can take care of initializing + * the pages in the same go. + */ __SetPageHead(head); for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); @@ -1098,15 +1104,8 @@ static void __ref memmap_init_compound(struct page *head, __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); prep_compound_tail(head, pfn - head_pfn); set_page_count(page, 0); - - /* - * The first tail page stores important compound page info. - * Call prep_compound_head() after the first tail page has - * been initialized, to not have the data overwritten. - */ - if (pfn == head_pfn + 1) - prep_compound_head(head, order); } + prep_compound_head(head, order); } void __ref memmap_init_zone_device(struct zone *zone, diff --git a/mm/mmap.c b/mm/mmap.c index 7306253cc3b5..5fd3b80fda1d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -225,7 +225,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, +bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; @@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { - if (test_bit(MMF_TOPDOWN, &mm->flags)) + if (mm_flags_test(MMF_TOPDOWN, mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm) * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); @@ -1859,14 +1859,14 @@ loop_out: mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index b006cec8e6fe..0a0db5849b8e 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -128,6 +128,95 @@ void vma_mark_detached(struct vm_area_struct *vma) } /* + * Try to read-lock a vma. The function is allowed to occasionally yield false + * locked result to avoid performance overhead, in which case we fall back to + * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. + * + * IMPORTANT: RCU lock must be held upon entering the function, but upon error + * IT IS RELEASED. The caller must handle this correctly. 
+ */ +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + struct mm_struct *other_mm; + int oldcnt; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); + /* + * Check before locking. A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) { + vma = NULL; + goto err; + } + + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); + goto err; + } + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + if (unlikely(vma->vm_mm != mm)) + goto err_unstable; + + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. + * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq + * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). + */ + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); + vma = NULL; + goto err; + } + + return vma; +err: + rcu_read_unlock(); + + return vma; +err_unstable: + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */ + + /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + + return NULL; +} + +/* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the * function returns NULL. @@ -138,11 +227,13 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; - rcu_read_lock(); retry: + rcu_read_lock(); vma = mas_walk(&mas); - if (!vma) + if (!vma) { + rcu_read_unlock(); goto inval; + } vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { @@ -162,18 +253,17 @@ retry: * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ + rcu_read_unlock(); /* Check if the vma we locked is the right one. 
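A caller-side sketch of the contract that lock_vma_under_rcu() keeps across this rework: it returns either a VMA that is read-locked and stable, or NULL when the lookup fails or the VMA is being modified, and a NULL return is the cue to fall back to mmap_lock. The helper names below are hypothetical placeholders, not functions from this diff:

	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma) {
		/* Not found or concurrently modified: take the mmap lock instead. */
		return handle_with_mmap_lock(mm, address);	/* hypothetical helper */
	}

	/* The VMA is read-locked here and cannot be freed or resized under us. */
	handle_under_vma_lock(vma, address);			/* hypothetical helper */

	vma_end_read(vma);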
*/ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { + vma_end_read(vma); + goto inval; + } - rcu_read_unlock(); return vma; -inval_end_read: - vma_end_read(vma); inval: - rcu_read_unlock(); count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } @@ -228,6 +318,7 @@ retry: */ if (PTR_ERR(vma) == -EAGAIN) { /* reset to search from the last address */ + rcu_read_lock(); vma_iter_set(vmi, from_addr); goto retry; } @@ -257,9 +348,9 @@ retry: return vma; fallback_unlock: + rcu_read_unlock(); vma_end_read(vma); fallback: - rcu_read_unlock(); vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); rcu_read_lock(); /* Reinitialize the iterator after re-entering rcu read section */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index b49cc6385f1f..374aa6f021c6 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb) if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; - batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + batch = (void *)__get_free_page(GFP_NOWAIT); if (!batch) return false; @@ -364,7 +364,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { - *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); diff --git a/mm/mmzone.c b/mm/mmzone.c index f9baa8882fbf..0c8f181d9d50 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid) unsigned long old_flags, flags; int last_cpupid; - old_flags = READ_ONCE(folio->flags); + old_flags = READ_ONCE(folio->flags.f); do { flags = old_flags; last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags))); return last_cpupid; } diff --git a/mm/mremap.c b/mm/mremap.c index e618a706aff5..35de0a7b910e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1774,15 +1774,18 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) if (!vrm->new_len) return -EINVAL; - /* Is the new length or address silly? */ - if (vrm->new_len > TASK_SIZE || - vrm->new_addr > TASK_SIZE - vrm->new_len) + /* Is the new length silly? */ + if (vrm->new_len > TASK_SIZE) return -EINVAL; /* Remainder of checks are for cases with specific new_addr. */ if (!vrm_implies_new_addr(vrm)) return 0; + /* Is the new address silly? */ + if (vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; + /* The new address must be page-aligned. 
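Both before and after this reordering, check_mremap_params() expresses "new_addr + new_len must not exceed TASK_SIZE" as new_addr > TASK_SIZE - new_len, which cannot wrap because new_len <= TASK_SIZE has already been established. A small self-contained demonstration of why the subtraction form is the safe one; the TASK_SIZE value here is an arbitrary stand-in, not any particular architecture's limit:

	#include <stdint.h>
	#include <stdio.h>

	#define TASK_SIZE ((uint64_t)1 << 47)	/* stand-in value for illustration */

	int main(void)
	{
		uint64_t new_len  = (uint64_t)1 << 20;		/* already known <= TASK_SIZE */
		uint64_t new_addr = UINT64_MAX - new_len + 10;	/* bogus address, must be rejected */

		/* Naive form: the addition wraps around, so the bad address slips through. */
		int naive_rejects = (new_addr + new_len) > TASK_SIZE;

		/* Form used by check_mremap_params(): no overflow is possible. */
		int safe_rejects = new_addr > TASK_SIZE - new_len;

		printf("naive check rejects: %d\n", naive_rejects);	/* prints 0 */
		printf("safe check rejects:  %d\n", safe_rejects);	/* prints 1 */
		return 0;
	}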
*/ if (offset_in_page(vrm->new_addr)) return -EINVAL; diff --git a/mm/nommu.c b/mm/nommu.c index 8b819fafd57b..c3a23b082adb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -64,7 +64,7 @@ const struct vm_operations_struct generic_file_vm_ops = { */ unsigned int kobjsize(const void *objp) { - struct page *page; + struct folio *folio; /* * If the object we have should not have ksize performed on it, @@ -73,22 +73,22 @@ unsigned int kobjsize(const void *objp) if (!objp || !virt_addr_valid(objp)) return 0; - page = virt_to_head_page(objp); + folio = virt_to_folio(objp); /* * If the allocator sets PageSlab, we know the pointer came from * kmalloc(). */ - if (PageSlab(page)) + if (folio_test_slab(folio)) return ksize(objp); /* - * If it's not a compound page, see if we have a matching VMA + * If it's not a large folio, see if we have a matching VMA * region. This test is intentionally done in reverse order, * so if there's no VMA, we still fall through and hand back - * PAGE_SIZE for 0-order pages. + * PAGE_SIZE for 0-order folios. */ - if (!PageCompound(page)) { + if (!folio_test_large(folio)) { struct vm_area_struct *vma; vma = find_vma(current->mm, (unsigned long)objp); @@ -100,7 +100,7 @@ unsigned int kobjsize(const void *objp) * The ksize() function is only guaranteed to work for pointers * returned by kmalloc(). So handle arbitrary pointers here. */ - return page_size(page); + return folio_size(folio); } void vfree(const void *addr) @@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc_noprof); -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int node) { return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM); } diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c index 9d55679d99ce..703c8fa05048 100644 --- a/mm/numa_emulation.c +++ b/mm/numa_emulation.c @@ -73,7 +73,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, } printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", - nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); + nid, eb->start, eb->end - 1, (eb->end - eb->start) / SZ_1M); return 0; } @@ -264,7 +264,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); if (size < min_size) { pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", - size >> 20, min_size >> 20); + size / SZ_1M, min_size / SZ_1M); size = min_size; } size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index 541a99c4071a..5b009a9cd8b4 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -76,7 +76,7 @@ static int __init numa_alloc_distance(void) for (j = 0; j < cnt; j++) numa_distance[i * cnt + j] = i == j ? 
LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt); return 0; } @@ -427,9 +427,9 @@ static int __init numa_register_meminfo(struct numa_meminfo *mi) unsigned long pfn_align = node_map_pfn_alignment(); if (pfn_align && pfn_align < PAGES_PER_SECTION) { - unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20; + unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M; - unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20; + unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M; pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n", node_align_mb, sect_align_mb); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 25923cfec9c6..c145b0feecc1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/oom_kill.c - * + * * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... @@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_SKIP, &p->mm->flags) || + mm_flags_test(MMF_OOM_SKIP, p->mm) || in_vfork(p)) { task_unlock(p); return LONG_MIN; @@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * any memory is quite low. */ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { - if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm)) goto next; goto abort; } @@ -490,12 +490,12 @@ static bool oom_killer_disabled __read_mostly; * task's threads: if one of those is using this mm then this task was also * using it. */ -bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm) { - struct task_struct *t; + const struct task_struct *t; for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); + const struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm) return t_mm == mm; } @@ -516,7 +516,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; - VMA_ITERATOR(vmi, mm, 0); + MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX); /* * Tell all users of get_user/copy_from_user etc... that the content @@ -524,9 +524,15 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) * should imply barriers already and the reader would hit a page fault * if it stumbled over a reaped memory. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); - for_each_vma(vmi, vma) { + /* + * It might start racing with the dying task and compete for shared + * resources - e.g. page table lock contention has been observed. + * Reduce those races by reaping the oom victim from the other end + * of the address space. + */ + mas_for_each_rev(&mas, vma, 0) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; @@ -583,7 +589,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * under mmap_lock for reading because it serializes against the * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). 
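For readers unfamiliar with the reverse maple-tree walk that __oom_reap_task_mm() switches to above: an MA_STATE() seeded at ULONG_MAX plus mas_for_each_rev() visits VMAs from the top of the address space down, in contrast to the VMA_ITERATOR()/for_each_vma() pair it replaces. A side-by-side sketch of the two idioms, with the loop bodies elided:

	struct vm_area_struct *vma;

	/* Forward walk, lowest address first (what the reaper used before). */
	VMA_ITERATOR(vmi, mm, 0);
	for_each_vma(vmi, vma) {
		/* visit vma */
	}

	/* Reverse walk, highest address first (what this hunk switches to). */
	MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);
	mas_for_each_rev(&mas, vma, 0) {
		/* visit vma, ending at address 0 */
	}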
*/ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { trace_skip_task_reaping(tsk->pid); goto out_unlock; } @@ -619,7 +625,7 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || - test_bit(MMF_OOM_SKIP, &mm->flags)) + mm_flags_test(MMF_OOM_SKIP, mm)) goto done; pr_info("oom_reaper: unable to reap pid:%d (%s)\n", @@ -634,7 +640,7 @@ done: * Hide this mm from OOM killer because it has been either reaped or * somebody can't call mmap_write_unlock(mm). */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); @@ -670,7 +676,7 @@ static void wake_oom_reaper(struct timer_list *timer) unsigned long flags; /* The victim managed to terminate on its own - see exit_mmap */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { put_task_struct(tsk); return; } @@ -695,7 +701,7 @@ static void wake_oom_reaper(struct timer_list *timer) static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm)) return; get_task_struct(tsk); @@ -772,12 +778,12 @@ static void mark_oom_victim(struct task_struct *tsk) mmgrab(tsk->signal->oom_mm); /* - * Make sure that the task is woken up from uninterruptible sleep - * if it is frozen because OOM killer wouldn't be able to free - * any memory and livelock. freezing_slow_path will tell the freezer - * that TIF_MEMDIE tasks should be ignored. + * Make sure that the process is woken up from uninterruptible sleep + * if it is frozen because OOM killer wouldn't be able to free any + * memory and livelock. The freezer will thaw the tasks that are OOM + * victims regardless of the PM freezing and cgroup freezing states. 
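The oom_kill.c and mmap.c hunks in this series repeatedly replace open-coded bitops on mm->flags with mm_flags_*() accessors that take the mm_struct directly. A minimal before/after sketch of that conversion, using only the call shapes visible in these hunks; the accessor definitions themselves live outside this diff:

	/* Before: raw bitops on the flags word. */
	if (test_bit(MMF_OOM_SKIP, &mm->flags))
		return;
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* After: the accessors take the mm_struct itself. */
	if (mm_flags_test(MMF_OOM_SKIP, mm))
		return;
	mm_flags_set(MMF_OOM_SKIP, mm);

	/* Test-and-set gets the same treatment, as in queue_oom_reaper(). */
	if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm))
		return;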
*/ - __thaw_task(tsk); + thaw_process(tsk); atomic_inc(&oom_victims); cred = get_task_cred(tsk); trace_mark_victim(tsk, cred->uid.val); @@ -892,7 +898,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, mm)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -977,7 +983,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; if (is_global_init(p)) { can_oom_reap = false; - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); @@ -1235,7 +1241,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) reap = true; else { /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + if (!mm_flags_test(MMF_OOM_SKIP, mm)) ret = -EINVAL; } task_unlock(p); @@ -1251,7 +1257,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure * possible change in exit_mmap is seen */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) + if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e248d1c3969..5f90fd6a7137 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -38,10 +38,10 @@ #include <linux/sched/rt.h> #include <linux/sched/signal.h> #include <linux/mm_inline.h> +#include <linux/shmem_fs.h> #include <trace/events/writeback.h> #include "internal.h" -#include "swap.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). @@ -2590,36 +2590,6 @@ done: } EXPORT_SYMBOL_GPL(writeback_iter); -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * Return: %0 on success, negative error code otherwise - * - * Note: please use writeback_iter() instead. - */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) -{ - struct folio *folio = NULL; - int error; - - while ((folio = writeback_iter(mapping, wbc, folio, &error))) { - error = writepage(folio, wbc, data); - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } - } - - return error; -} -EXPORT_SYMBOL(write_cache_pages); - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -2735,12 +2705,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, { unsigned long flags; + /* + * Shmem writeback relies on swap, and swap writeback is LRU based, + * not using the dirty mark. + */ + VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping)); + xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? 
*/ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping); - __xa_set_mark(&mapping->i_pages, folio_index(folio), - PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -3019,7 +2995,7 @@ bool __folio_end_writeback(struct folio *folio) xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); - __xa_clear_mark(&mapping->i_pages, folio_index(folio), + __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -3056,7 +3032,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_use_writeback_tags(mapping)) { - XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + XA_STATE(xas, &mapping->i_pages, folio->index); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1d037f97c5f..600d9e981c23 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit) { - return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS; + return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS; } static __always_inline void @@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, #else BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); #endif - BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits)); + BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK); VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitmap = get_pageblock_bitmap(page, pfn); @@ -538,8 +538,7 @@ static void set_pageblock_migratetype(struct page *page, "Use set_pageblock_isolate() for pageblock isolation"); return; } - VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page), - PB_migrate_isolate), + VM_WARN_ONCE(get_pageblock_isolate(page), "Use clear_pageblock_isolate() to unisolate pageblock"); /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ #endif @@ -797,7 +796,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages, if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); - else if (is_migrate_highatomic(migratetype)) + else if (migratetype == MIGRATE_HIGHATOMIC) WRITE_ONCE(zone->nr_free_highatomic, zone->nr_free_highatomic + nr_pages); } @@ -950,7 +949,7 @@ static inline void __free_one_page(struct page *page, bool to_tail; VM_BUG_ON(!zone_is_initialized(zone)); - VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); @@ -1043,7 +1042,7 @@ static inline bool page_expected_state(struct page *page, page->memcg_data | #endif page_pool_page_is_pp(page) | - (page->flags & check_flags))) + (page->flags.f & check_flags))) return false; return true; @@ -1059,7 +1058,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & flags)) { + if (unlikely(page->flags.f & flags)) { if 
(flags == PAGE_FLAGS_CHECK_AT_PREP) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; else @@ -1358,7 +1357,7 @@ __always_inline bool free_pages_prepare(struct page *page, int i; if (compound) { - page[1].flags &= ~PAGE_FLAGS_SECOND; + page[1].flags.f &= ~PAGE_FLAGS_SECOND; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif @@ -1372,7 +1371,7 @@ __always_inline bool free_pages_prepare(struct page *page, continue; } } - (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; } } if (folio_test_anon(folio)) { @@ -1391,7 +1390,7 @@ __always_inline bool free_pages_prepare(struct page *page, } page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); @@ -1521,7 +1520,7 @@ static void add_page_to_zone_llist(struct zone *zone, struct page *page, unsigned int order) { /* Remember the order */ - page->order = order; + page->private = order; /* Add the page to the free list */ llist_add(&page->pcp_llist, &zone->trylock_free_pages); } @@ -1550,7 +1549,7 @@ static void free_one_page(struct zone *zone, struct page *page, llnode = llist_del_all(llhead); llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { - unsigned int p_order = p->order; + unsigned int p_order = p->private; split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); __count_vm_events(PGFREE, 1 << p_order); @@ -2034,7 +2033,13 @@ static int move_freepages_block(struct zone *zone, struct page *page, /* Look for a buddy that straddles start_pfn */ static unsigned long find_large_buddy(unsigned long start_pfn) { - int order = 0; + /* + * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing + * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking + * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy, + * the starting order does not matter. + */ + int order = start_pfn ? 
__ffs(start_pfn) : MAX_PAGE_ORDER; struct page *page; unsigned long pfn = start_pfn; @@ -2058,9 +2063,9 @@ static unsigned long find_large_buddy(unsigned long start_pfn) static inline void toggle_pageblock_isolate(struct page *page, bool isolate) { if (isolate) - set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); + set_pageblock_isolate(page); else - clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); + clear_pageblock_isolate(page); } /** @@ -2085,9 +2090,10 @@ static inline void toggle_pageblock_isolate(struct page *page, bool isolate) static bool __move_freepages_block_isolate(struct zone *zone, struct page *page, bool isolate) { - unsigned long start_pfn, pfn; + unsigned long start_pfn, buddy_pfn; int from_mt; int to_mt; + struct page *buddy; if (isolate == get_pageblock_isolate(page)) { VM_WARN_ONCE(1, "%s a pageblock that is already in that state", @@ -2102,29 +2108,19 @@ static bool __move_freepages_block_isolate(struct zone *zone, if (pageblock_order == MAX_PAGE_ORDER) goto move; - /* We're a tail block in a larger buddy */ - pfn = find_large_buddy(start_pfn); - if (pfn != start_pfn) { - struct page *buddy = pfn_to_page(pfn); + buddy_pfn = find_large_buddy(start_pfn); + buddy = pfn_to_page(buddy_pfn); + /* We're a part of a larger buddy */ + if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) { int order = buddy_order(buddy); del_page_from_free_list(buddy, zone, order, - get_pfnblock_migratetype(buddy, pfn)); + get_pfnblock_migratetype(buddy, buddy_pfn)); toggle_pageblock_isolate(page, isolate); - split_large_buddy(zone, buddy, pfn, order, FPI_NONE); + split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE); return true; } - /* We're the starting block of a larger buddy */ - if (PageBuddy(page) && buddy_order(page) > pageblock_order) { - int order = buddy_order(page); - - del_page_from_free_list(page, zone, order, - get_pfnblock_migratetype(page, pfn)); - toggle_pageblock_isolate(page, isolate); - split_large_buddy(zone, page, pfn, order, FPI_NONE); - return true; - } move: /* Use MIGRATETYPE_MASK to get non-isolate migratetype */ if (isolate) { @@ -2864,14 +2860,29 @@ static void free_frozen_page_commit(struct zone *zone, */ return; } + high = nr_pcp_high(pcp, zone, batch, free_high); - if (pcp->count >= high) { - free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), - pcp, pindex); - if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && - zone_watermark_ok(zone, 0, high_wmark_pages(zone), - ZONE_MOVABLE, 0)) - clear_bit(ZONE_BELOW_HIGH, &zone->flags); + if (pcp->count < high) + return; + + free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), + pcp, pindex); + if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && + zone_watermark_ok(zone, 0, high_wmark_pages(zone), + ZONE_MOVABLE, 0)) { + struct pglist_data *pgdat = zone->zone_pgdat; + clear_bit(ZONE_BELOW_HIGH, &zone->flags); + + /* + * Assume that memory pressure on this node is gone and may be + * in a reclaimable state. If a memory fallback node exists, + * direct reclaim may not have been triggered, causing a + * 'hopeless node' to stay in that state for a while. Let + * kswapd work again by resetting kswapd_failures. 
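The new starting-order logic in find_large_buddy() earlier in this hunk rests on an alignment fact: a buddy block of order o starts on a pfn aligned to 1 << o, so a block that starts strictly below start_pfn yet still contains it must have order greater than __ffs(start_pfn). A small self-contained brute-force check of that property, independent of the buddy allocator itself:

	#include <assert.h>
	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	int main(void)
	{
		for (unsigned long pfn = 1; pfn < 4096; pfn++) {
			/* ffs() is 1-based; low_bit matches the kernel's __ffs(pfn). */
			unsigned int low_bit = (unsigned int)ffs((int)pfn) - 1;

			for (unsigned int order = 0; order <= low_bit; order++) {
				unsigned long start = pfn & ~((1UL << order) - 1);

				/* A block of this order containing pfn starts exactly at pfn. */
				assert(start == pfn);
			}
		}
		printf("blocks of order <= __ffs(pfn) cannot start below pfn\n");
		return 0;
	}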
+ */ + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES && + next_memory_node(pgdat->node_id) < MAX_NUMNODES) + atomic_set(&pgdat->kswapd_failures, 0); } } @@ -3724,6 +3735,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct pglist_data *last_pgdat = NULL; bool last_pgdat_dirty_ok = false; bool no_fallback; + bool skip_kswapd_nodes = nr_online_nodes > 1; + bool skipped_kswapd_nodes = false; retry: /* @@ -3786,6 +3799,19 @@ retry: } } + /* + * If kswapd is already active on a node, keep looking + * for other nodes that might be idle. This can happen + * if another process has NUMA bindings and is causing + * kswapd wakeups on only some nodes. Avoid accidental + * "node_reclaim_mode"-like behavior in this case. + */ + if (skip_kswapd_nodes && + !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) { + skipped_kswapd_nodes = true; + continue; + } + cond_accept_memory(zone, order, alloc_flags); /* @@ -3878,6 +3904,15 @@ try_this_zone: } /* + * If we skipped over nodes with active kswapds and found no + * idle nodes, retry and place anywhere the watermarks permit. + */ + if (skip_kswapd_nodes && skipped_kswapd_nodes) { + skip_kswapd_nodes = false; + goto retry; + } + + /* * It's possible on a UMA machine to get through all zones that are * fragmented. If avoiding fragmentation, reset and try again. */ @@ -4182,7 +4217,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) @@ -4408,7 +4443,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) if (!(gfp_mask & __GFP_NOMEMALLOC)) { alloc_flags |= ALLOC_NON_BLOCK; - if (order > 0) + if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE)) alloc_flags |= ALLOC_HIGHATOMIC; } @@ -5229,9 +5264,16 @@ static void ___free_pages(struct page *page, unsigned int order, __free_frozen_pages(page, order, fpi_flags); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); - while (order-- > 0) + while (order-- > 0) { + /* + * The "tail" pages of this non-compound high-order + * page will have no code tags, so to avoid warnings + * mark them as empty. + */ + clear_page_tag_ref(page + (1 << order)); __free_frozen_pages(page + (1 << order), order, fpi_flags); + } } } @@ -5270,6 +5312,15 @@ void free_pages_nolock(struct page *page, unsigned int order) ___free_pages(page, order, FPI_TRYLOCK); } +/** + * free_pages - Free pages allocated with __get_free_pages(). + * @addr: The virtual address tied to a page returned from __get_free_pages(). + * @order: The order of the allocation. + * + * This function behaves the same as __free_pages(). Use this function + * to free pages when you only have a valid virtual address. If you have + * the page, call __free_pages() instead. 
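The kernel-doc added just above describes the split between the two free paths. A short fragment of the pairing it refers to: __get_free_pages() hands back a virtual address and is matched by free_pages(), while alloc_pages() hands back a struct page and is matched by __free_pages(). The GFP flags and order are arbitrary example values:

	/* Address-based pair, as documented for free_pages(). */
	unsigned long addr = __get_free_pages(GFP_KERNEL, 1);

	if (addr)
		free_pages(addr, 1);

	/* struct page based pair. */
	struct page *page = alloc_pages(GFP_KERNEL, 1);

	if (page)
		__free_pages(page, 1);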
+ */ void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { @@ -5946,7 +5997,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta pcp->high_min = BOOT_PAGESET_HIGH; pcp->high_max = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; - pcp->free_count = 0; } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, @@ -6236,16 +6286,13 @@ static void calculate_totalreserve_pages(void) unsigned long managed_pages = zone_managed_pages(zone); /* Find valid and maximum lowmem_reserve in the zone */ - for (j = i; j < MAX_NR_ZONES; j++) { - if (zone->lowmem_reserve[j] > max) - max = zone->lowmem_reserve[j]; - } + for (j = i; j < MAX_NR_ZONES; j++) + max = max(max, zone->lowmem_reserve[j]); /* we treat the high watermark as reserved pages. */ max += high_wmark_pages(zone); - if (max > managed_pages) - max = managed_pages; + max = min_t(unsigned long, max, managed_pages); pgdat->totalreserve_pages += max; @@ -6837,6 +6884,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) int alloc_contig_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask) { + const unsigned int order = ilog2(end - start); unsigned long outer_start, outer_end; int ret = 0; @@ -6854,6 +6902,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, PB_ISOLATE_MODE_CMA_ALLOC : PB_ISOLATE_MODE_OTHER; + /* + * In contrast to the buddy, we allow for orders here that exceed + * MAX_PAGE_ORDER, so we must manually make sure that we are not + * exceeding the maximum folio order. + */ + if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER)) + return -EINVAL; + gfp_mask = current_gfp_context(gfp_mask); if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) return -EINVAL; @@ -6951,7 +7007,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, free_contig_range(end, outer_end - end); } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); - int order = ilog2(end - start); check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); @@ -7478,22 +7533,7 @@ static bool __free_unaccepted(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ -/** - * alloc_pages_nolock - opportunistic reentrant allocation from any context - * @nid: node to allocate from - * @order: allocation order size - * - * Allocates pages of a given order from the given node. This is safe to - * call from any context (from atomic, NMI, and also reentrant - * allocator -> tracepoint -> alloc_pages_nolock_noprof). - * Allocation is best effort and to be expected to fail easily so nobody should - * rely on the success. Failures are not reported via warn_alloc(). - * See always fail conditions below. - * - * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. - * It means ENOMEM. There is no reason to call it again and expect !NULL. - */ -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) +struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) { /* * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. @@ -7515,12 +7555,13 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) * specify it here to highlight that alloc_pages_nolock() * doesn't want to deplete reserves. 
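With the rework in this hunk, alloc_pages_nolock() grows a leading gfp_flags argument; per the kernel-doc that follows, only __GFP_ACCOUNT (or none) is accepted there, and a NULL return means ENOMEM with no point in retrying. A sketch of a caller under the new signature, assuming the usual alloc_hooks wrapper naming (alloc_pages_nolock() for alloc_pages_nolock_noprof()) and pairing it with free_pages_nolock() as elsewhere in this file:

	struct page *page;

	/* Safe from any context: no locks taken, no reclaim, no warn_alloc(). */
	page = alloc_pages_nolock(__GFP_ACCOUNT, NUMA_NO_NODE, 0);
	if (!page)
		return -ENOMEM;		/* best effort only, do not retry */

	/* ... use page_address(page) ... */

	free_pages_nolock(page, 0);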
*/ - gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC - | __GFP_ACCOUNT; + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP + | gfp_flags; unsigned int alloc_flags = ALLOC_TRYLOCK; struct alloc_context ac = { }; struct page *page; + VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT); /* * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is * unsafe in NMI. If spin_trylock() is called from hard IRQ the current @@ -7555,15 +7596,38 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ - if (page) - set_page_refcounted(page); - - if (memcg_kmem_online() && page && + if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) && unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { - free_pages_nolock(page, order); + __free_frozen_pages(page, order, FPI_TRYLOCK); page = NULL; } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); kmsan_alloc_page(page, order, alloc_gfp); return page; } +/** + * alloc_pages_nolock - opportunistic reentrant allocation from any context + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. + * @nid: node to allocate from + * @order: allocation order size + * + * Allocates pages of a given order from the given node. This is safe to + * call from any context (from atomic, NMI, and also reentrant + * allocator -> tracepoint -> alloc_pages_nolock_noprof). + * Allocation is best effort and to be expected to fail easily so nobody should + * rely on the success. Failures are not reported via warn_alloc(). + * See always fail conditions below. + * + * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. + * It means ENOMEM. There is no reason to call it again and expect !NULL. + */ +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) +{ + struct page *page; + + page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order); + if (page) + set_page_refcounted(page); + return page; +} +EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof); diff --git a/mm/page_io.c b/mm/page_io.c index a2056a5ecb13..3c342db77ce3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -204,7 +204,7 @@ static bool is_folio_zero_filled(struct folio *folio) static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); swp_entry_t entry; unsigned int i; @@ -223,7 +223,7 @@ static void swap_zeromap_folio_set(struct folio *folio) static void swap_zeromap_folio_clear(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); swp_entry_t entry; unsigned int i; @@ -374,7 +374,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_iocb *sio = swap_plug ? 
*swap_plug : NULL; - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); @@ -446,7 +446,7 @@ static void swap_writepage_bdev_async(struct folio *folio, void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* @@ -537,7 +537,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_iocb *sio = NULL; loff_t pos = swap_dev_pos(folio->swap); @@ -608,7 +608,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO; bool workingset = folio_test_workingset(folio); unsigned long pflags; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e981a1a292d2..c498a91b6706 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -309,6 +309,7 @@ next_pte: } pte_unmap(pvmw->pte); pvmw->pte = NULL; + pvmw->flags |= PVMW_PGTABLE_CROSSED; goto restart; } pvmw->pte++; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d..9f91cf85a5be 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -606,10 +606,32 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { - struct mm_struct *mm = &init_mm; + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + mmap_assert_locked(&init_mm); + + return walk_kernel_page_table_range_lockless(start, end, ops, pgd, + private); +} + +/* + * Use this function to walk the kernel page tables locklessly. It should be + * guaranteed that the caller has exclusive access over the range they are + * operating on - that there should be no concurrent access, for example, + * changing permissions for vmalloc objects. + */ +int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ struct mm_walk walk = { .ops = ops, - .mm = mm, + .mm = &init_mm, .pgd = pgd, .private = private, .no_vma = true @@ -620,16 +642,6 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end, if (!check_ops_valid(ops)) return -EINVAL; - /* - * Kernel intermediate page tables are usually not freed, so the mmap - * read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. 
The caller should take - * other actions to prevent this race. - */ - mmap_assert_locked(mm); - return walk_pgd_range(start, end, &walk); } @@ -902,23 +914,23 @@ struct folio *folio_walk_start(struct folio_walk *fw, fw->pudp = pudp; fw->pud = pud; - /* - * TODO: FW_MIGRATION support for PUD migration entries - * once there are relevant users. - */ - if (!pud_present(pud) || pud_special(pud)) { + if (pud_none(pud)) { spin_unlock(ptl); goto not_found; - } else if (!pud_leaf(pud)) { + } else if (pud_present(pud) && !pud_leaf(pud)) { spin_unlock(ptl); goto pmd_table; + } else if (pud_present(pud)) { + page = vm_normal_page_pud(vma, addr, pud); + if (page) + goto found; } /* - * TODO: vm_normal_page_pud() will be handy once we want to - * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. + * TODO: FW_MIGRATION support for PUD migration entries + * once there are relevant users. */ - page = pud_page(pud); - goto found; + spin_unlock(ptl); + goto not_found; } pmd_table: @@ -1004,7 +1016,7 @@ not_found: found: if (expose_page) /* Note: Offset from the mapped page, not the folio start. */ - fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); + fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT); else fw->page = NULL; fw->ptl = ptl; diff --git a/mm/percpu-km.c b/mm/percpu-km.c index fe31aa19db81..4efa74a495cb 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) } for (i = 0; i < nr_pages; i++) - pcpu_set_page_chunk(nth_page(pages, i), chunk); + pcpu_set_page_chunk(pages + i, chunk); chunk->data = pages; chunk->base_addr = page_address(pages); diff --git a/mm/percpu.c b/mm/percpu.c index d9cbaee92b60..81462ce5866e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; - static int warn_limit = 10; + static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; @@ -1904,13 +1904,17 @@ fail_unlock: fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); - if (do_warn && warn_limit) { - pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); - if (!is_atomic) - dump_stack(); - if (!--warn_limit) - pr_info("limit reached, disable warning\n"); + if (do_warn) { + int remaining = atomic_dec_if_positive(&warn_limit); + + if (remaining >= 0) { + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); + if (!is_atomic) + dump_stack(); + if (remaining == 0) + pr_info("limit reached, disable warning\n"); + } } if (is_atomic) { @@ -3108,7 +3112,7 @@ out_free: #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK -#include <asm/pgalloc.h> +#include <linux/pgalloc.h> #ifndef P4D_TABLE_SIZE #define P4D_TABLE_SIZE PAGE_SIZE @@ -3134,13 +3138,13 @@ void __init __weak pcpu_populate_pte(unsigned long addr) if (pgd_none(*pgd)) { p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - pgd_populate(&init_mm, pgd, p4d); + pgd_populate_kernel(addr, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - p4d_populate(&init_mm, p4d, pud); + p4d_populate_kernel(addr, p4d, pud); } pud = pud_offset(p4d, addr); diff --git a/mm/readahead.c b/mm/readahead.c index 406756d34309..3a4b5d58eeb6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -129,6 
+129,9 @@ #include <linux/fadvise.h> #include <linux/sched/mm.h> +#define CREATE_TRACE_POINTS +#include <trace/events/readahead.h> + #include "internal.h" /* @@ -225,6 +228,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ unsigned int nofs = memalloc_nofs_save(); + trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read, + lookahead_size); filemap_invalidate_lock_shared(mapping); index = mapping_align_index(mapping, index); @@ -470,6 +475,7 @@ void page_cache_ra_order(struct readahead_control *ractl, gfp_t gfp = readahead_gfp_mask(mapping); unsigned int new_order = ra->order; + trace_page_cache_ra_order(mapping->host, start, ra); if (!mapping_large_folio_support(mapping)) { ra->order = 0; goto fallback; @@ -554,6 +560,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, unsigned long max_pages, contig_count; pgoff_t prev_index, miss; + trace_page_cache_sync_ra(ractl->mapping->host, index, ra, req_count); /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced @@ -638,6 +645,7 @@ void page_cache_async_ra(struct readahead_control *ractl, if (folio_test_writeback(folio)) return; + trace_page_cache_async_ra(ractl->mapping->host, index, ra, req_count); folio_clear_readahead(folio); if (blk_cgroup_congested()) diff --git a/mm/rmap.c b/mm/rmap.c index 568198e9efc2..ac4f783d6ec2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -79,7 +79,6 @@ #include <asm/tlbflush.h> #define CREATE_TRACE_POINTS -#include <trace/events/tlb.h> #include <trace/events/migrate.h> #include "internal.h" @@ -285,7 +284,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; - avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); + avc = anon_vma_chain_alloc(GFP_NOWAIT); if (unlikely(!avc)) { unlock_anon_vma_root(root); root = NULL; @@ -851,34 +850,34 @@ static bool folio_referenced_one(struct folio *folio, { struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); - int referenced = 0; - unsigned long start = address, ptes = 0; + int ptes = 0, referenced = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; if (vma->vm_flags & VM_LOCKED) { - if (!folio_test_large(folio) || !pvmw.pte) { - /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma); - page_vma_mapped_walk_done(&pvmw); - pra->vm_flags |= VM_LOCKED; - return false; /* To break the loop */ - } + ptes++; + pra->mapcount--; + + /* Only mlock fully mapped pages */ + if (pvmw.pte && ptes != pvmw.nr_pages) + continue; + /* - * For large folio fully mapped to VMA, will - * be handled after the pvmw loop. + * All PTEs must be protected by page table lock in + * order to mlock the page. * - * For large folio cross VMA boundaries, it's - * expected to be picked by page reclaim. But - * should skip reference of pages which are in - * the range of VM_LOCKED vma. As page reclaim - * should just count the reference of pages out - * the range of VM_LOCKED vma. + * If page table boundary has been cross, current ptl + * only protect part of ptes. 
*/ - ptes++; - pra->mapcount--; - continue; + if (pvmw.flags & PVMW_PGTABLE_CROSSED) + continue; + + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma); + page_vma_mapped_walk_done(&pvmw); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ } /* @@ -914,23 +913,6 @@ static bool folio_referenced_one(struct folio *folio, pra->mapcount--; } - if ((vma->vm_flags & VM_LOCKED) && - folio_test_large(folio) && - folio_within_vma(folio, vma)) { - unsigned long s_align, e_align; - - s_align = ALIGN_DOWN(start, PMD_SIZE); - e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE); - - /* folio doesn't cross page table boundary and fully mapped */ - if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) { - /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma); - pra->vm_flags |= VM_LOCKED; - return false; /* To break the loop */ - } - } - if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) @@ -1241,18 +1223,40 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -static __always_inline unsigned int __folio_add_rmap(struct folio *folio, +static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) +{ + int idx; + + if (nr) { + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; + __lruvec_stat_mod_folio(folio, idx, nr); + } + if (nr_pmdmapped) { + if (folio_test_anon(folio)) { + idx = NR_ANON_THPS; + __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + } else { + /* NR_*_PMDMAPPED are not maintained per-memcg */ + idx = folio_test_swapbacked(folio) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; + __mod_node_page_state(folio_pgdat(folio), idx, + nr_pmdmapped); + } + } +} + +static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level, int *nr_pmdmapped) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; - int first = 0, nr = 0; + int first = 0, nr = 0, nr_pmdmapped = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; @@ -1278,12 +1282,12 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, folio_add_large_mapcount(folio, orig_nr_pages, vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { - if (level == RMAP_LEVEL_PMD && first) - *nr_pmdmapped = folio_large_nr_pages(folio); + if (level == PGTABLE_LEVEL_PMD && first) + nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) /* Was completely unmapped. */ @@ -1301,8 +1305,8 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, * We only track PMD mappings of PMD-sized * folios separately. */ - if (level == RMAP_LEVEL_PMD) - *nr_pmdmapped = nr_pages; + if (level == PGTABLE_LEVEL_PMD) + nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? 
*/ if (unlikely(nr < 0)) @@ -1314,8 +1318,10 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, } folio_inc_large_mapcount(folio, vma); break; + default: + BUILD_BUG(); } - return nr; + __folio_mod_stat(folio, nr, nr_pmdmapped); } /** @@ -1403,59 +1409,37 @@ static void __page_check_anon_rmap(const struct folio *folio, page); } -static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) -{ - int idx; - - if (nr) { - idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; - __lruvec_stat_mod_folio(folio, idx, nr); - } - if (nr_pmdmapped) { - if (folio_test_anon(folio)) { - idx = NR_ANON_THPS; - __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); - } else { - /* NR_*_PMDMAPPED are not maintained per-memcg */ - idx = folio_test_swapbacked(folio) ? - NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; - __mod_node_page_state(folio_pgdat(folio), idx, - nr_pmdmapped); - } - } -} - static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - unsigned long address, rmap_t flags, enum rmap_level level) + unsigned long address, rmap_t flags, enum pgtable_level level) { - int i, nr, nr_pmdmapped = 0; + int i; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); - nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped); + __folio_add_rmap(folio, page, nr_pages, vma, level); if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); - __folio_mod_stat(folio, nr, nr_pmdmapped); - if (flags & RMAP_EXCLUSIVE) { switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: SetPageAnonExclusive(page); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Keep the compiler happy, we don't support anonymous * PUD mappings. */ WARN_ON_ONCE(1); break; + default: + BUILD_BUG(); } } @@ -1479,12 +1463,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, } /* - * For large folio, only mlock it if it's fully mapped to VMA. It's - * not easy to check whether the large folio is fully mapped to VMA - * here. Only mlock normal 4K folio and leave page reclaim to handle - * large folio. + * Only mlock it if the folio is fully mapped to the VMA. + * + * Partially mapped folios can be split on reclaim and part outside + * of mlocked VMA can be evicted or freed. 
*/ - if (!folio_test_large(folio)) + if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } @@ -1509,7 +1493,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -1530,7 +1514,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1611,17 +1595,19 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { - int nr, nr_pmdmapped = 0; - VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); - nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped); - __folio_mod_stat(folio, nr, nr_pmdmapped); + __folio_add_rmap(folio, page, nr_pages, vma, level); - /* See comments in folio_add_anon_rmap_*() */ - if (!folio_test_large(folio)) + /* + * Only mlock it if the folio is fully mapped to the VMA. + * + * Partially mapped folios can be split on reclaim and part outside + * of mlocked VMA can be evicted or freed. + */ + if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } @@ -1639,7 +1625,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1656,7 +1642,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1677,7 +1663,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); + __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif @@ -1685,7 +1671,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; @@ -1694,7 +1680,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; @@ -1704,7 +1690,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = folio_sub_return_large_mapcount(folio, nr_pages, vma); if (!nr) { /* Now completely unmapped. 
*/ - nr = folio_nr_pages(folio); + nr = folio_large_nr_pages(folio); } else { partially_mapped = nr < folio_large_nr_pages(folio) && !folio_entire_mapcount(folio); @@ -1724,11 +1710,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && atomic_read(mapped); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { last = atomic_add_negative(-1, &folio->_entire_mapcount); - if (level == RMAP_LEVEL_PMD && last) + if (level == PGTABLE_LEVEL_PMD && last) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_dec_return_large_mapcount(folio, vma); if (!nr) { @@ -1748,9 +1734,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); - if (level == RMAP_LEVEL_PMD) + if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; - nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); + nr = nr_pages - nr; /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; @@ -1762,6 +1748,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && nr < nr_pmdmapped; break; + default: + BUILD_BUG(); } /* @@ -1801,7 +1789,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1818,7 +1806,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1839,7 +1827,7 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); + __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif @@ -1887,6 +1875,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long nr_pages = 1, end_addr; unsigned long pfn; unsigned long hsz = 0; + int ptes = 0; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1927,10 +1916,34 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + ptes++; + + /* + * Set 'ret' to indicate the page cannot be unmapped. + * + * Do not jump to walk_abort immediately as additional + * iteration might be required to detect fully mapped + * folio an mlock it. + */ + ret = false; + + /* Only mlock fully mapped pages */ + if (pvmw.pte && ptes != pvmw.nr_pages) + continue; + + /* + * All PTEs must be protected by page table lock in + * order to mlock the page. + * + * If page table boundary has been cross, current ptl + * only protect part of ptes. 
+ */ + if (pvmw.flags & PVMW_PGTABLE_CROSSED) + goto walk_done; + /* Restore the mlock which got missed */ - if (!folio_test_large(folio)) - mlock_vma_folio(folio, vma); - goto walk_abort; + mlock_vma_folio(folio, vma); + goto walk_done; } if (!pvmw.pte) { diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..b9081b817d28 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -275,18 +275,18 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; -bool shmem_mapping(struct address_space *mapping) +bool shmem_mapping(const struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); -bool vma_is_anon_shmem(struct vm_area_struct *vma) +bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } -bool vma_is_shmem(struct vm_area_struct *vma) +bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } @@ -573,42 +573,6 @@ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; -/** - * shmem_mapping_size_orders - Get allowable folio orders for the given file size. - * @mapping: Target address_space. - * @index: The page index. - * @write_end: end of a write, could extend inode size. - * - * This returns huge orders for folios (when supported) based on the file size - * which the mapping currently allows at the given index. The index is relevant - * due to alignment considerations the mapping might have. The returned order - * may be less than the size passed. - * - * Return: The orders. - */ -static inline unsigned int -shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end) -{ - unsigned int order; - size_t size; - - if (!mapping_large_folio_support(mapping) || !write_end) - return 0; - - /* Calculate the write size based on the write_end */ - size = write_end - (index << PAGE_SHIFT); - order = filemap_get_order(size); - if (!order) - return 0; - - /* If we're not aligned, allocate a smaller folio */ - if (index & ((1UL << order) - 1)) - order = __ffs(index); - - order = min_t(size_t, order, MAX_PAGECACHE_ORDER); - return order > 0 ? BIT(order + 1) - 1 : 0; -} - static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, loff_t write_end) @@ -655,22 +619,21 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index * For tmpfs mmap()'s huge order, we still use PMD-sized order to * allocate huge pages due to lack of a write size hint. * - * Otherwise, tmpfs will allow getting a highest order hint based on - * the size of write and fallocate paths, then will try each allowable - * huge orders. + * For tmpfs with 'huge=always' or 'huge=within_size' mount option, + * we will always try PMD-sized order first. If that failed, it will + * fall back to small large folios. 
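The rewritten comment in shmem_huge_global_enabled() above drops the old write-size heuristic: with 'huge=always' or 'huge=within_size' tmpfs now starts from the full set of file THP orders and simply falls back to smaller large folios when a big allocation fails, as the switch statement that follows shows. A toy, hedged sketch of walking such an order bitmask from the largest order downwards; THP_ORDERS_ALL_FILE_DEFAULT is modelled here as a plain bitmask and try_alloc() is a stand-in, not a kernel API:

#include <stdbool.h>
#include <stdio.h>

#define PMD_ORDER 9
/* orders 1..PMD_ORDER allowed, order 0 excluded; illustrative values only */
#define ORDERS_ALL_FILE_DEFAULT (((1u << (PMD_ORDER + 1)) - 1) & ~1u)

/* Pretend allocations above max_free_order fail due to fragmentation. */
static bool try_alloc(int order, int max_free_order)
{
        return order <= max_free_order;
}

static int alloc_with_fallback(unsigned int orders, int max_free_order)
{
        for (int order = PMD_ORDER; order >= 0; order--) {
                if (!(orders & (1u << order)))
                        continue;
                if (try_alloc(order, max_free_order))
                        return order;       /* highest order that worked */
        }
        return -1;
}

int main(void)
{
        printf("got order %d\n", alloc_with_fallback(ORDERS_ALL_FILE_DEFAULT, 4));
        printf("got order %d\n", alloc_with_fallback(ORDERS_ALL_FILE_DEFAULT, 9));
        return 0;
}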
*/ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: if (vma) return maybe_pmd_order; - return shmem_mapping_size_orders(inode->i_mapping, index, write_end); + return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: if (vma) within_size_orders = maybe_pmd_order; else - within_size_orders = shmem_mapping_size_orders(inode->i_mapping, - index, write_end); + within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT; within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, index, write_end); @@ -1006,15 +969,15 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; @@ -1698,13 +1661,13 @@ try_split: } /* - * The delete_from_swap_cache() below could be left for + * The swap_cache_del_folio() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer. */ - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); goto redirty; } if (nr_pages > 1) @@ -1817,7 +1780,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, @@ -2082,7 +2045,7 @@ retry: new->swap = entry; memcg1_swapin(entry, nr_pages); - shadow = get_shadow_from_swap_cache(entry); + shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(new, shadow); folio_add_lru(new); @@ -2120,13 +2083,11 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { + struct swap_cluster_info *ci; struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; - struct address_space *swap_mapping = swap_address_space(entry); - pgoff_t swap_index = swap_cache_index(entry); - XA_STATE(xas, &swap_mapping->i_pages, swap_index); int nr_pages = folio_nr_pages(old); - int error = 0, i; + int error = 0; /* * We have arrived here because our zones are constrained, so don't @@ -2155,38 +2116,15 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* Swap cache still stores N entries instead of a high-order entry */ - xa_lock_irq(&swap_mapping->i_pages); - for (i = 0; i < nr_pages; i++) { - void *item = xas_load(&xas); + ci = swap_cluster_get_and_lock_irq(old); + __swap_cache_replace_folio(ci, old, new); + mem_cgroup_replace_folio(old, new); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); + swap_cluster_unlock_irq(ci); - if (item != old) { - error = -ENOENT; - break; - } - - xas_store(&xas, new); - xas_next(&xas); - } - if (!error) { - mem_cgroup_replace_folio(old, new); - shmem_update_stats(new, nr_pages); - shmem_update_stats(old, -nr_pages); - } - 
xa_unlock_irq(&swap_mapping->i_pages); - - if (unlikely(error)) { - /* - * Is this possible? I think not, now that our callers - * check both the swapcache flag and folio->private - * after getting the folio lock; but be defensive. - * Reverse old to newpage for clear and free. - */ - old = new; - } else { - folio_add_lru(new); - *foliop = new; - } + folio_add_lru(new); + *foliop = new; folio_clear_swapcache(old); old->private = NULL; @@ -2220,7 +2158,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); if (!skip_swapcache) - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) @@ -2235,7 +2173,7 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index, { struct address_space *mapping = inode->i_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); - int split_order = 0, entry_order; + int split_order = 0; int i; /* Convert user data gfp flags to xarray node gfp flags */ @@ -2253,15 +2191,12 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index, goto unlock; } - entry_order = xas_get_order(&xas); - - if (!entry_order) + cur_order = xas_get_order(&xas); + if (!cur_order) goto unlock; /* Try to split large swap entry in pagecache */ - cur_order = entry_order; - swap_index = round_down(index, 1 << entry_order); - + swap_index = round_down(index, 1 << cur_order); split_order = xas_try_split_min_order(cur_order); while (cur_order > 0) { @@ -2354,7 +2289,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* Look it up and read it in.. */ - folio = swap_cache_get_folio(swap, NULL, 0); + folio = swap_cache_get_folio(swap); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ @@ -2379,6 +2314,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + } else { + swap_update_readahead(folio, NULL, 0); } if (order > folio_order(folio)) { @@ -2430,7 +2367,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } folio_wait_writeback(folio); - nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the @@ -2458,7 +2394,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio->swap.val = 0; swapcache_clear(si, swap, nr_pages); } else { - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); @@ -5081,7 +5017,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC | SB_I_VERSION; + sb->s_flags |= SB_NOSEC; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { @@ -5341,7 +5277,7 @@ static const struct super_operations shmem_ops = { .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, @@ -5385,6 +5321,9 @@ int shmem_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; +#ifdef CONFIG_TMPFS + fc->sb_flags |= SB_I_VERSION; +#endif return 
0; } diff --git a/mm/show_mem.c b/mm/show_mem.c index 41999e94a56d..3a4b5207635d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -278,7 +278,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES), + str_yes_no(atomic_read(&pgdat->kswapd_failures) >= + MAX_RECLAIM_RETRIES), K(node_page_state(pgdat, NR_BALLOON_PAGES))); } @@ -310,6 +311,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " inactive_file:%lukB" " unevictable:%lukB" " writepending:%lukB" + " zspages:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" @@ -332,6 +334,11 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), +#if IS_ENABLED(CONFIG_ZSMALLOC) + K(zone_page_state(zone, NR_ZSPAGES)), +#else + 0UL, +#endif K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), @@ -419,13 +426,16 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif #ifdef CONFIG_MEM_ALLOC_PROFILING - { + static DEFINE_SPINLOCK(mem_alloc_profiling_spinlock); + + if (spin_trylock(&mem_alloc_profiling_spinlock)) { struct codetag_bytes tags[10]; size_t i, nr; nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false); if (nr) { - pr_notice("Memory allocations:\n"); + pr_notice("Memory allocations (profiling is currently turned %s):\n", + mem_alloc_profiling_enabled() ? "on" : "off"); for (i = 0; i < nr; i++) { struct codetag *ct = tags[i].ct; struct alloc_tag *tag = ct_to_alloc_tag(ct); @@ -445,6 +455,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) ct->lineno, ct->function); } } + spin_unlock(&mem_alloc_profiling_spinlock); } #endif } diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..078daecc7cf5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -50,13 +50,17 @@ typedef union { /* Reuses the bits in struct page */ struct slab { - unsigned long flags; + memdesc_flags_t flags; struct kmem_cache *slab_cache; union { struct { union { struct list_head slab_list; + struct { /* For deferred deactivate_slab() */ + struct llist_node llnode; + void *flush_freelist; + }; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; @@ -174,12 +178,12 @@ static inline void *slab_address(const struct slab *slab) static inline int slab_nid(const struct slab *slab) { - return folio_nid(slab_folio(slab)); + return memdesc_nid(slab->flags); } static inline pg_data_t *slab_pgdat(const struct slab *slab) { - return folio_pgdat(slab_folio(slab)); + return NODE_DATA(slab_nid(slab)); } static inline struct slab *virt_to_slab(const void *addr) @@ -234,7 +238,9 @@ struct kmem_cache_order_objects { struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; + struct lock_class_key lock_key; #endif + struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. 
*/ slab_flags_t flags; unsigned long min_partial; @@ -248,6 +254,7 @@ struct kmem_cache { /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif + unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ @@ -433,6 +440,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj); +void flush_all_rcu_sheaves(void); + #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ @@ -526,8 +536,12 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) unsigned long obj_exts = READ_ONCE(slab->obj_exts); #ifdef CONFIG_MEMCG - VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), - slab_page(slab)); + /* + * obj_exts should be either NULL, a valid pointer with + * MEMCG_DATA_OBJEXTS bit set or be equal to OBJEXTS_ALLOC_FAIL. + */ + VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS) && + obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab)); VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); #endif return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); @@ -656,6 +670,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); +void defer_free_barrier(void); + static inline bool slub_debug_orig_size(struct kmem_cache *s) { return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && diff --git a/mm/slab_common.c b/mm/slab_common.c index bfe7c40eeee1..932d13ada36c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -163,6 +163,9 @@ int slab_unmergeable(struct kmem_cache *s) return 1; #endif + if (s->cpu_sheaves) + return 1; + /* * We may have set a slab to be unmergeable during bootstrap. */ @@ -321,7 +324,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, object_size - args->usersize < args->useroffset)) args->usersize = args->useroffset = 0; - if (!args->usersize) + if (!args->usersize && !args->sheaf_capacity) s = __kmem_cache_alias(name, object_size, args->align, flags, args->ctor); if (s) @@ -507,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); } + /* Wait for deferred work from kmalloc/kfree_nolock() */ + defer_free_barrier(); + cpus_read_lock(); mutex_lock(&slab_mutex); @@ -1605,6 +1611,30 @@ static void kfree_rcu_work(struct work_struct *work) kvfree_rcu_list(head); } +static bool kfree_rcu_sheaf(void *obj) +{ + struct kmem_cache *s; + struct folio *folio; + struct slab *slab; + + if (is_vmalloc_addr(obj)) + return false; + + folio = virt_to_folio(obj); + if (unlikely(!folio_test_slab(folio))) + return false; + + slab = folio_slab(folio); + s = slab->slab_cache; + if (s->cpu_sheaves) { + if (likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id())) + return __kfree_rcu_sheaf(s, obj); + } + + return false; +} + static bool need_offload_krc(struct kfree_rcu_cpu *krcp) { @@ -1949,6 +1979,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) if (!head) might_sleep(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr)) + return; + // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. 
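The kvfree_call_rcu() change above first offers the object to kfree_rcu_sheaf(): it is diverted into a per-CPU rcu_free sheaf only when it is slab-backed (not vmalloc), its cache has cpu_sheaves, and on NUMA it sits on the local node; anything else falls through to the existing batching path. A self-contained userspace sketch of that gatekeeping plus bounded batching shape, with toy names standing in for the slab lookups and for call_rcu():

#include <stdbool.h>
#include <stdlib.h>

#define SHEAF_CAPACITY 4

/* A toy "rcu_free sheaf": a small bounded batch of pointers. */
struct sheaf {
        void *objects[SHEAF_CAPACITY];
        unsigned int size;
};

static struct sheaf rcu_free_sheaf;

static void flush_sheaf(struct sheaf *s)
{
        /* In the kernel this batch would be handed to call_rcu(); here we free. */
        for (unsigned int i = 0; i < s->size; i++)
                free(s->objects[i]);
        s->size = 0;
}

/* Returns false when the object cannot be batched and the caller
 * must take the generic deferred-free path instead. */
static bool sheaf_defer_free(void *obj, bool slab_backed, bool node_local)
{
        if (!slab_backed || !node_local)
                return false;

        if (rcu_free_sheaf.size == SHEAF_CAPACITY)
                flush_sheaf(&rcu_free_sheaf);

        rcu_free_sheaf.objects[rcu_free_sheaf.size++] = obj;
        return true;
}

int main(void)
{
        for (int i = 0; i < 10; i++) {
                void *p = malloc(32);

                if (!sheaf_defer_free(p, true, i % 2 == 0))
                        free(p);        /* fallback path, like the old batching */
        }
        flush_sheaf(&rcu_free_sheaf);
        return 0;
}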
@@ -2023,6 +2056,8 @@ void kvfree_rcu_barrier(void) bool queued; int i, cpu; + flush_all_rcu_sheaves(); + /* * Firstly we detach objects and queue them over an RCU-batch * for all CPUs. Finally queued works are flushed for each CPU. diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..584a5ff1828b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -44,7 +44,8 @@ #include <kunit/test.h> #include <kunit/test-bug.h> #include <linux/sort.h> - +#include <linux/irq_work.h> +#include <linux/kprobes.h> #include <linux/debugfs.h> #include <trace/events/kmem.h> @@ -363,8 +364,12 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif enum stat_item { + ALLOC_PCS, /* Allocation from percpu sheaf */ ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCS, /* Free to percpu sheaf */ + FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ + FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ @@ -389,6 +394,19 @@ enum stat_item { CPU_PARTIAL_FREE, /* Refill cpu partial on free */ CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + SHEAF_FLUSH, /* Objects flushed from a sheaf */ + SHEAF_REFILL, /* Objects refilled to a sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf */ + SHEAF_FREE, /* Freeing of an empty sheaf */ + BARN_GET, /* Got full sheaf from barn */ + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ + BARN_PUT, /* Put full sheaf to barn */ + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ NR_SLUB_STAT_ITEMS }; @@ -409,7 +427,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ #endif - local_lock_t lock; /* Protects the fields above */ + local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif @@ -435,6 +453,37 @@ void stat_add(const struct kmem_cache *s, enum stat_item si, int v) #endif } +#define MAX_FULL_SHEAVES 10 +#define MAX_EMPTY_SHEAVES 10 + +struct node_barn { + spinlock_t lock; + struct list_head sheaves_full; + struct list_head sheaves_empty; + unsigned int nr_full; + unsigned int nr_empty; +}; + +struct slab_sheaf { + union { + struct rcu_head rcu_head; + struct list_head barn_list; + /* only used for prefilled sheafs */ + unsigned int capacity; + }; + struct kmem_cache *cache; + unsigned int size; + int node; /* only used for rcu_sheaf */ + void *objects[]; +}; + +struct slub_percpu_sheaves { + local_trylock_t lock; + struct slab_sheaf *main; /* never NULL when unlocked */ + struct slab_sheaf *spare; /* empty or full, may be NULL */ + struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */ +}; + /* * The slab lists for all objects. 
*/ @@ -447,6 +496,7 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif + struct node_barn *barn; }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -454,6 +504,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) return s->node[node]; } +/* Get the barn of the current cpu's memory node */ +static inline struct node_barn *get_barn(struct kmem_cache *s) +{ + return get_node(s, numa_mem_id())->barn; +} + /* * Iterator over all nodes. The body will be executed for each node that has * a kmem_cache_node structure allocated (which is true for all online nodes) @@ -470,12 +526,19 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) */ static nodemask_t slab_nodes; -#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). */ static struct workqueue_struct *flushwq; -#endif + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); /******************************************************************** * Core slab cache functions @@ -657,17 +720,17 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) */ static inline bool slab_test_pfmemalloc(const struct slab *slab) { - return test_bit(SL_pfmemalloc, &slab->flags); + return test_bit(SL_pfmemalloc, &slab->flags.f); } static inline void slab_set_pfmemalloc(struct slab *slab) { - set_bit(SL_pfmemalloc, &slab->flags); + set_bit(SL_pfmemalloc, &slab->flags.f); } static inline void __slab_clear_pfmemalloc(struct slab *slab) { - __clear_bit(SL_pfmemalloc, &slab->flags); + __clear_bit(SL_pfmemalloc, &slab->flags.f); } /* @@ -675,12 +738,12 @@ static inline void __slab_clear_pfmemalloc(struct slab *slab) */ static __always_inline void slab_lock(struct slab *slab) { - bit_spin_lock(SL_locked, &slab->flags); + bit_spin_lock(SL_locked, &slab->flags.f); } static __always_inline void slab_unlock(struct slab *slab) { - bit_spin_unlock(SL_locked, &slab->flags); + bit_spin_unlock(SL_locked, &slab->flags.f); } static inline bool @@ -822,6 +885,16 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) } #ifdef CONFIG_SLUB_DEBUG + +/* + * For debugging context when we want to check if the struct slab pointer + * appears to be valid. 
+ */ +static inline bool validate_slab_ptr(struct slab *slab) +{ + return PageSlab(slab_page(slab)); +} + static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -962,19 +1035,19 @@ static struct track *get_track(struct kmem_cache *s, void *object, } #ifdef CONFIG_STACKDEPOT -static noinline depot_stack_handle_t set_track_prepare(void) +static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { depot_stack_handle_t handle; unsigned long entries[TRACK_ADDRS_COUNT]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); - handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); + handle = stack_depot_save(entries, nr_entries, gfp_flags); return handle; } #else -static inline depot_stack_handle_t set_track_prepare(void) +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } @@ -996,9 +1069,9 @@ static void set_track_update(struct kmem_cache *s, void *object, } static __always_inline void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) { - depot_stack_handle_t handle = set_track_prepare(); + depot_stack_handle_t handle = set_track_prepare(gfp_flags); set_track_update(s, object, alloc, addr, handle); } @@ -1046,7 +1119,7 @@ static void print_slab_info(const struct slab *slab) { pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - &slab->flags); + &slab->flags.f); } void skip_orig_size_check(struct kmem_cache *s, const void *object) @@ -1140,7 +1213,12 @@ static void object_err(struct kmem_cache *s, struct slab *slab, return; slab_bug(s, reason); - print_trailer(s, slab, object); + if (!object || !check_valid_pointer(s, slab, object)) { + print_slab_info(slab); + pr_err("Invalid pointer 0x%p\n", object); + } else { + print_trailer(s, slab, object); + } add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); WARN_ON(1); @@ -1444,15 +1522,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab, return ret; } +/* + * Checks if the slab state looks sane. Assumes the struct slab pointer + * was either obtained in a way that ensures it's valid, or validated + * by validate_slab_ptr() + */ static int check_slab(struct kmem_cache *s, struct slab *slab) { int maxobj; - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Not a valid slab page"); - return 0; - } - maxobj = order_objects(slab_order(slab), s->size); if (slab->objects > maxobj) { slab_err(s, slab, "objects %u > max %u", @@ -1648,17 +1726,15 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s, return true; bad: - if (folio_test_slab(slab_folio(slab))) { - /* - * If this is a slab page then lets do the best we can - * to avoid issues in the future. Marking all objects - * as used avoids touching the remaining objects. - */ - slab_fix(s, "Marking all objects used"); - slab->inuse = slab->objects; - slab->freelist = NULL; - slab->frozen = 1; /* mark consistency-failed slab as frozen */ - } + /* + * Let's do the best we can to avoid issues in the future. Marking all + * objects as used avoids touching the remaining objects. 
+ */ + slab_fix(s, "Marking all objects used"); + slab->inuse = slab->objects; + slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ + return false; } @@ -1679,10 +1755,7 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 0; if (unlikely(s != slab->slab_cache)) { - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", - object); - } else if (!slab->slab_cache) { + if (!slab->slab_cache) { slab_err(NULL, slab, "No slab cache for object 0x%p", object); } else { @@ -1921,9 +1994,9 @@ static inline bool free_debug_processing(struct kmem_cache *s, static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } -static inline depot_stack_handle_t set_track_prepare(void) { return 0; } +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } static inline void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) {} + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1984,7 +2057,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, * objects with no tag reference. Mark all references in this * vector as empty to avoid warnings later on. */ - if (obj_exts & OBJEXTS_ALLOC_FAIL) { + if (obj_exts == OBJEXTS_ALLOC_FAIL) { unsigned int i; for (i = 0; i < objects; i++) @@ -2017,6 +2090,7 @@ static inline void init_slab_obj_exts(struct slab *slab) int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { + bool allow_spin = gfpflags_allow_spinning(gfp); unsigned int objects = objs_per_slab(s, slab); unsigned long new_exts; unsigned long old_exts; @@ -2025,17 +2099,32 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); + + /* + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. + */ + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); + + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } if (!vec) { /* Mark vectors which failed to allocate */ - if (new_slab) - mark_failed_objexts_alloc(slab); + mark_failed_objexts_alloc(slab); return -ENOMEM; } new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; #ifdef CONFIG_MEMCG new_exts |= MEMCG_DATA_OBJEXTS; #endif @@ -2056,7 +2145,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, * objcg vector should be reused. */ mark_objexts_empty(vec); - kfree(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); return 0; } @@ -2080,7 +2172,10 @@ static inline void free_slab_obj_exts(struct slab *slab) * the extension for obj_exts is expected to be NULL. 
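The obj_exts changes above keep using the existing trick of packing state bits (MEMCG_DATA_OBJEXTS, and now OBJEXTS_NOSPIN_ALLOC) into the low bits of the aligned vector pointer, so that the free path that follows can choose kfree() vs kfree_nolock() without any extra storage. A minimal sketch of that low-bit tagging pattern; the flag values and names here are illustrative, not the kernel's:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TAG_NOSPIN_ALLOC  0x1UL     /* stand-in for OBJEXTS_NOSPIN_ALLOC */
#define TAG_MASK          0x3UL     /* low bits reserved for tags */

static uintptr_t tag_ptr(void *p, uintptr_t tags)
{
        /* malloc() alignment exceeds TAG_MASK, so the low bits are free. */
        assert(((uintptr_t)p & TAG_MASK) == 0);
        return (uintptr_t)p | tags;
}

static void *untag_ptr(uintptr_t tagged)
{
        return (void *)(tagged & ~TAG_MASK);
}

static void release(uintptr_t tagged)
{
        void *vec = untag_ptr(tagged);

        if (tagged & TAG_NOSPIN_ALLOC)
                printf("would use kfree_nolock(%p)\n", vec);
        else
                printf("would use kfree(%p)\n", vec);
        free(vec);
}

int main(void)
{
        uintptr_t a = tag_ptr(malloc(64), 0);
        uintptr_t b = tag_ptr(malloc(64), TAG_NOSPIN_ALLOC);

        release(a);
        release(b);
        return 0;
}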
*/ mark_objexts_empty(obj_exts); - kfree(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); slab->obj_exts = 0; } @@ -2109,15 +2204,6 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) { struct slab *slab; - if (!p) - return NULL; - - if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) - return NULL; - - if (flags & __GFP_NO_OBJ_EXT) - return NULL; - slab = virt_to_slab(p); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { @@ -2135,6 +2221,15 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { struct slabobj_ext *obj_exts; + if (!object) + return; + + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; + + if (flags & __GFP_NO_OBJ_EXT) + return; + obj_exts = prepare_slab_obj_exts_hook(s, flags, object); /* * Currently obj_exts is used only for allocation profiling. @@ -2143,6 +2238,8 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) */ if (likely(obj_exts)) alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); + else + alloc_tag_set_inaccurate(current->alloc_tag); } static inline void @@ -2414,7 +2511,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init, still_accessible); + return !kasan_slab_free(s, x, init, still_accessible, false); } static __fastpath_inline @@ -2473,17 +2570,463 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } +static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, + s->sheaf_capacity), gfp); + + if (unlikely(!sheaf)) + return NULL; + + sheaf->cache = s; + + stat(s, SHEAF_ALLOC); + + return sheaf; +} + +static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + kfree(sheaf); + + stat(s, SHEAF_FREE); +} + +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + + +static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, + gfp_t gfp) +{ + int to_fill = s->sheaf_capacity - sheaf->size; + int filled; + + if (!to_fill) + return 0; + + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, + &sheaf->objects[sheaf->size]); + + sheaf->size += filled; + + stat_add(s, SHEAF_REFILL, filled); + + if (filled < to_fill) + return -ENOMEM; + + return 0; +} + + +static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); + + if (!sheaf) + return NULL; + + if (refill_sheaf(s, sheaf, gfp)) { + free_empty_sheaf(s, sheaf); + return NULL; + } + + return sheaf; +} + +/* + * Maximum number of objects freed during a single flush of main pcs sheaf. + * Translates directly to an on-stack array size. + */ +#define PCS_BATCH_MAX 32U + +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); + +/* + * Free all objects from the main sheaf. In order to perform + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where + * object pointers are moved to a on-stack array under the lock. To bound the + * stack usage, limit each batch to PCS_BATCH_MAX. 
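sheaf_flush_main() just below implements the comment above: while holding the per-CPU lock it only moves up to PCS_BATCH_MAX pointers into an on-stack array, then drops the lock, frees the batch, and loops until the sheaf is drained. A compact userspace analogue of that bounded drain pattern, with a pthread mutex standing in for the local_trylock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BATCH_MAX 32u
#define SHEAF_CAP 100u

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *sheaf[SHEAF_CAP];
static unsigned int sheaf_size;

static void flush_main(void)
{
        void *batch[BATCH_MAX];
        unsigned int n, remaining;

        do {
                pthread_mutex_lock(&lock);
                n = sheaf_size < BATCH_MAX ? sheaf_size : BATCH_MAX;
                sheaf_size -= n;
                memcpy(batch, &sheaf[sheaf_size], n * sizeof(void *));
                remaining = sheaf_size;
                pthread_mutex_unlock(&lock);

                /* Free outside the lock so the critical section stays short. */
                for (unsigned int i = 0; i < n; i++)
                        free(batch[i]);
        } while (remaining);
}

int main(void)
{
        for (unsigned int i = 0; i < SHEAF_CAP; i++)
                sheaf[sheaf_size++] = malloc(16);
        flush_main();
        printf("left: %u\n", sheaf_size);
        return 0;
}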
+ * + * returns true if at least partially flushed + */ +static bool sheaf_flush_main(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + unsigned int batch, remaining; + void *objects[PCS_BATCH_MAX]; + struct slab_sheaf *sheaf; + bool ret = false; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return ret; + + pcs = this_cpu_ptr(s->cpu_sheaves); + sheaf = pcs->main; + + batch = min(PCS_BATCH_MAX, sheaf->size); + + sheaf->size -= batch; + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); + + remaining = sheaf->size; + + local_unlock(&s->cpu_sheaves->lock); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, SHEAF_FLUSH, batch); + + ret = true; + + if (remaining) + goto next_batch; + + return ret; +} + +/* + * Free all objects from a sheaf that's unused, i.e. not linked to any + * cpu_sheaves, so we need no locking and batching. The locking is also not + * necessary when flushing cpu's sheaves (both spare and main) during cpu + * hotremove as the cpu is not executing anymore. + */ +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + if (!sheaf->size) + return; + + stat_add(s, SHEAF_FLUSH, sheaf->size); + + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + sheaf->size = 0; +} + +static void __rcu_free_sheaf_prepare(struct kmem_cache *s, + struct slab_sheaf *sheaf) +{ + bool init = slab_want_init_on_free(s); + void **p = &sheaf->objects[0]; + unsigned int i = 0; + + while (i < sheaf->size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, true))) { + p[i] = p[--sheaf->size]; + continue; + } + + i++; + } +} + +static void rcu_free_sheaf_nobarn(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + s = sheaf->cache; + + __rcu_free_sheaf_prepare(s, sheaf); + + sheaf_flush_unused(s, sheaf); + + free_empty_sheaf(s, sheaf); +} + +/* + * Caller needs to make sure migration is disabled in order to fully flush + * single cpu's sheaves + * + * must not be called from an irq + * + * flushing operations are rare so let's keep it simple and flush to slabs + * directly, skipping the barn + */ +static void pcs_flush_all(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *spare, *rcu_free; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + spare = pcs->spare; + pcs->spare = NULL; + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (spare) { + sheaf_flush_unused(s, spare); + free_empty_sheaf(s, spare); + } + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); + + sheaf_flush_main(s); +} + +static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) +{ + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* The cpu is not executing anymore so we don't need pcs->lock */ + sheaf_flush_unused(s, pcs->main); + if (pcs->spare) { + sheaf_flush_unused(s, pcs->spare); + free_empty_sheaf(s, pcs->spare); + pcs->spare = NULL; + } + + if (pcs->rcu_free) { + call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn); + pcs->rcu_free = NULL; + } +} + +static void pcs_destroy(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* can 
happen when unwinding failed create */ + if (!pcs->main) + continue; + + /* + * We have already passed __kmem_cache_shutdown() so everything + * was flushed and there should be no objects allocated from + * slabs, otherwise kmem_cache_destroy() would have aborted. + * Therefore something would have to be really wrong if the + * warnings here trigger, and we should rather leave objects and + * sheaves to leak in that case. + */ + + WARN_ON(pcs->spare); + WARN_ON(pcs->rcu_free); + + if (!WARN_ON(pcs->main->size)) { + free_empty_sheaf(s, pcs->main); + pcs->main = NULL; + } + } + + free_percpu(s->cpu_sheaves); + s->cpu_sheaves = NULL; +} + +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *empty = NULL; + unsigned long flags; + + if (!data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&empty->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +/* + * The following two functions are used mainly in cases where we have to undo an + * intended action due to a race or cpu migration. Thus they do not check the + * empty or full sheaf limits for simplicity. + */ + +static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_empty); + barn->nr_empty++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_full); + barn->nr_full++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *sheaf = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full) && !data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&sheaf->barn_list); + barn->nr_full--; + } else if (barn->nr_empty) { + sheaf = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&sheaf->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return sheaf; +} + +/* + * If a full sheaf is available, return it and put the supplied empty one to + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't + * change. + */ +static struct slab_sheaf * +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +{ + struct slab_sheaf *full = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_full)) { + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&full->barn_list); + list_add(&empty->barn_list, &barn->sheaves_empty); + barn->nr_full--; + barn->nr_empty++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return full; +} + +/* + * If an empty sheaf is available, return it and put the supplied full one to + * barn. But if there are too many full sheaves, reject this with -E2BIG. 
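The function that follows, barn_replace_full_sheaf(), hands the node's barn a full sheaf and expects an empty one back, refusing with -E2BIG when too many full sheaves are already parked and with -ENOMEM when no empty sheaf is available. A small single-threaded model of that exchange; plain counters stand in for the kernel's sheaf lists and spinlock:

#include <errno.h>
#include <stdio.h>

#define MAX_FULL_SHEAVES 10

struct barn {
        unsigned int nr_full;
        unsigned int nr_empty;
};

/* Returns 0 when the caller's full sheaf was swapped for an empty one. */
static int barn_replace_full(struct barn *b)
{
        if (b->nr_full >= MAX_FULL_SHEAVES)
                return -E2BIG;          /* refuse to hoard more full sheaves */
        if (!b->nr_empty)
                return -ENOMEM;         /* nothing to hand back */

        b->nr_empty--;                  /* empty sheaf goes to the caller */
        b->nr_full++;                   /* caller's full sheaf is parked */
        return 0;
}

int main(void)
{
        struct barn b = { .nr_full = 0, .nr_empty = 2 };

        for (int i = 0; i < 4; i++)
                printf("attempt %d -> %d\n", i, barn_replace_full(&b));
        return 0;
}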
+ */ +static struct slab_sheaf * +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +{ + struct slab_sheaf *empty; + unsigned long flags; + + /* we don't repeat this check under barn->lock as it's not critical */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES) + return ERR_PTR(-E2BIG); + if (!data_race(barn->nr_empty)) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, + barn_list); + list_del(&empty->barn_list); + list_add(&full->barn_list, &barn->sheaves_full); + barn->nr_empty--; + barn->nr_full++; + } else { + empty = ERR_PTR(-ENOMEM); + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +static void barn_init(struct node_barn *barn) +{ + spin_lock_init(&barn->lock); + INIT_LIST_HEAD(&barn->sheaves_full); + INIT_LIST_HEAD(&barn->sheaves_empty); + barn->nr_full = 0; + barn->nr_empty = 0; +} + +static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) +{ + struct list_head empty_list; + struct list_head full_list; + struct slab_sheaf *sheaf, *sheaf2; + unsigned long flags; + + INIT_LIST_HEAD(&empty_list); + INIT_LIST_HEAD(&full_list); + + spin_lock_irqsave(&barn->lock, flags); + + list_splice_init(&barn->sheaves_full, &full_list); + barn->nr_full = 0; + list_splice_init(&barn->sheaves_empty, &empty_list); + barn->nr_empty = 0; + + spin_unlock_irqrestore(&barn->lock, flags); + + list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + } + + list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list) + free_empty_sheaf(s, sheaf); +} + /* * Slab allocation and freeing */ static inline struct slab *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) + struct kmem_cache_order_objects oo, + bool allow_spin) { struct folio *folio; struct slab *slab; unsigned int order = oo_order(oo); - if (node == NUMA_NO_NODE) + if (unlikely(!allow_spin)) + folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + node, order); + else if (node == NUMA_NO_NODE) folio = (struct folio *)alloc_frozen_pages(flags, order); else folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); @@ -2633,6 +3176,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order, static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { + bool allow_spin = gfpflags_allow_spinning(flags); struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; @@ -2652,7 +3196,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; - slab = alloc_slab_page(alloc_gfp, node, oo); + /* + * __GFP_RECLAIM could be cleared on the first allocation attempt, + * so pass allow_spin flag directly. + */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -2660,7 +3208,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. 
* Try a lower order alloc if possible */ - slab = alloc_slab_page(alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) return NULL; stat(s, ORDER_FALLBACK); @@ -2755,17 +3303,17 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) static inline bool slab_test_node_partial(const struct slab *slab) { - return test_bit(SL_partial, &slab->flags); + return test_bit(SL_partial, &slab->flags.f); } static inline void slab_set_node_partial(struct slab *slab) { - set_bit(SL_partial, &slab->flags); + set_bit(SL_partial, &slab->flags.f); } static inline void slab_clear_node_partial(struct slab *slab) { - clear_bit(SL_partial, &slab->flags); + clear_bit(SL_partial, &slab->flags.f); } /* @@ -2811,13 +3359,21 @@ static void *alloc_single_from_partial(struct kmem_cache *s, lockdep_assert_held(&n->list_lock); +#ifdef CONFIG_SLUB_DEBUG + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return NULL; + } + } +#endif + object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse++; if (!alloc_debug_processing(s, slab, object, orig_size)) { - if (folio_test_slab(slab_folio(slab))) - remove_partial(n, slab); + remove_partial(n, slab); return NULL; } @@ -2829,33 +3385,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); + /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist * and put the slab to the partial (or full) list. */ -static void *alloc_single_from_new_slab(struct kmem_cache *s, - struct slab *slab, int orig_size) +static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, + int orig_size, gfp_t gfpflags) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); int nid = slab_nid(slab); struct kmem_cache_node *n = get_node(s, nid); unsigned long flags; void *object; + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + slab->frozen = 1; + defer_deactivate_slab(slab, NULL); + return NULL; + } object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse = 1; - if (!alloc_debug_processing(s, slab, object, orig_size)) + if (!alloc_debug_processing(s, slab, object, orig_size)) { /* * It's not really expected that this would fail on a * freshly allocated slab, but a concurrent memory * corruption in theory could cause that. + * Leak memory of allocated slab. 
*/ + if (!allow_spin) + spin_unlock_irqrestore(&n->list_lock, flags); return NULL; + } - spin_lock_irqsave(&n->list_lock, flags); + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); if (slab->inuse == slab->objects) add_full(s, n, slab); @@ -2896,7 +3466,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!n || !n->nr_partial) return NULL; - spin_lock_irqsave(&n->list_lock, flags); + if (gfpflags_allow_spinning(pc->flags)) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { if (!pfmemalloc_match(slab, pc->flags)) continue; @@ -3064,30 +3637,46 @@ static inline void note_cmpxchg_failure(const char *n, pr_info("%s %s: cmpxchg redo ", n, s->name); -#ifdef CONFIG_PREEMPTION - if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + if (IS_ENABLED(CONFIG_PREEMPTION) && + tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); - else -#endif - if (tid_to_event(tid) != tid_to_event(actual_tid)) + } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { pr_warn("due to cpu running other code. Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); - else + } else { pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); + } #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } static void init_kmem_cache_cpus(struct kmem_cache *s) { +#ifdef CONFIG_PREEMPT_RT + /* + * Register lockdep key for non-boot kmem caches to avoid + * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() + */ + bool finegrain_lockdep = !init_section_contains(s, 1); +#else + /* + * Don't bother with different lockdep classes for each + * kmem_cache, since we only use local_trylock_irqsave(). + */ + bool finegrain_lockdep = false; +#endif int cpu; struct kmem_cache_cpu *c; + if (finegrain_lockdep) + lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); - local_lock_init(&c->lock); + local_trylock_init(&c->lock); + if (finegrain_lockdep) + lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); } } @@ -3178,6 +3767,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } } +/* + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock + * can be acquired without a deadlock before invoking the function. + * + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), + * and kmalloc() is not used in an unsupported context. + * + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but + * lockdep_assert() will catch a bug in case: + * #1 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() + * or + * #2 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() + * + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt + * disabled context. The lock will always be acquired and if needed it + * block and sleep until the lock is available. + * #1 is possible in !PREEMPT_RT only. 
+ * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) + * + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B + */ +#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) +#define local_lock_cpu_slab(s, flags) \ + local_lock_irqsave(&(s)->cpu_slab->lock, flags) +#else +#define local_lock_cpu_slab(s, flags) \ + do { \ + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ + lockdep_assert(__l); \ + } while (0) +#endif + +#define local_unlock_cpu_slab(s, flags) \ + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) + #ifdef CONFIG_SLUB_CPU_PARTIAL static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { @@ -3262,7 +3892,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) unsigned long flags; int slabs = 0; - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); oldslab = this_cpu_read(s->cpu_slab->partial); @@ -3287,7 +3917,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) this_cpu_write(s->cpu_slab->partial, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); if (slab_to_put) { __put_partials(s, slab_to_put); @@ -3344,11 +3974,40 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) put_partials_cpu(s, c); } -struct slub_flush_work { - struct work_struct work; - struct kmem_cache *s; - bool skip; -}; +static inline void flush_this_cpu_slab(struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + + if (c->slab) + flush_slab(s, c); + + put_partials(s); +} + +static bool has_cpu_slab(int cpu, struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->slab || slub_percpu_partial(c); +} + +#else /* CONFIG_SLUB_TINY */ +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } +static inline void flush_this_cpu_slab(struct kmem_cache *s) { } +#endif /* CONFIG_SLUB_TINY */ + +static bool has_pcs_used(int cpu, struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + + if (!s->cpu_sheaves) + return false; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + return (pcs->spare || pcs->rcu_free || pcs->main->size); +} /* * Flush cpu slab. 
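The local_lock_cpu_slab() wrapper defined above encodes a rule rather than a new lock type: on !PREEMPT_RT with lockdep, the cpu slab lock is taken with a trylock and the macro asserts that it succeeded, so unsupported reentrancy (for example kmalloc_nolock() from a context that already holds the lock) trips an assertion instead of deadlocking. A userspace analogue of that "must never be contended here" idiom using pthread_mutex_trylock, purely illustrative:

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpu_slab_lock = PTHREAD_MUTEX_INITIALIZER;

/* Callers are expected to know, by construction or by an explicit check,
 * that the lock cannot already be held in this context; the assert both
 * documents and enforces that, much like lockdep_assert() above. */
static void lock_cpu_slab(void)
{
        int ret = pthread_mutex_trylock(&cpu_slab_lock);

        assert(ret == 0);
}

static void unlock_cpu_slab(void)
{
        pthread_mutex_unlock(&cpu_slab_lock);
}

int main(void)
{
        lock_cpu_slab();
        /* ... fast path work ... */
        unlock_cpu_slab();
        printf("ok\n");
        return 0;
}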
@@ -3358,30 +4017,18 @@ struct slub_flush_work { static void flush_cpu_slab(struct work_struct *w) { struct kmem_cache *s; - struct kmem_cache_cpu *c; struct slub_flush_work *sfw; sfw = container_of(w, struct slub_flush_work, work); s = sfw->s; - c = this_cpu_ptr(s->cpu_slab); - - if (c->slab) - flush_slab(s, c); - - put_partials(s); -} -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + if (s->cpu_sheaves) + pcs_flush_all(s); - return c->slab || slub_percpu_partial(c); + flush_this_cpu_slab(s); } -static DEFINE_MUTEX(flush_lock); -static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); - static void flush_all_cpus_locked(struct kmem_cache *s) { struct slub_flush_work *sfw; @@ -3392,7 +4039,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s)) { + if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { sfw->skip = true; continue; } @@ -3419,6 +4066,74 @@ static void flush_all(struct kmem_cache *s) cpus_read_unlock(); } +static void flush_rcu_sheaf(struct work_struct *w) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_free; + struct slub_flush_work *sfw; + struct kmem_cache *s; + + sfw = container_of(w, struct slub_flush_work, work); + s = sfw->s; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); +} + + +/* needed for kvfree_rcu_barrier() */ +void flush_all_rcu_sheaves(void) +{ + struct slub_flush_work *sfw; + struct kmem_cache *s; + unsigned int cpu; + + cpus_read_lock(); + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!s->cpu_sheaves) + continue; + + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + + /* + * we don't check if rcu_free sheaf exists - racing + * __kfree_rcu_sheaf() might have just removed it. + * by executing flush_rcu_sheaf() on the cpu we make + * sure the __kfree_rcu_sheaf() finished its call_rcu() + */ + + INIT_WORK(&sfw->work, flush_rcu_sheaf); + sfw->s = s; + queue_work_on(cpu, flushwq, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); + } + + mutex_unlock(&slab_mutex); + cpus_read_unlock(); + + rcu_barrier(); +} + /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. @@ -3428,19 +4143,15 @@ static int slub_cpu_dead(unsigned int cpu) struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) + list_for_each_entry(s, &slab_caches, list) { __flush_cpu_slab(s, cpu); + if (s->cpu_sheaves) + __pcs_flush_all_cpu(s, cpu); + } mutex_unlock(&slab_mutex); return 0; } -#else /* CONFIG_SLUB_TINY */ -static inline void flush_all_cpus_locked(struct kmem_cache *s) { } -static inline void flush_all(struct kmem_cache *s) { } -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } -static inline int slub_cpu_dead(unsigned int cpu) { return 0; } -#endif /* CONFIG_SLUB_TINY */ - /* * Check if the objects in a per cpu structure fit numa * locality expectations. 
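flush_all_cpus_locked() above now also consults has_pcs_used(), so a CPU only gets a flush work item queued when it actually holds a cpu slab, percpu partial slabs, or sheaf contents, and the caller then waits only for the work it scheduled; flush_all_rcu_sheaves() follows the same queue-then-wait shape. A minimal model of that "queue selectively, then join what was queued" loop, with threads standing in for per-CPU work items:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct flush_work {
        pthread_t thread;
        bool skip;
        int cpu;
};

static bool cpu_has_work[NR_CPUS] = { true, false, true, false };

static void *flush_cpu(void *arg)
{
        struct flush_work *w = arg;

        printf("flushing cpu %d\n", w->cpu);    /* would flush slab + sheaves */
        return NULL;
}

int main(void)
{
        struct flush_work works[NR_CPUS];

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                works[cpu].cpu = cpu;
                works[cpu].skip = !cpu_has_work[cpu];
                if (works[cpu].skip)
                        continue;
                pthread_create(&works[cpu].thread, NULL, flush_cpu, &works[cpu]);
        }

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (works[cpu].skip)
                        continue;
                pthread_join(works[cpu].thread, NULL);
        }
        return 0;
}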
@@ -3721,6 +4432,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); void *freelist; struct slab *slab; unsigned long flags; @@ -3746,9 +4458,21 @@ reread_slab: if (unlikely(!node_match(slab, node))) { /* * same as above but node_match() being false already - * implies node != NUMA_NO_NODE + * implies node != NUMA_NO_NODE. + * + * We don't strictly honor pfmemalloc and NUMA preferences + * when !allow_spin because: + * + * 1. Most kmalloc() users allocate objects on the local node, + * so kmalloc_nolock() tries not to interfere with them by + * deactivating the cpu slab. + * + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause + * unnecessary slab allocations even when n->partial list + * is not empty. */ - if (!node_isset(node, slab_nodes)) { + if (!node_isset(node, slab_nodes) || + !allow_spin) { node = NUMA_NO_NODE; } else { stat(s, ALLOC_NODE_MISMATCH); @@ -3761,13 +4485,14 @@ reread_slab: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) goto deactivate_slab; /* must check again c->slab in case we got preempted and it changed */ - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); + if (unlikely(slab != c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; @@ -3779,7 +4504,7 @@ reread_slab: if (!freelist) { c->slab = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -3798,34 +4523,34 @@ load_freelist: VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); return freelist; deactivate_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (slab != c->slab) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); deactivate_slab(s, slab, freelist); new_slab: #ifdef CONFIG_SLUB_CPU_PARTIAL while (slub_percpu_partial(c)) { - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); /* we were preempted and partial list got empty */ goto new_objects; } @@ -3834,7 +4559,8 @@ new_slab: slub_set_percpu_partial(c, slab); if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags))) { + pfmemalloc_match(slab, gfpflags)) || + !allow_spin) { c->slab = slab; freelist = get_freelist(s, slab); VM_BUG_ON(!freelist); @@ -3842,7 +4568,7 @@ new_slab: goto load_freelist; } - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); slab->next 
= NULL; __put_partials(s, slab); @@ -3864,8 +4590,13 @@ new_objects: * allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) - && try_thisnode)) - pc.flags = GFP_NOWAIT | __GFP_THISNODE; + && try_thisnode)) { + if (unlikely(!allow_spin)) + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ + pc.flags = gfpflags | __GFP_THISNODE; + else + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + } pc.orig_size = orig_size; slab = get_partial(s, node, &pc); @@ -3876,9 +4607,14 @@ new_objects: * For debug caches here we had to go through * alloc_single_from_partial() so just store the * tracking info and return the object. + * + * Due to disabled preemption we need to disallow + * blocking. The flags are further adjusted by + * gfp_nested_mask() in stack_depot itself. */ if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); return freelist; } @@ -3904,13 +4640,14 @@ new_objects: stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size); + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); if (unlikely(!freelist)) goto new_objects; if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); return freelist; } @@ -3926,7 +4663,7 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. @@ -3937,7 +4674,7 @@ new_objects: retry_load_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; @@ -3946,9 +4683,14 @@ retry_load_slab: c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); - deactivate_slab(s, flush_slab, flush_freelist); + if (unlikely(!allow_spin)) { + /* Reentrant slub cannot take locks, defer */ + defer_deactivate_slab(flush_slab, flush_freelist); + } else { + deactivate_slab(s, flush_slab, flush_freelist); + } stat(s, CPUSLAB_FLUSH); @@ -3958,6 +4700,19 @@ retry_load_slab: goto load_freelist; } +/* + * We disallow kprobes in ___slab_alloc() to prevent reentrance + * + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() + * manipulating c->freelist without lock. + * + * This does not prevent kprobe in functions called from ___slab_alloc() such as + * local_lock_irqsave() itself, and that is fine, we only need to protect the + * c->freelist manipulation in ___slab_alloc() itself. + */ +NOKPROBE_SYMBOL(___slab_alloc); /* * A wrapper for ___slab_alloc() for contexts where preemption is not yet @@ -3977,8 +4732,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ c = slub_get_cpu_ptr(s->cpu_slab); #endif - + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { + if (local_lock_is_locked(&s->cpu_slab->lock)) { + /* + * EBUSY is an internal signal to kmalloc_nolock() to + * retry a different bucket. It's not propagated + * to the caller. 
+ */ + p = ERR_PTR(-EBUSY); + goto out; + } + } p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +out: #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -4102,7 +4868,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, return NULL; } - object = alloc_single_from_new_slab(s, slab, orig_size); + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); return object; } @@ -4181,8 +4947,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, init_flags); + if (gfpflags_allow_spinning(flags)) + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); alloc_tagging_slab_alloc_hook(s, p[i], flags); } @@ -4191,6 +4958,251 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, } /* + * Replace the empty main sheaf with a (at least partially) full sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp) +{ + struct slab_sheaf *empty = NULL; + struct slab_sheaf *full; + struct node_barn *barn; + bool can_alloc; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + return pcs; + } + + barn = get_barn(s); + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + return pcs; + } + + stat(s, BARN_GET_FAIL); + + can_alloc = gfpflags_allow_blocking(gfp); + + if (can_alloc) { + if (pcs->spare) { + empty = pcs->spare; + pcs->spare = NULL; + } else { + empty = barn_get_empty_sheaf(barn); + } + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!can_alloc) + return NULL; + + if (empty) { + if (!refill_sheaf(s, empty, gfp)) { + full = empty; + } else { + /* + * we must be very low on memory so don't bother + * with the barn + */ + free_empty_sheaf(s, empty); + } + } else { + full = alloc_full_sheaf(s, gfp); + } + + if (!full) + return NULL; + + /* + * we can reach here only when gfpflags_allow_blocking + * so this must not be an irq + */ + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * If we are returning empty sheaf, we either got it from the + * barn or had to allocate one. If we are returning a full + * sheaf, it's due to racing or being migrated to a different + * cpu. Breaching the barn's sheaf limits should be thus rare + * enough so just ignore them to simplify the recovery. 
+ */ + + if (pcs->main->size == 0) { + barn_put_empty_sheaf(barn, pcs->main); + pcs->main = full; + return pcs; + } + + if (!pcs->spare) { + pcs->spare = full; + return pcs; + } + + if (pcs->spare->size == 0) { + barn_put_empty_sheaf(barn, pcs->spare); + pcs->spare = full; + return pcs; + } + + barn_put_full_sheaf(barn, full); + stat(s, BARN_PUT); + + return pcs; +} + +static __fastpath_inline +void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) +{ + struct slub_percpu_sheaves *pcs; + bool node_requested; + void *object; + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If the local node + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. + */ + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif + + node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE; + + /* + * We assume the percpu sheaves contain only local objects although it's + * not completely guaranteed, so we verify later. + */ + if (unlikely(node_requested && node != numa_mem_id())) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + pcs = __pcs_replace_empty_main(s, pcs, gfp); + if (unlikely(!pcs)) + return NULL; + } + + object = pcs->main->objects[pcs->main->size - 1]; + + if (unlikely(node_requested)) { + /* + * Verify that the object was from the node we want. This could + * be false because of cpu migration during an unlocked part of + * the current allocation or previous freeing process. + */ + if (folio_nid(virt_to_folio(object)) != node) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + } + + pcs->main->size--; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, ALLOC_PCS); + + return object; +} + +static __fastpath_inline +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main; + unsigned int allocated = 0; + unsigned int batch; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return allocated; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + + struct slab_sheaf *full; + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + goto do_alloc; + } + + full = barn_replace_empty_sheaf(get_barn(s), pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + goto do_alloc; + } + + stat(s, BARN_GET_FAIL); + + local_unlock(&s->cpu_sheaves->lock); + + /* + * Once full sheaves in barn are depleted, let the bulk + * allocation continue from slab pages, otherwise we would just + * be copying arrays of pointers twice. + */ + return allocated; + } + +do_alloc: + + main = pcs->main; + batch = min(size, main->size); + + main->size -= batch; + memcpy(p, main->objects + main->size, batch * sizeof(void *)); + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, ALLOC_PCS, batch); + + allocated += batch; + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return allocated; +} + + +/* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call * overhead for requests that can be satisfied on the fastpath. 
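Stripped of refilling and the NUMA re-check, the sheaf fast path above is a per-CPU stack pop under a trylocked local lock, where a NULL return simply means "fall back to the regular slab path". A reduced sketch written as if inside mm/slub.c; sheaf_pop_fast() is not a function in the patch, and the real alloc_from_pcs() additionally replaces an empty main sheaf via __pcs_replace_empty_main() and re-verifies the object's node when a specific node was requested, while alloc_from_pcs_bulk() does the same in memcpy'd batches.

/* Illustration only: the essence of the percpu-sheaf allocation fast path. */
static void *sheaf_pop_fast(struct kmem_cache *s)
{
	struct slub_percpu_sheaves *pcs;
	void *object = NULL;

	/* never spin: if the lock is busy, the caller uses the slowpath */
	if (!local_trylock(&s->cpu_sheaves->lock))
		return NULL;

	pcs = this_cpu_ptr(s->cpu_sheaves);

	if (likely(pcs->main->size))
		object = pcs->main->objects[--pcs->main->size];

	local_unlock(&s->cpu_sheaves->lock);

	/* NULL here means the main sheaf was empty: fall back to __slab_alloc_node() */
	return object;
}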
@@ -4214,7 +5226,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + if (s->cpu_sheaves) + object = alloc_from_pcs(s, gfpflags, node); + + if (!object) + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); @@ -4287,6 +5303,228 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); /* + * returns a sheaf that has at least the requested size + * when prefilling is needed, do so with given gfp flags + * + * return NULL if sheaf allocation or prefilling failed + */ +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *sheaf = NULL; + + if (unlikely(size > s->sheaf_capacity)) { + + /* + * slab_debug disables cpu sheaves intentionally so all + * prefilled sheaves become "oversize" and we give up on + * performance for the debugging. Same with SLUB_TINY. + * Creating a cache without sheaves and then requesting a + * prefilled sheaf is however not expected, so warn. + */ + WARN_ON_ONCE(s->sheaf_capacity == 0 && + !IS_ENABLED(CONFIG_SLUB_TINY) && + !(s->flags & SLAB_DEBUG_FLAGS)); + + sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); + if (!sheaf) + return NULL; + + stat(s, SHEAF_PREFILL_OVERSIZE); + sheaf->cache = s; + sheaf->capacity = size; + + if (!__kmem_cache_alloc_bulk(s, gfp, size, + &sheaf->objects[0])) { + kfree(sheaf); + return NULL; + } + + sheaf->size = size; + + return sheaf; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (pcs->spare) { + sheaf = pcs->spare; + pcs->spare = NULL; + stat(s, SHEAF_PREFILL_FAST); + } else { + stat(s, SHEAF_PREFILL_SLOW); + sheaf = barn_get_full_or_empty_sheaf(get_barn(s)); + if (sheaf && sheaf->size) + stat(s, BARN_GET); + else + stat(s, BARN_GET_FAIL); + } + + local_unlock(&s->cpu_sheaves->lock); + + + if (!sheaf) + sheaf = alloc_empty_sheaf(s, gfp); + + if (sheaf && sheaf->size < size) { + if (refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + sheaf = NULL; + } + } + + if (sheaf) + sheaf->capacity = s->sheaf_capacity; + + return sheaf; +} + +/* + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf() + * + * If the sheaf cannot simply become the percpu spare sheaf, but there's space + * for a full sheaf in the barn, we try to refill the sheaf back to the cache's + * sheaf_capacity to avoid handling partially full sheaves. + * + * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the + * sheaf is instead flushed and freed. + */ +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + struct slub_percpu_sheaves *pcs; + struct node_barn *barn; + + if (unlikely(sheaf->capacity != s->sheaf_capacity)) { + sheaf_flush_unused(s, sheaf); + kfree(sheaf); + return; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + barn = get_barn(s); + + if (!pcs->spare) { + pcs->spare = sheaf; + sheaf = NULL; + stat(s, SHEAF_RETURN_FAST); + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!sheaf) + return; + + stat(s, SHEAF_RETURN_SLOW); + + /* + * If the barn has too many full sheaves or we fail to refill the sheaf, + * simply flush and free it. 
+ */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES || + refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + return; + } + + barn_put_full_sheaf(barn, sheaf); + stat(s, BARN_PUT); +} + +/* + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size + * + * the sheaf might be replaced by a new one when requesting more than + * s->sheaf_capacity objects if such replacement is necessary, but the refill + * fails (returning -ENOMEM), the existing sheaf is left intact + * + * In practice we always refill to full sheaf's capacity. + */ +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size) +{ + struct slab_sheaf *sheaf; + + /* + * TODO: do we want to support *sheaf == NULL to be equivalent of + * kmem_cache_prefill_sheaf() ? + */ + if (!sheafp || !(*sheafp)) + return -EINVAL; + + sheaf = *sheafp; + if (sheaf->size >= size) + return 0; + + if (likely(sheaf->capacity >= size)) { + if (likely(sheaf->capacity == s->sheaf_capacity)) + return refill_sheaf(s, sheaf, gfp); + + if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, + &sheaf->objects[sheaf->size])) { + return -ENOMEM; + } + sheaf->size = sheaf->capacity; + + return 0; + } + + /* + * We had a regular sized sheaf and need an oversize one, or we had an + * oversize one already but need a larger one now. + * This should be a very rare path so let's not complicate it. + */ + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); + if (!sheaf) + return -ENOMEM; + + kmem_cache_return_sheaf(s, gfp, *sheafp); + *sheafp = sheaf; + return 0; +} + +/* + * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf() + * + * Guaranteed not to fail as many allocations as was the requested size. + * After the sheaf is emptied, it fails - no fallback to the slab cache itself. + * + * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT + * memcg charging is forced over limit if necessary, to avoid failure. + */ +void * +kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + void *ret = NULL; + bool init; + + if (sheaf->size == 0) + goto out; + + ret = sheaf->objects[--sheaf->size]; + + init = slab_want_init_on_alloc(gfp, s); + + /* add __GFP_NOFAIL to force successful memcg charging */ + slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size); +out: + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE); + + return ret; +} + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) +{ + return sheaf->size; +} +/* * To avoid unnecessary overhead, we pass through large allocation requests * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. @@ -4378,6 +5616,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc_noprof); +/** + * kmalloc_nolock - Allocate an object of given size from any context. + * @size: size to allocate + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT + * allowed. + * @node: node number of the target node. + * + * Return: pointer to the new object or NULL in case of error. + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. + * There is no reason to call it again and expect !NULL. 
+ */ +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +{ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; + struct kmem_cache *s; + bool can_retry = true; + void *ret = ERR_PTR(-EBUSY); + + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | + __GFP_NO_OBJ_EXT)); + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + return NULL; +retry: + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) + /* + * kmalloc_nolock() is not supported on architectures that + * don't implement cmpxchg16b, but debug caches don't use + * per-cpu slab and per-cpu partial slabs. They rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can + * attempt to allocate from debug caches by + * spin_trylock_irqsave(&n->list_lock, ...) + */ + return NULL; + + /* + * Do not call slab_alloc_node(), since trylock mode isn't + * compatible with slab_pre_alloc_hook/should_failslab and + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) + * and slab_post_alloc_hook() directly. + * + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair + * in irq saved region. It assumes that the same cpu will not + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. + * Therefore use in_nmi() to check whether particular bucket is in + * irq protected section. + * + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that + * this cpu was interrupted somewhere inside ___slab_alloc() after + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). + * In this case fast path with __update_cpu_freelist_fast() is not safe. + */ +#ifndef CONFIG_SLUB_TINY + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) +#endif + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + + if (PTR_ERR(ret) == -EBUSY) { + if (can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; + } + ret = NULL; + } + + maybe_wipe_obj_freeptr(s, ret); + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, + slab_want_init_on_alloc(alloc_gfp, s), size); + + ret = kasan_kmalloc(s, ret, size, alloc_gfp); + return ret; +} +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); + void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, unsigned long caller) { @@ -4421,8 +5749,12 @@ static noinline void free_to_partial_list( unsigned long flags; depot_stack_handle_t handle = 0; + /* + * We cannot use GFP_NOWAIT as there are callsites where waking up + * kswapd could deadlock + */ if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); + handle = set_track_prepare(__GFP_NOWARN); spin_lock_irqsave(&n->list_lock, flags); @@ -4591,6 +5923,537 @@ slab_empty: discard_slab(s, slab); } +/* + * pcs is locked. We should have get rid of the spare sheaf and obtained an + * empty sheaf, while the main sheaf is full. We want to install the empty sheaf + * as a main sheaf, and make the current main sheaf a spare sheaf. + * + * However due to having relinquished the cpu_sheaves lock when obtaining + * the empty sheaf, we need to handle some unlikely but possible cases. 
+ * + * If we put any sheaf to barn here, it's because we were interrupted or have + * been migrated to a different cpu, which should be rare enough so just ignore + * the barn's limits to simplify the handling. + * + * An alternative scenario that gets us here is when we fail + * barn_replace_full_sheaf(), because there's no empty sheaf available in the + * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the + * limit on full sheaves was not exceeded, we assume it didn't change and just + * put the full sheaf there. + */ +static void __pcs_install_empty_sheaf(struct kmem_cache *s, + struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty) +{ + struct node_barn *barn; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + /* This is what we expect to find if nobody interrupted us. */ + if (likely(!pcs->spare)) { + pcs->spare = pcs->main; + pcs->main = empty; + return; + } + + barn = get_barn(s); + + /* + * Unlikely because if the main sheaf had space, we would have just + * freed to it. Get rid of our empty sheaf. + */ + if (pcs->main->size < s->sheaf_capacity) { + barn_put_empty_sheaf(barn, empty); + return; + } + + /* Also unlikely for the same reason */ + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + barn_put_empty_sheaf(barn, empty); + return; + } + + /* + * We probably failed barn_replace_full_sheaf() due to no empty sheaf + * available there, but we allocated one, so finish the job. + */ + barn_put_full_sheaf(barn, pcs->main); + stat(s, BARN_PUT); + pcs->main = empty; +} + +/* + * Replace the full main sheaf with a (at least partially) empty sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +{ + struct slab_sheaf *empty; + struct node_barn *barn; + bool put_fail; + +restart: + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + barn = get_barn(s); + put_fail = false; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (empty) { + pcs->spare = pcs->main; + pcs->main = empty; + return pcs; + } + goto alloc_empty; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + return pcs; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + + if (!IS_ERR(empty)) { + stat(s, BARN_PUT); + pcs->main = empty; + return pcs; + } + + if (PTR_ERR(empty) == -E2BIG) { + /* Since we got here, spare exists and is full */ + struct slab_sheaf *to_flush = pcs->spare; + + stat(s, BARN_PUT_FAIL); + + pcs->spare = NULL; + local_unlock(&s->cpu_sheaves->lock); + + sheaf_flush_unused(s, to_flush); + empty = to_flush; + goto got_empty; + } + + /* + * We could not replace full sheaf because barn had no empty + * sheaves. We can still allocate it and put the full sheaf in + * __pcs_install_empty_sheaf(), but if we fail to allocate it, + * make sure to count the fail. 
+ */ + put_fail = true; + +alloc_empty: + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + if (empty) + goto got_empty; + + if (put_fail) + stat(s, BARN_PUT_FAIL); + + if (!sheaf_flush_main(s)) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * we flushed the main sheaf so it should be empty now, + * but in case we got preempted or migrated, we need to + * check again + */ + if (pcs->main->size == s->sheaf_capacity) + goto restart; + + return pcs; + +got_empty: + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + return NULL; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + __pcs_install_empty_sheaf(s, pcs, empty); + + return pcs; +} + +/* + * Free an object to the percpu sheaves. + * The object is expected to have passed slab_free_hook() already. + */ +static __fastpath_inline +bool free_to_pcs(struct kmem_cache *s, void *object) +{ + struct slub_percpu_sheaves *pcs; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return false; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == s->sheaf_capacity)) { + + pcs = __pcs_replace_full_main(s, pcs); + if (unlikely(!pcs)) + return false; + } + + pcs->main->objects[pcs->main->size++] = object; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_PCS); + + return true; +} + +static void rcu_free_sheaf(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct node_barn *barn; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + + s = sheaf->cache; + + /* + * This may remove some objects due to slab_free_hook() returning false, + * so that the sheaf might no longer be completely full. But it's easier + * to handle it as full (unless it became completely empty), as the code + * handles it fine. The only downside is that sheaf will serve fewer + * allocations when reused. It only happens due to debugging, which is a + * performance hit anyway. + */ + __rcu_free_sheaf_prepare(s, sheaf); + + barn = get_node(s, sheaf->node)->barn; + + /* due to slab_free_hook() */ + if (unlikely(sheaf->size == 0)) + goto empty; + + /* + * Checking nr_full/nr_empty outside lock avoids contention in case the + * barn is at the respective limit. Due to the race we might go over the + * limit but that should be rare and harmless. 
+ */ + + if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) { + stat(s, BARN_PUT); + barn_put_full_sheaf(barn, sheaf); + return; + } + + stat(s, BARN_PUT_FAIL); + sheaf_flush_unused(s, sheaf); + +empty: + if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) { + barn_put_empty_sheaf(barn, sheaf); + return; + } + + free_empty_sheaf(s, sheaf); +} + +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_sheaf; + + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fail; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(!pcs->rcu_free)) { + + struct slab_sheaf *empty; + struct node_barn *barn; + + if (pcs->spare && pcs->spare->size == 0) { + pcs->rcu_free = pcs->spare; + pcs->spare = NULL; + goto do_free; + } + + barn = get_barn(s); + + empty = barn_get_empty_sheaf(barn); + + if (empty) { + pcs->rcu_free = empty; + goto do_free; + } + + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + + if (!empty) + goto fail; + + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + goto fail; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->rcu_free)) + barn_put_empty_sheaf(barn, empty); + else + pcs->rcu_free = empty; + } + +do_free: + + rcu_sheaf = pcs->rcu_free; + + /* + * Since we flush immediately when size reaches capacity, we never reach + * this with size already at capacity, so no OOB write is possible. + */ + rcu_sheaf->objects[rcu_sheaf->size++] = obj; + + if (likely(rcu_sheaf->size < s->sheaf_capacity)) { + rcu_sheaf = NULL; + } else { + pcs->rcu_free = NULL; + rcu_sheaf->node = numa_mem_id(); + } + + /* + * we flush before local_unlock to make sure a racing + * flush_all_rcu_sheaves() doesn't miss this sheaf + */ + if (rcu_sheaf) + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_RCU_SHEAF); + return true; + +fail: + stat(s, FREE_RCU_SHEAF_FAIL); + return false; +} + +/* + * Bulk free objects to the percpu sheaves. + * Unlike free_to_pcs() this includes the calls to all necessary hooks + * and the fallback to freeing to slab pages. 
+ */ +static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main, *empty; + bool init = slab_want_init_on_free(s); + unsigned int batch, i = 0; + struct node_barn *barn; + void *remote_objects[PCS_BATCH_MAX]; + unsigned int remote_nr = 0; + int node = numa_mem_id(); + +next_remote_batch: + while (i < size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, false))) { + p[i] = p[--size]; + if (!size) + goto flush_remote; + continue; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { + remote_objects[remote_nr] = p[i]; + p[i] = p[--size]; + if (++remote_nr >= PCS_BATCH_MAX) + goto flush_remote; + continue; + } + + i++; + } + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fallback; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (likely(pcs->main->size < s->sheaf_capacity)) + goto do_free; + + barn = get_barn(s); + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (!empty) + goto no_empty; + + pcs->spare = pcs->main; + pcs->main = empty; + goto do_free; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + goto do_free; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + if (IS_ERR(empty)) { + stat(s, BARN_PUT_FAIL); + goto no_empty; + } + + stat(s, BARN_PUT); + pcs->main = empty; + +do_free: + main = pcs->main; + batch = min(size, s->sheaf_capacity - main->size); + + memcpy(main->objects + main->size, p, batch * sizeof(void *)); + main->size += batch; + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, FREE_PCS, batch); + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return; + +no_empty: + local_unlock(&s->cpu_sheaves->lock); + + /* + * if we depleted all empty sheaves in the barn or there are too + * many full sheaves, free the rest to slab pages + */ +fallback: + __kmem_cache_free_bulk(s, size, p); + +flush_remote: + if (remote_nr) { + __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + if (i < size) { + remote_nr = 0; + goto next_remote_batch; + } + } +} + +struct defer_free { + struct llist_head objects; + struct llist_head slabs; + struct irq_work work; +}; + +static void free_deferred_objects(struct irq_work *work); + +static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { + .objects = LLIST_HEAD_INIT(objects), + .slabs = LLIST_HEAD_INIT(slabs), + .work = IRQ_WORK_INIT(free_deferred_objects), +}; + +/* + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). + */ +static void free_deferred_objects(struct irq_work *work) +{ + struct defer_free *df = container_of(work, struct defer_free, work); + struct llist_head *objs = &df->objects; + struct llist_head *slabs = &df->slabs; + struct llist_node *llnode, *pos, *t; + + if (llist_empty(objs) && llist_empty(slabs)) + return; + + llnode = llist_del_all(objs); + llist_for_each_safe(pos, t, llnode) { + struct kmem_cache *s; + struct slab *slab; + void *x = pos; + + slab = virt_to_slab(x); + s = slab->slab_cache; + + /* + * We used freepointer in 'x' to link 'x' into df->objects. + * Clear it to NULL to avoid false positive detection + * of "Freepointer corruption". 
+ */ + *(void **)x = NULL; + + /* Point 'x' back to the beginning of allocated object */ + x -= s->offset; + __slab_free(s, slab, x, x, 1, _THIS_IP_); + } + + llnode = llist_del_all(slabs); + llist_for_each_safe(pos, t, llnode) { + struct slab *slab = container_of(pos, struct slab, llnode); + +#ifdef CONFIG_SLUB_TINY + discard_slab(slab->slab_cache, slab); +#else + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); +#endif + } +} + +static void defer_free(struct kmem_cache *s, void *head) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + if (llist_add(head + s->offset, &df->objects)) + irq_work_queue(&df->work); +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + slab->flush_freelist = flush_freelist; + if (llist_add(&slab->llnode, &df->slabs)) + irq_work_queue(&df->work); +} + +void defer_free_barrier(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -4611,6 +6474,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { + /* cnt == 0 signals that it's called from kfree_nolock() */ + bool allow_spin = cnt; struct kmem_cache_cpu *c; unsigned long tid; void **freelist; @@ -4629,10 +6494,29 @@ redo: barrier(); if (unlikely(slab != c->slab)) { - __slab_free(s, slab, head, tail, cnt, addr); + if (unlikely(!allow_spin)) { + /* + * __slab_free() can locklessly cmpxchg16 into a slab, + * but then it might need to take spin_lock or local_lock + * in put_cpu_partial() for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, head); + } else { + __slab_free(s, slab, head, tail, cnt, addr); + } return; } + if (unlikely(!allow_spin)) { + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && + local_lock_is_locked(&s->cpu_slab->lock)) { + defer_free(s, head); + return; + } + cnt = 1; /* restore cnt. 
kfree_nolock() frees one object at a time */ + } + if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); @@ -4643,11 +6527,13 @@ redo: goto redo; } } else { + __maybe_unused unsigned long flags = 0; + /* Update the free list under the local lock */ - local_lock(&s->cpu_slab->lock); + local_lock_cpu_slab(s, flags); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); goto redo; } tid = c->tid; @@ -4657,7 +6543,7 @@ redo: c->freelist = head; c->tid = next_tid(tid); - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); } stat_add(s, FREE_FASTPATH, cnt); } @@ -4677,8 +6563,16 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, memcg_slab_free_hook(s, slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) - do_slab_free(s, slab, object, object, 1, addr); + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) + return; + + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id())) { + if (likely(free_to_pcs(s, object))) + return; + } + + do_slab_free(s, slab, object, object, 1, addr); } #ifdef CONFIG_MEMCG @@ -4880,8 +6774,73 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +/* + * Can be called while holding raw_spinlock_t or from IRQ and NMI, + * but ONLY for objects allocated by kmalloc_nolock(). + * Debug checks (like kmemleak and kfence) were skipped on allocation, + * hence + * obj = kmalloc(); kfree_nolock(obj); + * will miss kmemleak/kfence book keeping and will cause false positives. + * large_kmalloc is not supported either. + */ +void kfree_nolock(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + memcg_slab_free_hook(s, slab, &x, 1); + alloc_tagging_slab_free_hook(s, slab, &x, 1); + /* + * Unlike slab_free() do NOT call the following: + * kmemleak_free_recursive(x, s->flags); + * debug_check_no_locks_freed(x, s->object_size); + * debug_check_no_obj_freed(x, s->object_size); + * __kcsan_check_access(x, s->object_size, ..); + * kfence_free(x); + * since they take spinlocks or not safe from any context. + */ + kmsan_slab_free(s, x); + /* + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() + * which will call raw_spin_lock_irqsave() which is technically + * unsafe from NMI, but take chance and report kernel bug. + * The sequence of + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU + * is double buggy and deserves to deadlock. + */ + if (kasan_slab_pre_free(s, x)) + return; + /* + * memcg, kasan_slab_pre_free are done for 'x'. + * The only thing left is kasan_poison without quarantine, + * since kasan quarantine takes locks and not supported from NMI. 
+ */ + kasan_slab_free(s, x, false, false, /* skip quarantine */true); +#ifndef CONFIG_SLUB_TINY + do_slab_free(s, slab, x, x, 0, _RET_IP_); +#else + defer_free(s, x); +#endif +} +EXPORT_SYMBOL_GPL(kfree_nolock); + static __always_inline __realloc_size(2) void * -__do_krealloc(const void *p, size_t new_size, gfp_t flags) +__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid) { void *ret; size_t ks = 0; @@ -4895,6 +6854,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (!kasan_check_byte(p)) return NULL; + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. + */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { @@ -4917,6 +6886,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (new_size > ks) goto alloc_new; + /* If the old object doesn't satisfy the new alignment, allocate a new one */ + if (!IS_ALIGNED((unsigned long)p, align)) + goto alloc_new; + /* Zero out spare memory. */ if (want_init_on_alloc(flags)) { kasan_disable_current(); @@ -4939,7 +6912,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; alloc_new: - ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); @@ -4951,14 +6924,19 @@ alloc_new: } /** - * krealloc - reallocate memory. The contents will remain unchanged. + * krealloc_node_align - reallocate memory. The contents will remain unchanged. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. + * @align: desired alignment. * @flags: the type of memory to allocate. + * @nid: NUMA node or NUMA_NO_NODE * * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that @@ -4983,7 +6961,8 @@ alloc_new: * * Return: pointer to the allocated memory or %NULL in case of error */ -void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) +void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align, + gfp_t flags, int nid) { void *ret; @@ -4992,13 +6971,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) return ZERO_SIZE_PTR; } - ret = __do_krealloc(p, new_size, flags); + ret = __do_krealloc(p, new_size, align, flags, nid); if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) kfree(p); return ret; } -EXPORT_SYMBOL(krealloc_noprof); +EXPORT_SYMBOL(krealloc_node_align_noprof); static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) { @@ -5029,9 +7008,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * failure, fall back to non-contiguous (vmalloc) allocation. 
* @size: size of the request. * @b: which set of kmalloc buckets to allocate from. + * @align: desired alignment. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * @@ -5041,7 +7024,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) { void *ret; @@ -5071,7 +7055,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } @@ -5115,14 +7099,19 @@ void kvfree_sensitive(const void *addr, size_t len) EXPORT_SYMBOL(kvfree_sensitive); /** - * kvrealloc - reallocate memory; contents remain unchanged + * kvrealloc_node_align - reallocate memory; contents remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: desired alignment * @flags: the flags for the page level allocator + * @nid: NUMA node id * * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 * and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that @@ -5136,17 +7125,18 @@ EXPORT_SYMBOL(kvfree_sensitive); * * Return: pointer to the allocated memory or %NULL in case of error */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { void *n; if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); + return vrealloc_node_align_noprof(p, size, align, flags, nid); - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid); if (!n) { /* We failed to krealloc(), fall back to kvmalloc(). 
*/ - n = kvmalloc_noprof(size, flags); + n = kvmalloc_node_align_noprof(size, align, flags, nid); if (!n) return NULL; @@ -5162,7 +7152,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) return n; } -EXPORT_SYMBOL(kvrealloc_noprof); +EXPORT_SYMBOL(kvrealloc_node_align_noprof); struct detached_freelist { struct slab *slab; @@ -5273,6 +7263,15 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!size) return; + /* + * freeing to sheaves is so incompatible with the detached freelist so + * once we go that way, we have to do everything differently + */ + if (s && s->cpu_sheaves) { + free_to_pcs_bulk(s, size, p); + return; + } + do { struct detached_freelist df; @@ -5391,7 +7390,7 @@ error: int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - int i; + unsigned int i = 0; if (!size) return 0; @@ -5400,9 +7399,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; - i = __kmem_cache_alloc_bulk(s, flags, size, p); - if (unlikely(i == 0)) - return 0; + if (s->cpu_sheaves) + i = alloc_from_pcs_bulk(s, size, p); + + if (i < size) { + /* + * If we ran out of memory, don't bother with freeing back to + * the percpu sheaves, we have bigger problems. + */ + if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { + if (i > 0) + __kmem_cache_free_bulk(s, i, p); + return 0; + } + } /* * memcg and kmem_cache debug support and memory initialization. @@ -5412,11 +7422,11 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s), s->object_size))) { return 0; } - return i; + + return size; } EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - /* * Object placement in a slab is made very easy because we always start at * offset 0. 
If we tune the size of the object to the alignment then we can @@ -5550,7 +7560,7 @@ static inline int calculate_order(unsigned int size) } static void -init_kmem_cache_node(struct kmem_cache_node *n) +init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -5560,6 +7570,9 @@ init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif + n->barn = barn; + if (barn) + barn_init(barn); } #ifndef CONFIG_SLUB_TINY @@ -5590,6 +7603,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) } #endif /* CONFIG_SLUB_TINY */ +static int init_percpu_sheaves(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + local_trylock_init(&pcs->lock); + + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + + if (!pcs->main) + return -ENOMEM; + } + + return 0; +} + static struct kmem_cache *kmem_cache_node; /* @@ -5625,7 +7658,7 @@ static void early_kmem_cache_node_alloc(int node) slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; kmem_cache_node->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, NULL); inc_slabs_node(kmem_cache_node, node, slab->objects); /* @@ -5641,6 +7674,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { + if (n->barn) { + WARN_ON(n->barn->nr_full); + WARN_ON(n->barn->nr_empty); + kfree(n->barn); + n->barn = NULL; + } + s->node[node] = NULL; kmem_cache_free(kmem_cache_node, n); } @@ -5649,7 +7689,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); + if (s->cpu_sheaves) + pcs_destroy(s); #ifndef CONFIG_SLUB_TINY +#ifdef CONFIG_PREEMPT_RT + lockdep_unregister_key(&s->lock_key); +#endif free_percpu(s->cpu_slab); #endif free_kmem_cache_nodes(s); @@ -5661,20 +7706,29 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; + struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) + return 0; + } + n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - if (!n) { - free_kmem_cache_nodes(s); + kfree(barn); return 0; } - init_kmem_cache_node(n); + init_kmem_cache_node(n, barn); + s->node[node] = n; } return 1; @@ -5929,8 +7983,15 @@ int __kmem_cache_shutdown(struct kmem_cache *s) struct kmem_cache_node *n; flush_all_cpus_locked(s); + + /* we might have rcu sheaves in flight */ + if (s->cpu_sheaves) + rcu_barrier(); + /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { + if (n->barn) + barn_shrink(s, n->barn); free_partial(s, n); if (n->nr_partial || node_nr_slabs(n)) return 1; @@ -6134,6 +8195,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); + if (n->barn) + barn_shrink(s, n->barn); + spin_lock_irqsave(&n->list_lock, flags); /* @@ -6213,12 +8277,24 @@ static int slab_mem_going_online_callback(int nid) */ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn = NULL; + /* * The structure may already exist if the node was previously * onlined and offlined. 
*/ if (get_node(s, nid)) continue; + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + } + /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that @@ -6226,10 +8302,13 @@ static int slab_mem_going_online_callback(int nid) */ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { + kfree(barn); ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + + init_kmem_cache_node(n, barn); + s->node[nid] = n; } /* @@ -6442,6 +8521,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, set_cpu_partial(s); + if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) + && !(s->flags & SLAB_DEBUG_FLAGS)) { + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; + } + // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? + s->sheaf_capacity = args->sheaf_capacity; + } + #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -6458,6 +8548,12 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!alloc_kmem_cache_cpus(s)) goto out; + if (s->cpu_sheaves) { + err = init_percpu_sheaves(s); + if (err) + goto out; + } + err = 0; /* Mutex is not taken during early boot */ @@ -6499,6 +8595,11 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, void *p; void *addr = slab_address(slab); + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return; + } + if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) return; @@ -6910,6 +9011,12 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(order); +static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->sheaf_capacity); +} +SLAB_ATTR_RO(sheaf_capacity); + static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%lu\n", s->min_partial); @@ -7257,8 +9364,12 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ +STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_PCS, free_cpu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -7283,6 +9394,19 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +STAT_ATTR(SHEAF_FLUSH, sheaf_flush); +STAT_ATTR(SHEAF_REFILL, sheaf_refill); +STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); +STAT_ATTR(SHEAF_FREE, sheaf_free); +STAT_ATTR(BARN_GET, barn_get); +STAT_ATTR(BARN_GET_FAIL, barn_get_fail); +STAT_ATTR(BARN_PUT, barn_put); +STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); +STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast); +STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow); +STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize); +STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast); +STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow); #endif /* CONFIG_SLUB_STATS */ #ifdef CONFIG_KFENCE @@ -7313,6 +9437,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &sheaf_capacity_attr.attr, &min_partial_attr.attr, &cpu_partial_attr.attr, &objects_partial_attr.attr, 
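Taken together with the sheaf_capacity handling in do_kmem_cache_create() above, the prefill API added earlier (kmem_cache_prefill_sheaf(), kmem_cache_alloc_from_sheaf(), kmem_cache_return_sheaf()) is meant to be used roughly as follows. This is a sketch, not code from the patch: it assumes struct kmem_cache_args has gained a sheaf_capacity field as do_kmem_cache_create() implies, and that kmem_cache_alloc_from_sheaf() is the usual alloc-tagging wrapper around the _noprof function defined above; my_obj, my_cache and my_insert_batch() are invented.

#include <linux/list.h>
#include <linux/slab.h>

struct my_obj {
	struct list_head list;
	int val;
};

static struct kmem_cache *my_cache;

static int __init my_cache_init(void)
{
	struct kmem_cache_args args = {
		.sheaf_capacity = 32,	/* opt this cache into percpu sheaves */
	};

	my_cache = kmem_cache_create("my_obj", sizeof(struct my_obj), &args, 0);
	return my_cache ? 0 : -ENOMEM;
}

static int my_insert_batch(struct list_head *head, unsigned int n)
{
	struct slab_sheaf *sheaf;
	unsigned int i;

	/* reserve n objects up front, while sleeping is still allowed */
	sheaf = kmem_cache_prefill_sheaf(my_cache, GFP_KERNEL, n);
	if (!sheaf)
		return -ENOMEM;

	/* up to n allocations from the prefilled sheaf are guaranteed to succeed */
	for (i = 0; i < n; i++) {
		struct my_obj *obj;

		obj = kmem_cache_alloc_from_sheaf(my_cache, __GFP_ZERO, sheaf);
		list_add(&obj->list, head);
	}

	/* return leftovers; ideally the sheaf becomes this CPU's spare sheaf */
	kmem_cache_return_sheaf(my_cache, GFP_KERNEL, sheaf);
	return 0;
}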
@@ -7344,8 +9469,12 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS + &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, + &free_cpu_sheaf_attr.attr, + &free_rcu_sheaf_attr.attr, + &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, @@ -7370,6 +9499,19 @@ static struct attribute *slab_attrs[] = { &cpu_partial_free_attr.attr, &cpu_partial_node_attr.attr, &cpu_partial_drain_attr.attr, + &sheaf_flush_attr.attr, + &sheaf_refill_attr.attr, + &sheaf_alloc_attr.attr, + &sheaf_free_attr.attr, + &barn_get_attr.attr, + &barn_get_fail_attr.attr, + &barn_put_attr.attr, + &barn_put_fail_attr.attr, + &sheaf_prefill_fast_attr.attr, + &sheaf_prefill_slow_attr.attr, + &sheaf_prefill_oversize_attr.attr, + &sheaf_return_fast_attr.attr, + &sheaf_return_slow_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, @@ -7711,15 +9853,12 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) return NULL; } -static int cmp_loc_by_count(const void *a, const void *b, const void *data) +static int cmp_loc_by_count(const void *a, const void *b) { struct location *loc1 = (struct location *)a; struct location *loc2 = (struct location *)b; - if (loc1->count > loc2->count) - return -1; - else - return 1; + return cmp_int(loc2->count, loc1->count); } static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) @@ -7781,8 +9920,8 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) } /* Sort locations by count */ - sort_r(t->loc, t->count, sizeof(struct location), - cmp_loc_by_count, NULL, NULL); + sort(t->loc, t->count, sizeof(struct location), + cmp_loc_by_count, NULL); bitmap_free(obj_map); return 0; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index fd2ab5118e13..dbd8daccade2 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -27,9 +27,9 @@ #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/pgalloc.h> #include <asm/dma.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" @@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) if (!p) return NULL; pud_init(p); - p4d_populate(&init_mm, p4d, p); + p4d_populate_kernel(addr, p4d, p); } return p4d; } @@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; - pgd_populate(&init_mm, pgd, p); + pgd_populate_kernel(addr, pgd, p); } return pgd; } @@ -578,11 +578,6 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn, if (r < 0) return NULL; - if (system_state == SYSTEM_BOOTING) - memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); - else - memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); - return pfn_to_page(pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index 3c012cf83cc2..17c50a6415c2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(const struct page *page) +int memdesc_nid(memdesc_flags_t mdf) { - return section_to_node_table[page_to_section(page)]; + return section_to_node_table[memdesc_section(mdf)]; } -EXPORT_SYMBOL(page_to_nid); +EXPORT_SYMBOL(memdesc_nid); static void set_section_nid(unsigned long section_nr, 
int nid) { @@ -454,9 +454,6 @@ static void __init sparse_buffer_init(unsigned long size, int nid) */ sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; -#ifndef CONFIG_SPARSEMEM_VMEMMAP - memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); -#endif } static void __init sparse_buffer_fini(void) @@ -567,6 +564,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, sparse_buffer_fini(); goto failed; } + memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), + PAGE_SIZE)); sparse_init_early_section(nid, map, pnum, 0); } } @@ -680,7 +679,6 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); - memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE))); vmemmap_free(start, end, altmap); } static void free_map_bootmem(struct page *memmap) @@ -856,10 +854,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * The memmap of early sections is always fully populated. See * section_activate() and pfn_valid() . */ - if (!section_is_early) + if (!section_is_early) { + memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); depopulate_section_memmap(pfn, nr_pages, altmap); - else if (memmap) + } else if (memmap) { + memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), + PAGE_SIZE))); free_map_bootmem(memmap); + } if (empty) ms->section_mem_map = (unsigned long)NULL; @@ -904,6 +906,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, section_deactivate(pfn, nr_pages, altmap); return ERR_PTR(-ENOMEM); } + memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); return memmap; } diff --git a/mm/swap.c b/mm/swap.c index 3632dd061beb..2260dcd2775e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; + /* block memcg migration while the folio moves between lru */ + if (move_fn != lru_add && !folio_test_clear_lru(folio)) + continue; + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, - struct folio *folio, move_fn_t move_fn, - bool on_lru, bool disable_irq) + struct folio *folio, move_fn_t move_fn, bool disable_irq) { unsigned long flags; - if (on_lru && !folio_test_clear_lru(folio)) - return; - folio_get(folio); if (disable_irq) @@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, else local_lock(&cpu_fbatches.lock); - if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || - lru_cache_disabled()) + if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) @@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_unlock(&cpu_fbatches.lock); } -#define folio_batch_add_and_move(folio, op, on_lru) \ - __folio_batch_add_and_move( \ - &cpu_fbatches.op, \ - folio, \ - op, \ - on_lru, \ - offsetof(struct cpu_fbatches, op) >= 
offsetof(struct cpu_fbatches, lock_irq) \ +#define folio_batch_add_and_move(folio, op) \ + __folio_batch_add_and_move( \ + &cpu_fbatches.op, \ + folio, \ + op, \ + offsetof(struct cpu_fbatches, op) >= \ + offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) @@ -231,10 +231,10 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || - folio_test_unevictable(folio)) + folio_test_unevictable(folio) || !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_move_tail, true); + folio_batch_add_and_move(folio, lru_move_tail); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, @@ -328,10 +328,11 @@ static void folio_activate_drain(int cpu) void folio_activate(struct folio *folio) { - if (folio_test_active(folio) || folio_test_unevictable(folio)) + if (folio_test_active(folio) || folio_test_unevictable(folio) || + !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_activate, true); + folio_batch_add_and_move(folio, lru_activate); } #else @@ -387,14 +388,14 @@ static void __lru_cache_activate_folio(struct folio *folio) static void lru_gen_inc_refs(struct folio *folio) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return; } @@ -406,7 +407,7 @@ static void lru_gen_inc_refs(struct folio *folio) } new_flags = old_flags + BIT(LRU_REFS_PGOFF); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) @@ -418,7 +419,7 @@ static bool lru_gen_clear_refs(struct folio *folio) if (gen < 0) return true; - set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ @@ -507,7 +508,7 @@ void folio_add_lru(struct folio *folio) lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); - folio_batch_add_and_move(folio, lru_add, false); + folio_batch_add_and_move(folio, lru_add); } EXPORT_SYMBOL(folio_add_lru); @@ -685,13 +686,13 @@ void lru_add_drain_cpu(int cpu) void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate_file, true); + folio_batch_add_and_move(folio, lru_deactivate_file); } /* @@ -704,13 +705,13 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate, true); + folio_batch_add_and_move(folio, lru_deactivate); } /** @@ -723,10 +724,11 @@ void folio_deactivate(struct folio *folio) void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + !folio_test_lru(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; - folio_batch_add_and_move(folio, lru_lazyfree, true); + folio_batch_add_and_move(folio, lru_lazyfree); } void lru_add_drain(void) @@ -832,6 +834,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus) */ this_gen = smp_load_acquire(&lru_drain_gen); + /* It helps everyone if we do our own local drain immediately. */ + lru_add_drain(); + mutex_lock(&lock); /* @@ -1096,7 +1101,7 @@ static const struct ctl_table swap_sysctl_table[] = { */ void __init swap_setup(void) { - unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/swap.h b/mm/swap.h index 911ad5ff0f89..8d8efdf1297a 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,15 +2,187 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +#include <linux/atomic.h> /* for atomic_long_t */ struct mempolicy; struct swap_iocb; extern int page_cluster; +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#define swap_entry_order(order) (order) +#else +#define SWAPFILE_CLUSTER 256 +#define swap_entry_order(order) 0 +#endif + +extern struct swap_info_struct *swap_info[]; + +/* + * We use this to track usage of a cluster. A cluster is a block of swap disk + * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * + * The flags field determines if a cluster is free. This is + * protected by cluster lock. + */ +struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields + * other than list, and swap_info_struct->swap_map + * elements corresponding to the swap cluster. + */ + u16 count; + u8 flags; + u8 order; + atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ + struct list_head list; +}; + +/* All on-list cluster must have a non-zero flag. */ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; + #ifdef CONFIG_SWAP #include <linux/swapops.h> /* for swp_offset */ #include <linux/blk_types.h> /* for bio_end_io_t */ +static inline unsigned int swp_cluster_offset(swp_entry_t entry) +{ + return swp_offset(entry) % SWAPFILE_CLUSTER; +} + +/* + * Callers of all helpers below must ensure the entry, type, or offset is + * valid, and protect the swap device with reference count or locks. 
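swp_cluster_offset() above and the cluster lookup helpers that follow split a swap offset into a cluster index (offset / SWAPFILE_CLUSTER) and a slot within that cluster's table (offset % SWAPFILE_CLUSTER). A standalone model of that arithmetic, using the non-THP SWAPFILE_CLUSTER of 256 defined above; the sample offset is arbitrary.

#include <stdio.h>

#define SWAPFILE_CLUSTER 256	/* the non-THP value from the header above */

int main(void)
{
	unsigned long offset = 70000;				/* arbitrary swap slot */
	unsigned long cluster = offset / SWAPFILE_CLUSTER;	/* index into si->cluster_info */
	unsigned int ci_off = offset % SWAPFILE_CLUSTER;	/* slot in that cluster's swap table */

	printf("offset %lu -> cluster %lu, slot %u\n", offset, cluster, ci_off);
	/* prints: offset 70000 -> cluster 273, slot 112 */
	return 0;
}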
+ */ +static inline struct swap_info_struct *__swap_type_to_info(int type) +{ + struct swap_info_struct *si; + + si = READ_ONCE(swap_info[type]); /* rcu_dereference() */ + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + return si; +} + +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return __swap_type_to_info(swp_type(entry)); +} + +static inline struct swap_cluster_info *__swap_offset_to_cluster( + struct swap_info_struct *si, pgoff_t offset) +{ + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + VM_WARN_ON_ONCE(offset >= si->max); + return &si->cluster_info[offset / SWAPFILE_CLUSTER]; +} + +static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry) +{ + return __swap_offset_to_cluster(__swap_entry_to_info(entry), + swp_offset(entry)); +} + +static __always_inline struct swap_cluster_info *__swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset, bool irq) +{ + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + + /* + * Nothing modifies swap cache in an IRQ context. All access to + * swap cache is wrapped by swap_cache_* helpers, and swap cache + * writeback is handled outside of IRQs. Swapin or swapout never + * occurs in IRQ, and neither does in-place split or replace. + * + * Besides, modifying swap cache requires synchronization with + * swap_map, which was never IRQ safe. + */ + VM_WARN_ON_ONCE(!in_task()); + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + if (irq) + spin_lock_irq(&ci->lock); + else + spin_lock(&ci->lock); + return ci; +} + +/** + * swap_cluster_lock - Lock and return the swap cluster of given offset. + * @si: swap device the cluster belongs to. + * @offset: the swap entry offset, pointing to a valid slot. + * + * Context: The caller must ensure the offset is in the valid range and + * protect the swap device with reference count or locks. + */ +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset) +{ + return __swap_cluster_lock(si, offset, false); +} + +static inline struct swap_cluster_info *__swap_cluster_get_and_lock( + const struct folio *folio, bool irq) +{ + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + return __swap_cluster_lock(__swap_entry_to_info(folio->swap), + swp_offset(folio->swap), irq); +} + +/* + * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * This locks and returns the swap cluster that contains a folio's swap + * entries. The swap entries of a folio are always in one single cluster. + * The folio has to be locked so its swap entries won't change and the + * cluster won't be freed. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. + */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + const struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, false); +} + +/* + * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * Same as swap_cluster_get_and_lock but also disable IRQ. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. 
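Callers pair these lookups with the swap_cluster_unlock() helpers defined just below, exactly like a spinlock. A short kernel-style sketch of that discipline for a caller that already holds the folio lock; the function name is made up and the table update itself is elided.

static void touch_folio_cluster(struct folio *folio)
{
	struct swap_cluster_info *ci;

	/* The folio is locked and in the swap cache, so its entries are stable. */
	ci = swap_cluster_get_and_lock(folio);

	/* ... update the cluster's swap table slots for this folio ... */

	swap_cluster_unlock(ci);
}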
+ */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + const struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, true); +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ + spin_unlock(&ci->lock); +} + +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ + spin_unlock_irq(&ci->lock); +} + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -26,14 +198,11 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); /* linux/mm/swap_state.c */ -/* One swap address space for each 64M swap space */ -#define SWAP_ADDRESS_SPACE_SHIFT 14 -#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) -#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) -extern struct address_space *swapper_spaces[]; -#define swap_address_space(entry) \ - (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ - >> SWAP_ADDRESS_SPACE_SHIFT]) +extern struct address_space swap_space __ro_after_init; +static inline struct address_space *swap_address_space(swp_entry_t entry) +{ + return &swap_space; +} /* * Return the swap device position of the swap entry. @@ -43,30 +212,52 @@ static inline loff_t swap_dev_pos(swp_entry_t entry) return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } -/* - * Return the swap cache index of the swap entry. +/** + * folio_matches_swap_entry - Check if a folio matches a given swap entry. + * @folio: The folio. + * @entry: The swap entry to check against. + * + * Context: The caller should have the folio locked to ensure it's stable + * and nothing will move it in or out of the swap cache. + * Return: true or false. */ -static inline pgoff_t swap_cache_index(swp_entry_t entry) +static inline bool folio_matches_swap_entry(const struct folio *folio, + swp_entry_t entry) { - BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK); - return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; + swp_entry_t folio_entry = folio->swap; + long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + if (!folio_test_swapcache(folio)) + return false; + VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio_entry.val, nr_pages), folio); + return folio_entry.val == round_down(entry.val, nr_pages); } +/* + * All swap cache helpers below require the caller to ensure the swap entries + * used are valid and stablize the device by any of the following ways: + * - Hold a reference by get_swap_device(): this ensures a single entry is + * valid and increases the swap device's refcount. + * - Locking a folio in the swap cache: this ensures the folio's swap entries + * are valid and pinned, also implies reference to the device. + * - Locking anything referencing the swap entry: e.g. PTL that protects + * swap entries in the page table, similar to locking swap cache folio. + * - See the comment of get_swap_device() for more complex usage. + */ +struct folio *swap_cache_get_folio(swp_entry_t entry); +void *swap_cache_get_shadow(swp_entry_t entry); +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); +void swap_cache_del_folio(struct folio *folio); +/* Below helpers require the caller to lock and pass in the swap cluster. 
*/ +void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow); +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new); +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents); + void show_swap_cache_info(void); -void *get_shadow_from_swap_cache(swp_entry_t entry); -int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp); -void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow); -void delete_from_swap_cache(struct folio *folio); -void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); -struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr); -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index); - struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); @@ -77,10 +268,12 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); +void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr); static inline unsigned int folio_swap_flags(struct folio *folio) { - return swp_swap_info(folio->swap)->flags; + return __swap_entry_to_info(folio->swap)->flags; } /* @@ -91,7 +284,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *is_zeromap) { - struct swap_info_struct *sis = swp_swap_info(entry); + struct swap_info_struct *sis = __swap_entry_to_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + max_nr; bool first_bit; @@ -110,7 +303,7 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); int i; @@ -129,6 +322,37 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, pgoff_t offset, bool irq) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + struct folio *folio) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + struct folio *folio) +{ + return NULL; +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ +} + +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ +} + +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return NULL; +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } @@ -141,9 +365,9 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } -static inline pgoff_t swap_cache_index(swp_entry_t entry) +static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) { - return 0; + return false; } static inline void show_swap_cache_info(void) @@ -162,6 +386,11 @@ static inline struct folio 
*swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline void swap_update_readahead(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr) +{ +} + static inline int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) { @@ -172,41 +401,31 @@ static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entr { } -static inline struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr) +static inline struct folio *swap_cache_get_folio(swp_entry_t entry) { return NULL; } -static inline -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index) -{ - return filemap_get_folio(mapping, index); -} - -static inline void *get_shadow_from_swap_cache(swp_entry_t entry) +static inline void *swap_cache_get_shadow(swp_entry_t entry) { return NULL; } -static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp_mask, void **shadowp) +static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) { - return -1; } -static inline void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow) +static inline void swap_cache_del_folio(struct folio *folio) { } -static inline void delete_from_swap_cache(struct folio *folio) +static inline void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow) { } -static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end) +static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { } @@ -240,8 +459,10 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) */ static inline pgoff_t folio_index(struct folio *folio) { +#ifdef CONFIG_SWAP if (unlikely(folio_test_swapcache(folio))) - return swap_cache_index(folio->swap); + return swp_offset(folio->swap); +#endif return folio->index; } diff --git a/mm/swap_state.c b/mm/swap_state.c index c354435a0923..b13e9c4baa90 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -23,6 +23,7 @@ #include <linux/huge_mm.h> #include <linux/shmem_fs.h> #include "internal.h" +#include "swap_table.h" #include "swap.h" /* @@ -36,8 +37,11 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +/* Set swap_space as read only as swap cache is handled by swap table */ +struct address_space swap_space __ro_after_init = { + .a_ops = &swap_aops, +}; + static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_ORDER_CEILING 5 @@ -69,150 +73,237 @@ void show_swap_cache_info(void) printk("Total swap = %lukB\n", K(total_swap_pages)); } -void *get_shadow_from_swap_cache(swp_entry_t entry) +/** + * swap_cache_get_folio - Looks up a folio in the swap cache. + * @entry: swap entry used for the lookup. + * + * A found folio will be returned unlocked and with its refcount increased. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * Return: Returns the found folio on success, NULL otherwise. The caller + * must lock nd check if the folio still matches the swap entry before + * use (e.g., folio_matches_swap_entry). 
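That contract is worth spelling out: the folio comes back unlocked, so it can be removed from the swap cache or reused for another entry before the caller locks it. Callers therefore lock and revalidate with folio_matches_swap_entry(), retrying on failure, which is the shape __try_to_reclaim_swap() takes later in this diff (there with folio_trylock()). A hypothetical helper sketching the pattern:

static struct folio *swapcache_lookup_locked(swp_entry_t entry)
{
	struct folio *folio;

again:
	folio = swap_cache_get_folio(entry);
	if (!folio)
		return NULL;

	folio_lock(folio);
	/* The folio may have moved on while it was unlocked. */
	if (!folio_matches_swap_entry(folio, entry)) {
		folio_unlock(folio);
		folio_put(folio);
		goto again;
	}

	/* Locked and verified: still caches @entry. */
	return folio;
}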
+ */ +struct folio *swap_cache_get_folio(swp_entry_t entry) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - void *shadow; + unsigned long swp_tb; + struct folio *folio; + + for (;;) { + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (!swp_tb_is_folio(swp_tb)) + return NULL; + folio = swp_tb_to_folio(swp_tb); + if (likely(folio_try_get(folio))) + return folio; + } - shadow = xa_load(&address_space->i_pages, idx); - if (xa_is_value(shadow)) - return shadow; return NULL; } -/* - * add_to_swap_cache resembles filemap_add_folio on swapper_space, - * but sets SwapCache flag and 'swap' instead of mapping and index. +/** + * swap_cache_get_shadow - Looks up a shadow in the swap cache. + * @entry: swap entry used for the lookup. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * Return: Returns either NULL or an XA_VALUE (shadow). */ -int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp) +void *swap_cache_get_shadow(swp_entry_t entry) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); - unsigned long i, nr = folio_nr_pages(folio); - void *old; + unsigned long swp_tb; - xas_set_update(&xas, workingset_update_node); + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_is_shadow(swp_tb)) + return swp_tb_to_shadow(swp_tb); + return NULL; +} - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); +/** + * swap_cache_add_folio - Add a folio into the swap cache. + * @folio: The folio to be added. + * @entry: The swap entry corresponding to the folio. + * @gfp: gfp_mask for XArray node allocation. + * @shadowp: If a shadow is found, return the shadow. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * The caller also needs to update the corresponding swap_map slots with + * SWAP_HAS_CACHE bit to avoid race or conflict. 
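The lookup loop above reads the table without the cluster lock (swap_table_get() is the RCU variant), so the folio pointer it sees may belong to a folio whose last reference is being dropped; folio_try_get() then fails and the slot is simply re-read. A standalone model of that speculative-reference idiom in C11 atomics; the object type and refcount rules are simplified stand-ins, and unlike the kernel it does not deal with the object's memory being recycled.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct obj {
	atomic_int refcount;		/* 0 means the object is going away */
};

static _Atomic(struct obj *) slot;	/* one published "table" slot */

/* Speculative get: only succeeds while the refcount is still non-zero. */
static bool obj_try_get(struct obj *o)
{
	int ref = atomic_load(&o->refcount);

	while (ref > 0) {
		if (atomic_compare_exchange_weak(&o->refcount, &ref, ref + 1))
			return true;
	}
	return false;
}

static struct obj *slot_get(void)
{
	struct obj *o;

	for (;;) {
		o = atomic_load(&slot);
		if (!o)
			return NULL;	/* nothing cached in this slot */
		if (obj_try_get(o))
			return o;	/* reference taken, safe to use */
		/* Lost against the final put; re-read the slot and retry. */
	}
}

int main(void)
{
	static struct obj o = { .refcount = 1 };	/* owner's reference */

	atomic_store(&slot, &o);
	return slot_get() ? 0 : 1;
}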
+ */ +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) +{ + void *shadow = NULL; + unsigned long old_tb, new_tb; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + + new_tb = folio_to_swp_tb(folio); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + do { + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + if (swp_tb_is_shadow(old_tb)) + shadow = swp_tb_to_shadow(old_tb); + } while (++ci_off < ci_end); - folio_ref_add(folio, nr); + folio_ref_add(folio, nr_pages); folio_set_swapcache(folio); folio->swap = entry; + swap_cluster_unlock(ci); - do { - xas_lock_irq(&xas); - xas_create_range(&xas); - if (xas_error(&xas)) - goto unlock; - for (i = 0; i < nr; i++) { - VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); - if (shadowp) { - old = xas_load(&xas); - if (xa_is_value(old)) - *shadowp = old; - } - xas_store(&xas, folio); - xas_next(&xas); - } - address_space->nrpages += nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); -unlock: - xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); - - if (!xas_error(&xas)) - return 0; + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); - folio_clear_swapcache(folio); - folio_ref_sub(folio, nr); - return xas_error(&xas); + if (shadowp) + *shadowp = shadow; } -/* - * This must be called only on folios that have - * been verified to be in the swap cache. +/** + * __swap_cache_del_folio - Removes a folio from the swap cache. + * @ci: The locked swap cluster. + * @folio: The folio. + * @entry: The first swap entry that the folio corresponds to. + * @shadow: shadow value to be filled in the swap cache. + * + * Removes a folio from the swap cache and fills a shadow in place. + * This won't put the folio's refcount. The caller has to do that. + * + * Context: Caller must ensure the folio is locked and in the swap cache + * using the index of @entry, and lock the cluster that holds the entries. 
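swap_cache_add_folio() above publishes one folio across all of its slots by looping an atomic exchange over the cluster-local range [ci_start, ci_end) and keeping any shadow value it displaces. A standalone model of that loop with C11 atomics; the flat array, the SHADOW_TAG bit and the helper name are stand-ins for the real swap table, the XArray value tag and the kernel helpers.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOTS		8
#define SHADOW_TAG	1UL	/* low bit set marks a shadow, like an XArray value */

/* Each slot is NULL, an object pointer, or a tagged shadow value. */
static atomic_uintptr_t table[NSLOTS];

static int is_shadow(uintptr_t v)
{
	return v & SHADOW_TAG;
}

/* Store @obj into @nr consecutive slots, remembering any shadow displaced. */
static uintptr_t publish_range(void *obj, unsigned int start, unsigned int nr)
{
	unsigned int i = start, end = start + nr;
	uintptr_t old, shadow = 0;

	do {
		old = atomic_exchange(&table[i], (uintptr_t)obj);
		if (is_shadow(old))
			shadow = old;
	} while (++i < end);

	return shadow;
}

int main(void)
{
	int obj;

	atomic_store(&table[2], (5UL << 1) | SHADOW_TAG);	/* pre-existing shadow */
	printf("displaced shadow: %#lx\n",
	       (unsigned long)publish_range(&obj, 0, 4));
	return 0;
}

__swap_cache_del_folio() below is the mirror image: it exchanges the same range back to a shadow (or NULL) and checks that every displaced value was the folio being removed.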
*/ -void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow) +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, + swp_entry_t entry, void *shadow) { - struct address_space *address_space = swap_address_space(entry); - int i; - long nr = folio_nr_pages(folio); - pgoff_t idx = swap_cache_index(entry); - XA_STATE(xas, &address_space->i_pages, idx); - - xas_set_update(&xas, workingset_update_node); - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - - for (i = 0; i < nr; i++) { - void *entry = xas_store(&xas, shadow); - VM_BUG_ON_PAGE(entry != folio, entry); - xas_next(&xas); - } + unsigned long old_tb, new_tb; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + + new_tb = shadow_swp_to_tb(shadow); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + do { + /* If shadow is NULL, we sets an empty shadow */ + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || + swp_tb_to_folio(old_tb) != folio); + } while (++ci_off < ci_end); + folio->swap.val = 0; folio_clear_swapcache(folio); - address_space->nrpages -= nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); + node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); } -/* - * This must be called only on folios that have - * been verified to be in the swap cache and locked. - * It will never put the folio into the free list, - * the caller has a reference on the folio. +/** + * swap_cache_del_folio - Removes a folio from the swap cache. + * @folio: The folio. + * + * Same as __swap_cache_del_folio, but handles lock and refcount. The + * caller must ensure the folio is either clean or has a swap count + * equal to zero, or it may cause data loss. + * + * Context: Caller must ensure the folio is locked and in the swap cache. */ -void delete_from_swap_cache(struct folio *folio) +void swap_cache_del_folio(struct folio *folio) { + struct swap_cluster_info *ci; swp_entry_t entry = folio->swap; - struct address_space *address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(folio, entry, NULL); - xa_unlock_irq(&address_space->i_pages); + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + __swap_cache_del_folio(ci, folio, entry, NULL); + swap_cluster_unlock(ci); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } -void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end) +/** + * __swap_cache_replace_folio - Replace a folio in the swap cache. + * @ci: The locked swap cluster. + * @old: The old folio to be replaced. + * @new: The new folio. + * + * Replace an existing folio in the swap cache with a new folio. The + * caller is responsible for setting up the new folio's flag and swap + * entries. Replacement will take the new folio's swap entry value as + * the starting offset to override all slots covered by the new folio. 
+ * + * Context: Caller must ensure both folios are locked, and lock the + * cluster that holds the old folio to be replaced. + */ +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { - unsigned long curr = begin; - void *old; - - for (;;) { - swp_entry_t entry = swp_entry(type, curr); - unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; - struct address_space *address_space = swap_address_space(entry); - XA_STATE(xas, &address_space->i_pages, index); - - xas_set_update(&xas, workingset_update_node); - - xa_lock_irq(&address_space->i_pages); - xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { - if (!xa_is_value(old)) - continue; - xas_store(&xas, NULL); - } - xa_unlock_irq(&address_space->i_pages); + swp_entry_t entry = new->swap; + unsigned long nr_pages = folio_nr_pages(new); + unsigned int ci_off = swp_cluster_offset(entry); + unsigned int ci_end = ci_off + nr_pages; + unsigned long old_tb, new_tb; + + VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); + VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); + VM_WARN_ON_ONCE(!entry.val); + + /* Swap cache still stores N entries instead of a high-order entry */ + new_tb = folio_to_swp_tb(new); + do { + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + } while (++ci_off < ci_end); - /* search the next swapcache until we meet end */ - curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); - if (curr > end) - break; + /* + * If the old folio is partially replaced (e.g., splitting a large + * folio, the old folio is shrunk, and new split sub folios replace + * the shrunk part), ensure the new folio doesn't overlap it. + */ + if (IS_ENABLED(CONFIG_DEBUG_VM) && + folio_order(old) != folio_order(new)) { + ci_off = swp_cluster_offset(old->swap); + ci_end = ci_off + folio_nr_pages(old); + while (ci_off++ < ci_end) + WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); } } +/** + * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. + * @entry: The starting index entry. + * @nr_ents: How many slots need to be cleared. + * + * Context: Caller must ensure the range is valid, all in one single cluster, + * not occupied by any folio, and lock the cluster. + */ +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) +{ + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry), ci_end; + unsigned long old; + + ci_end = ci_off + nr_ents; + do { + old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); + WARN_ON_ONCE(swp_tb_is_folio(old)); + } while (++ci_off < ci_end); +} + /* * If we are the only user, then try to free up the swap cache. * @@ -272,100 +363,50 @@ static inline bool swap_use_vma_readahead(void) return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } -/* - * Lookup a swap entry in the swap cache. A found folio will be returned - * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the folio - * lock before returning. - * - * Caller must lock the swap device or hold a reference to keep it valid. +/** + * swap_update_readahead - Update the readahead statistics of VMA or globally. + * @folio: the swap cache folio that just got hit. + * @vma: the VMA that should be updated, could be NULL for global update. 
+ * @addr: the addr that triggered the swapin, ignored if @vma is NULL. */ -struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr) +void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr) { - struct folio *folio; - - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); - if (!IS_ERR(folio)) { - bool vma_ra = swap_use_vma_readahead(); - bool readahead; + bool readahead, vma_ra = swap_use_vma_readahead(); - /* - * At the moment, we don't support PG_readahead for anon THP - * so let's bail out rather than confusing the readahead stat. - */ - if (unlikely(folio_test_large(folio))) - return folio; - - readahead = folio_test_clear_readahead(folio); - if (vma && vma_ra) { - unsigned long ra_val; - int win, hits; - - ra_val = GET_SWAP_RA_VAL(vma); - win = SWAP_RA_WIN(ra_val); - hits = SWAP_RA_HITS(ra_val); - if (readahead) - hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); - atomic_long_set(&vma->swap_readahead_info, - SWAP_RA_VAL(addr, win, hits)); - } - - if (readahead) { - count_vm_event(SWAP_RA_HIT); - if (!vma || !vma_ra) - atomic_inc(&swapin_readahead_hits); - } - } else { - folio = NULL; + /* + * At the moment, we don't support PG_readahead for anon THP + * so let's bail out rather than confusing the readahead stat. + */ + if (unlikely(folio_test_large(folio))) + return; + + readahead = folio_test_clear_readahead(folio); + if (vma && vma_ra) { + unsigned long ra_val; + int win, hits; + + ra_val = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); } - return folio; -} - -/** - * filemap_get_incore_folio - Find and get a folio from the page or swap caches. - * @mapping: The address_space to search. - * @index: The page cache index. - * - * This differs from filemap_get_folio() in that it will also look for the - * folio in the swap cache. - * - * Return: The found folio or %NULL. - */ -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index) -{ - swp_entry_t swp; - struct swap_info_struct *si; - struct folio *folio = filemap_get_entry(mapping, index); - - if (!folio) - return ERR_PTR(-ENOENT); - if (!xa_is_value(folio)) - return folio; - if (!shmem_mapping(mapping)) - return ERR_PTR(-ENOENT); - - swp = radix_to_swp_entry(folio); - /* There might be swapin error entries in shmem mapping. */ - if (non_swap_entry(swp)) - return ERR_PTR(-ENOENT); - /* Prevent swapoff from happening to us */ - si = get_swap_device(swp); - if (!si) - return ERR_PTR(-ENOENT); - index = swap_cache_index(swp); - folio = filemap_get_folio(swap_address_space(swp), index); - put_swap_device(si); - return folio; + if (readahead) { + count_vm_event(SWAP_RA_HIT); + if (!vma || !vma_ra) + atomic_inc(&swapin_readahead_hits); + } } struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; @@ -374,14 +415,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, *new_page_allocated = false; for (;;) { int err; + /* - * First check the swap cache. 
Since this is normally - * called after swap_cache_get_folio() failed, re-calling - * that would confuse statistics. + * Check the swap cache first, if a cached folio is found, + * return it unlocked. The caller will lock and check it. */ - folio = filemap_get_folio(swap_address_space(entry), - swap_cache_index(entry)); - if (!IS_ERR(folio)) + folio = swap_cache_get_folio(entry); + if (folio) goto got_folio; /* @@ -423,7 +463,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto put_and_return; /* - * We might race against __delete_from_swap_cache(), and + * We might race against __swap_cache_del_folio(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE @@ -441,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) - goto fail_unlock; - + swap_cache_add_folio(new_folio, entry, &shadow); memcg1_swapin(entry, 1); if (shadow) @@ -590,7 +627,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; @@ -636,41 +673,6 @@ skip: return folio; } -int init_swap_address_space(unsigned int type, unsigned long nr_pages) -{ - struct address_space *spaces, *space; - unsigned int i, nr; - - nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); - spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); - if (!spaces) - return -ENOMEM; - for (i = 0; i < nr; i++) { - space = spaces + i; - xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); - atomic_set(&space->i_mmap_writable, 0); - space->a_ops = &swap_aops; - /* swap cache doesn't use writeback related tags */ - mapping_set_no_writeback_tags(space); - } - nr_swapper_spaces[type] = nr; - swapper_spaces[type] = spaces; - - return 0; -} - -void exit_swap_address_space(unsigned int type) -{ - int i; - struct address_space *spaces = swapper_spaces[type]; - - for (i = 0; i < nr_swapper_spaces[type]; i++) - VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); - kvfree(spaces); - nr_swapper_spaces[type] = 0; - swapper_spaces[type] = NULL; -} - static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, unsigned long *end) { @@ -843,7 +845,7 @@ static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; -static int __init swap_init_sysfs(void) +static int __init swap_init(void) { int err; struct kobject *swap_kobj; @@ -858,11 +860,13 @@ static int __init swap_init_sysfs(void) pr_err("failed to register swap group\n"); goto delete_obj; } + /* Swap cache writeback is LRU based, no tags for it */ + mapping_set_no_writeback_tags(&swap_space); return 0; delete_obj: kobject_put(swap_kobj); return err; } -subsys_initcall(swap_init_sysfs); +subsys_initcall(swap_init); #endif diff --git a/mm/swap_table.h b/mm/swap_table.h new file mode 100644 index 000000000000..ea244a57a5b7 --- /dev/null +++ b/mm/swap_table.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_SWAP_TABLE_H +#define _MM_SWAP_TABLE_H + +#include 
<linux/rcupdate.h> +#include <linux/atomic.h> +#include "swap.h" + +/* A typical flat array in each cluster as swap table */ +struct swap_table { + atomic_long_t entries[SWAPFILE_CLUSTER]; +}; + +#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) + +/* + * A swap table entry represents the status of a swap slot on a swap + * (physical or virtual) device. The swap table in each cluster is a + * 1:1 map of the swap slots in this cluster. + * + * Each swap table entry could be a pointer (folio), a XA_VALUE + * (shadow), or NULL. + */ + +/* + * Helpers for casting one type of info into a swap table entry. + */ +static inline unsigned long null_to_swp_tb(void) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t)); + return 0; +} + +static inline unsigned long folio_to_swp_tb(struct folio *folio) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); + return (unsigned long)folio; +} + +static inline unsigned long shadow_swp_to_tb(void *shadow) +{ + BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != + BITS_PER_BYTE * sizeof(unsigned long)); + VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); + return (unsigned long)shadow; +} + +/* + * Helpers for swap table entry type checking. + */ +static inline bool swp_tb_is_null(unsigned long swp_tb) +{ + return !swp_tb; +} + +static inline bool swp_tb_is_folio(unsigned long swp_tb) +{ + return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); +} + +static inline bool swp_tb_is_shadow(unsigned long swp_tb) +{ + return xa_is_value((void *)swp_tb); +} + +/* + * Helpers for retrieving info from swap table. + */ +static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_folio(swp_tb)); + return (void *)swp_tb; +} + +static inline void *swp_tb_to_shadow(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); + return (void *)swp_tb; +} + +/* + * Helpers for accessing or modifying the swap table of a cluster, + * the swap cluster must be locked. + */ +static inline void __swap_table_set(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + atomic_long_t *table = rcu_dereference_protected(ci->table, true); + + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + atomic_long_set(&table[off], swp_tb); +} + +static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + atomic_long_t *table = rcu_dereference_protected(ci->table, true); + + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + /* Ordering is guaranteed by cluster lock, relax */ + return atomic_long_xchg_relaxed(&table[off], swp_tb); +} + +static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + atomic_long_t *table; + + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock)); + + return atomic_long_read(&table[off]); +} + +static inline unsigned long swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + atomic_long_t *table; + unsigned long swp_tb; + + rcu_read_lock(); + table = rcu_dereference(ci->table); + swp_tb = table ? 
atomic_long_read(&table[off]) : null_to_swp_tb(); + rcu_read_unlock(); + + return swp_tb; +} +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index b4f3cc712580..10760240a3a2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,6 +46,7 @@ #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> +#include "swap_table.h" #include "internal.h" #include "swap.h" @@ -58,9 +59,9 @@ static void swap_entries_free(struct swap_info_struct *si, static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset); -static inline void unlock_cluster(struct swap_cluster_info *ci); +static void move_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, struct list_head *list, + enum swap_cluster_flags new_flags); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -105,7 +106,9 @@ static PLIST_HEAD(swap_active_head); static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; + +static struct kmem_cache *swap_table_cachep; static DEFINE_MUTEX(swapon_mutex); @@ -127,14 +130,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .lock = INIT_LOCAL_LOCK(), }; -static struct swap_info_struct *swap_type_to_swap_info(int type) +/* May return NULL on invalid type, caller must check for NULL return */ +static struct swap_info_struct *swap_type_to_info(int type) { if (type >= MAX_SWAPFILES) return NULL; - return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } +/* May return NULL on invalid entry, caller must check for NULL return */ +static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) +{ + return swap_type_to_info(swp_type(entry)); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -212,16 +221,15 @@ static bool swap_is_last_map(struct swap_info_struct *si, static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { - swp_entry_t entry = swp_entry(si->type, offset); - struct address_space *address_space = swap_address_space(entry); + const swp_entry_t entry = swp_entry(si->type, offset); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; bool need_reclaim; again: - folio = filemap_get_folio(address_space, swap_cache_index(entry)); - if (IS_ERR(folio)) + folio = swap_cache_get_folio(entry); + if (!folio) return 0; nr_pages = folio_nr_pages(folio); @@ -241,13 +249,12 @@ again: * Offset could point to the middle of a large folio, or folio * may no longer point to the expected offset before it's locked. */ - entry = folio->swap; - if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { + if (!folio_matches_swap_entry(folio, entry)) { folio_unlock(folio); folio_put(folio); goto again; } - offset = swp_offset(entry); + offset = swp_offset(folio->swap); need_reclaim = ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || @@ -260,13 +267,13 @@ again: * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. 
*/ - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); need_reclaim = swap_only_has_cache(si, offset, nr_pages); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_set_dirty(folio); ret = nr_pages; out_unlock: @@ -347,7 +354,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) sector_t swap_folio_sector(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; @@ -387,19 +394,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -#ifdef CONFIG_THP_SWAP -#define SWAPFILE_CLUSTER HPAGE_PMD_NR - -#define swap_entry_order(order) (order) -#else -#define SWAPFILE_CLUSTER 256 - -/* - * Define swap_entry_order() as constant to let compiler to optimize - * out some code if !CONFIG_THP_SWAP - */ -#define swap_entry_order(order) 0 -#endif #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) @@ -412,10 +406,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info) return info->flags == CLUSTER_FLAG_DISCARD; } +static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci) +{ + return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock)); +} + static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) { if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) return false; + if (!cluster_table_is_alloced(ci)) + return false; if (!order) return true; return cluster_is_empty(ci) || order == ci->order; @@ -427,32 +428,126 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } -static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - return &si->cluster_info[offset / SWAPFILE_CLUSTER]; -} - static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset) +static struct swap_table *swap_table_alloc(gfp_t gfp) { - struct swap_cluster_info *ci; + struct folio *folio; - ci = offset_to_cluster(si, offset); - spin_lock(&ci->lock); + if (!SWP_TABLE_USE_PAGE) + return kmem_cache_zalloc(swap_table_cachep, gfp); - return ci; + folio = folio_alloc(gfp | __GFP_ZERO, 0); + if (folio) + return folio_address(folio); + return NULL; +} + +static void swap_table_free_folio_rcu_cb(struct rcu_head *head) +{ + struct folio *folio; + + folio = page_folio(container_of(head, struct page, rcu_head)); + folio_put(folio); +} + +static void swap_table_free(struct swap_table *table) +{ + if (!SWP_TABLE_USE_PAGE) { + kmem_cache_free(swap_table_cachep, table); + return; + } + + call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head), + swap_table_free_folio_rcu_cb); +} + +static void swap_cluster_free_table(struct swap_cluster_info *ci) +{ + unsigned int ci_off; + struct swap_table *table; + + /* Only empty cluster's table is allow to be freed */ + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(!cluster_is_empty(ci)); + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) + VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off))); + table = (void *)rcu_dereference_protected(ci->table, true); + rcu_assign_pointer(ci->table, 
NULL); + + swap_table_free(table); } -static inline void unlock_cluster(struct swap_cluster_info *ci) +/* + * Allocate swap table for one cluster. Attempt an atomic allocation first, + * then fallback to sleeping allocation. + */ +static struct swap_cluster_info * +swap_cluster_alloc_table(struct swap_info_struct *si, + struct swap_cluster_info *ci) { + struct swap_table *table; + + /* + * Only cluster isolation from the allocator does table allocation. + * Swap allocator uses percpu clusters and holds the local lock. + */ + lockdep_assert_held(&ci->lock); + lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); + + /* The cluster must be free and was just isolated from the free list. */ + VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); + + table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (table) { + rcu_assign_pointer(ci->table, table); + return ci; + } + + /* + * Try a sleep allocation. Each isolated free cluster may cause + * a sleep allocation, but there is a limited number of them, so + * the potential recursive allocation is limited. + */ spin_unlock(&ci->lock); + if (!(si->flags & SWP_SOLIDSTATE)) + spin_unlock(&si->global_cluster_lock); + local_unlock(&percpu_swap_cluster.lock); + + table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); + + /* + * Back to atomic context. We might have migrated to a new CPU with a + * usable percpu cluster. But just keep using the isolated cluster to + * make things easier. Migration indicates a slight change of workload + * so using a new free cluster might not be a bad idea, and the worst + * could happen with ignoring the percpu cluster is fragmentation, + * which is acceptable since this fallback and race is rare. + */ + local_lock(&percpu_swap_cluster.lock); + if (!(si->flags & SWP_SOLIDSTATE)) + spin_lock(&si->global_cluster_lock); + spin_lock(&ci->lock); + + /* Nothing except this helper should touch a dangling empty cluster. */ + if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { + if (table) + swap_table_free(table); + return ci; + } + + if (!table) { + move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); + spin_unlock(&ci->lock); + return NULL; + } + + rcu_assign_pointer(ci->table, table); + return ci; } static void move_cluster(struct swap_info_struct *si, @@ -470,11 +565,6 @@ static void move_cluster(struct swap_info_struct *si, else list_move_tail(&ci->list, list); spin_unlock(&si->lock); - - if (ci->flags == CLUSTER_FLAG_FRAG) - atomic_long_dec(&si->frag_cluster_nr[ci->order]); - else if (new_flags == CLUSTER_FLAG_FRAG) - atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } @@ -489,7 +579,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - lockdep_assert_held(&ci->lock); + swap_cluster_free_table(ci); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } @@ -504,15 +594,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info * this returns NULL for an non-empty list. 
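swap_cluster_alloc_table() above is a classic drop-retake-revalidate shape: try a non-sleeping allocation under the cluster and per-CPU locks, and only if that fails drop every lock, allocate with GFP_KERNEL, re-take the locks and check whether a table got installed in the meantime (in which case the fresh one is thrown away). A standalone model of that shape with pthreads and malloc; the single mutex and the names are simplified stand-ins for the real lock nesting.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *installed;			/* the "table", protected by @lock */

/* Pretend non-sleeping allocation that can fail under pressure. */
static void *alloc_atomic(size_t size)
{
	(void)size;
	return NULL;			/* force the slow path for the demo */
}

static void *get_or_install(size_t size)
{
	void *new;

	pthread_mutex_lock(&lock);
	if (installed)
		goto out;

	new = alloc_atomic(size);
	if (new) {
		installed = new;
		goto out;
	}

	/* Drop the lock for a blocking allocation... */
	pthread_mutex_unlock(&lock);
	new = malloc(size);
	pthread_mutex_lock(&lock);

	/* ...then revalidate: another thread may have beaten us to it. */
	if (installed) {
		free(new);
		goto out;
	}
	installed = new;		/* stays NULL if malloc failed */
out:
	pthread_mutex_unlock(&lock);
	return installed;
}

int main(void)
{
	return get_or_install(4096) ? 0 : 1;
}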
*/ static struct swap_cluster_info *isolate_lock_cluster( - struct swap_info_struct *si, struct list_head *list) + struct swap_info_struct *si, struct list_head *list, int order) { - struct swap_cluster_info *ci, *ret = NULL; + struct swap_cluster_info *ci, *found = NULL; spin_lock(&si->lock); - - if (unlikely(!(si->flags & SWP_WRITEOK))) - goto out; - list_for_each_entry(ci, list, list) { if (!spin_trylock(&ci->lock)) continue; @@ -524,13 +610,19 @@ static struct swap_cluster_info *isolate_lock_cluster( list_del(&ci->list); ci->flags = CLUSTER_FLAG_NONE; - ret = ci; + found = ci; break; } -out: spin_unlock(&si->lock); - return ret; + if (found && !cluster_table_is_alloced(found)) { + /* Only an empty free cluster's swap table can be freed. */ + VM_WARN_ON_ONCE(list != &si->free_clusters); + VM_WARN_ON_ONCE(!cluster_is_empty(found)); + return swap_cluster_alloc_table(si, found); + } + + return found; } /* @@ -663,17 +755,27 @@ static void relocate_cluster(struct swap_info_struct *si, * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization. */ -static void inc_cluster_info_page(struct swap_info_struct *si, +static int inc_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; + struct swap_table *table; struct swap_cluster_info *ci; ci = cluster_info + idx; + if (!ci->table) { + table = swap_table_alloc(GFP_KERNEL); + if (!table) + return -ENOMEM; + rcu_assign_pointer(ci->table, table); + } + ci->count++; VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); VM_BUG_ON(ci->flags); + + return 0; } static bool cluster_reclaim_range(struct swap_info_struct *si, @@ -742,6 +844,26 @@ static bool cluster_scan_range(struct swap_info_struct *si, return true; } +/* + * Currently, the swap table is not used for count tracking, just + * do a sanity check here to ensure nothing leaked, so the swap + * table should be empty upon freeing. 
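swap_cluster_assert_table_empty(), whose body follows, is that "nothing leaked" check: it walks the slot range only when CONFIG_DEBUG_VM is enabled, so production builds pay nothing for it. A standalone model of the same compile-time-gated range check; DEBUG_CHECKS is a stand-in for the config option.

#include <assert.h>

#ifndef DEBUG_CHECKS
#define DEBUG_CHECKS 0	/* stand-in for CONFIG_DEBUG_VM; build with -DDEBUG_CHECKS=1 to enable */
#endif

/* Complain if any slot in [start, start + nr) is still in use. */
static void assert_range_empty(const unsigned long *table,
			       unsigned int start, unsigned int nr)
{
	unsigned int i = start, end = start + nr;

	if (!DEBUG_CHECKS)
		return;		/* constant condition: the loop below is optimized away */

	do {
		assert(table[i] == 0);
	} while (++i < end);
}

int main(void)
{
	unsigned long table[8] = { 0 };

	assert_range_empty(table, 2, 4);
	return 0;
}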
+ */ +static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, + unsigned int start, unsigned int nr) +{ + unsigned int ci_off = start % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr; + unsigned long swp_tb; + + if (IS_ENABLED(CONFIG_DEBUG_VM)) { + do { + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + } while (++ci_off < ci_end); + } +} + static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) @@ -761,6 +883,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster ci->order = order; memset(si->swap_map + start, usage, nr_pages); + swap_cluster_assert_table_empty(ci, start, nr_pages); swap_range_alloc(si, nr_pages); ci->count += nr_pages; @@ -815,7 +938,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, } out: relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (si->flags & SWP_SOLIDSTATE) { this_cpu_write(percpu_swap_cluster.offset[order], next); this_cpu_write(percpu_swap_cluster.si[order], si); @@ -825,6 +948,29 @@ out: return found; } +static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, + struct list_head *list, + unsigned int order, + unsigned char usage, + bool scan_all) +{ + unsigned int found = SWAP_ENTRY_INVALID; + + do { + struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); + unsigned long offset; + + if (!ci) + break; + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + if (found) + break; + } while (scan_all); + + return found; +} + static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; @@ -836,7 +982,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { + while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; @@ -859,7 +1005,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (ci->flags == CLUSTER_FLAG_NONE) relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (to_scan <= 0) break; } @@ -898,7 +1044,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (offset == SWAP_ENTRY_INVALID) goto new_cluster; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) @@ -906,53 +1052,53 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o found = alloc_swap_scan_cluster(si, ci, offset, order, usage); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } if (found) goto done; } new_cluster: - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); + /* + * If the device need discard, prefer new cluster over nonfull + * to spread out the writes. 
+ */ + if (si->flags & SWP_PAGE_DISCARD) { + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); + if (found) + goto done; + } + + if (order < PMD_ORDER) { + found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], + order, usage, true); + if (found) + goto done; + } + + if (!(si->flags & SWP_PAGE_DISCARD)) { + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); if (found) goto done; } - /* Try reclaim from full clusters if free clusters list is drained */ + /* Try reclaiming full clusters if the free and nonfull lists are drained */ if (vm_swap_full()) swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0, frags_existing; - - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - /* Clusters failed to allocate are moved to frag_clusters */ - frags++; - } - - frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); - while (frags < frags_existing && - (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { - atomic_long_dec(&si->frag_cluster_nr[order]); - /* - * Rotate the frag list to iterate, they were all - * failing high order allocation or moved here due to - * per-CPU usage, but they could contain newly released - * reclaimable (eg. lazy-freed swap cache) slots. - */ - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - frags++; - } + /* + * Scanning only one fragment cluster is good enough. Order 0 + * allocation will surely succeed, and large allocation + * failure is not critical. Scanning one cluster still + * keeps the list rotated and reclaimed (for HAS_CACHE). + */ + found = alloc_swap_scan_list(si, &si->frag_clusters[order], order, + usage, false); + if (found) + goto done; } /* @@ -971,24 +1117,20 @@ new_cluster: * Clusters here have at least one usable slot and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user.
*/ - while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - atomic_long_dec(&si->frag_cluster_nr[o]); - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->frag_clusters[o], + 0, usage, true); + if (found) + goto done; - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], + 0, usage, true); + if (found) + goto done; } done: if (!(si->flags & SWP_SOLIDSTATE)) spin_unlock(&si->global_cluster_lock); + return found; } @@ -1145,7 +1287,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - clear_shadow_from_swap_cache(si->type, begin, end); + __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1192,7 +1334,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (!si || !offset || !get_swap_device_info(si)) return false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); @@ -1200,7 +1342,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (found) *entry = swp_entry(si->type, found); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } put_swap_device(si); @@ -1302,16 +1444,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (!entry.val) return -ENOMEM; - /* - * XArray node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. 
- */ - if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) - goto out_free; + swap_cache_add_folio(folio, entry, NULL); return 0; @@ -1327,7 +1460,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) @@ -1442,7 +1575,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) @@ -1468,14 +1601,14 @@ static void swap_entries_put_cache(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, nr)) + ci = swap_cluster_lock(si, offset); + if (swap_only_has_cache(si, offset, nr)) { swap_entries_free(si, ci, entry, nr); - else { + } else { for (int i = 0; i < nr; i++, entry.val++) swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } - unlock_cluster(ci); + swap_cluster_unlock(ci); } static bool swap_entries_put_map(struct swap_info_struct *si, @@ -1493,7 +1626,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si, if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { goto locked_fallback; } @@ -1502,21 +1635,20 @@ static bool swap_entries_put_map(struct swap_info_struct *si, else for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; fallback: - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); locked_fallback: for (i = 0; i < nr; i++, entry.val++) { count = swap_entry_put_locked(si, ci, entry, 1); if (count == SWAP_HAS_CACHE) has_cache = true; } - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; - } /* @@ -1566,7 +1698,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1578,6 +1710,7 @@ static void swap_entries_free(struct swap_info_struct *si, mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + swap_cluster_assert_table_empty(ci, offset, nr_pages); if (!ci->count) free_cluster(si, ci); @@ -1624,7 +1757,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); @@ -1641,9 +1774,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster(ci); + swap_cluster_unlock(ci); return !!count; } @@ -1666,7 +1799,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1689,7 
+1822,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return count; } @@ -1704,7 +1837,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; @@ -1717,7 +1850,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return ret; } @@ -1781,7 +1914,7 @@ bool folio_free_swap(struct folio *folio) if (folio_swapped(folio)) return false; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_set_dirty(folio); return true; } @@ -1855,7 +1988,7 @@ out: swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; swp_entry_t entry = {0}; @@ -1865,7 +1998,13 @@ swp_entry_t get_swap_page_of_type(int type) /* This is called for allocating swap entry, not cache */ if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { + /* + * Grab the local lock to be compliant + * with swap table allocation. + */ + local_lock(&percpu_swap_cluster.lock); offset = cluster_alloc_swap_entry(si, 0, 1); + local_unlock(&percpu_swap_cluster.lock); if (offset) { entry = swp_entry(si->type, offset); atomic_long_dec(&nr_swap_pages); @@ -1936,7 +2075,7 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) @@ -1992,6 +2131,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, bool hwpoisoned = false; int ret = 1; + /* + * If the folio is removed from swap cache by others, continue to + * unuse other PTEs. try_to_unuse may try again if we missed this one.
+ */ + if (!folio_matches_swap_entry(folio, entry)) + return 0; + swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) @@ -2118,7 +2264,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); pte = NULL; - folio = swap_cache_get_folio(entry, vma, addr); + folio = swap_cache_get_folio(entry); if (!folio) { struct vm_fault vmf = { .vma = vma, @@ -2243,6 +2389,8 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); + if (check_stable_address_space(mm)) + goto unlock; for_each_vma(vmi, vma) { if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); @@ -2252,6 +2400,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) cond_resched(); } +unlock: mmap_read_unlock(mm); return ret; } @@ -2344,8 +2493,8 @@ retry: (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); - if (IS_ERR(folio)) + folio = swap_cache_get_folio(entry); + if (!folio) continue; /* @@ -2644,9 +2793,30 @@ static void wait_for_allocation(struct swap_info_struct *si) BUG_ON(si->flags & SWP_WRITEOK); for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { - ci = lock_cluster(si, offset); - unlock_cluster(ci); + ci = swap_cluster_lock(si, offset); + swap_cluster_unlock(ci); + } +} + +static void free_cluster_info(struct swap_cluster_info *cluster_info, + unsigned long maxpages) +{ + struct swap_cluster_info *ci; + int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + + if (!cluster_info) + return; + for (i = 0; i < nr_clusters; i++) { + ci = cluster_info + i; + /* Cluster with bad marks count will have a remaining table */ + spin_lock(&ci->lock); + if (rcu_dereference_protected(ci->table, true)) { + ci->count = 0; + swap_cluster_free_table(ci); + } + spin_unlock(&ci->lock); } + kvfree(cluster_info); } /* @@ -2681,6 +2851,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; + unsigned int maxpages; int err, found = 0; if (!capable(CAP_SYS_ADMIN)) @@ -2783,12 +2954,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; + maxpages = p->max; cluster_info = p->cluster_info; + p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -2799,10 +2971,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); + free_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); - exit_swap_address_space(p->type); inode = mapping->host; @@ -2858,7 +3029,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; (si = swap_type_to_swap_info(type)); type++) { + for (type = 0; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2879,7 +3050,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) type = si->type + 1; ++(*pos); - for (; (si = swap_type_to_swap_info(type)); type++) { + for (; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; @@ -3166,21 +3337,14 @@ static int setup_swap_map(struct 
swap_info_struct *si, return 0; } -#define SWAP_CLUSTER_INFO_COLS \ - DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) -#define SWAP_CLUSTER_SPACE_COLS \ - DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) -#define SWAP_CLUSTER_COLS \ - max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) - static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; - unsigned long i, j, idx; int err = -ENOMEM; + unsigned long i; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) @@ -3206,16 +3370,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. */ - inc_cluster_info_page(si, cluster_info, 0); + err = inc_cluster_info_page(si, cluster_info, 0); + if (err) + goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr >= maxpages) continue; - inc_cluster_info_page(si, cluster_info, page_nr); + err = inc_cluster_info_page(si, cluster_info, page_nr); + if (err) + goto err; + } + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { + err = inc_cluster_info_page(si, cluster_info, i); + if (err) + goto err; } - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) - inc_cluster_info_page(si, cluster_info, i); INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); @@ -3224,34 +3395,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - atomic_long_set(&si->frag_cluster_nr[i], 0); } - /* - * Reduce false cache line sharing between cluster_info and - * sharing same address space. 
- */ - for (j = 0; j < SWAP_CLUSTER_COLS; j++) { - for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { - struct swap_cluster_info *ci; - idx = i * SWAP_CLUSTER_COLS + j; - ci = cluster_info + idx; - if (idx >= nr_clusters) - continue; - if (ci->count) { - ci->flags = CLUSTER_FLAG_NONFULL; - list_add_tail(&ci->list, &si->nonfull_clusters[0]); - continue; - } + for (i = 0; i < nr_clusters; i++) { + struct swap_cluster_info *ci = &cluster_info[i]; + + if (ci->count) { + ci->flags = CLUSTER_FLAG_NONFULL; + list_add_tail(&ci->list, &si->nonfull_clusters[0]); + } else { ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } } return cluster_info; - err_free: - kvfree(cluster_info); + free_cluster_info(cluster_info, maxpages); err: return ERR_PTR(err); } @@ -3445,13 +3605,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = init_swap_address_space(si->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - error = zswap_swapon(si->type, maxpages); if (error) - goto free_swap_address_space; + goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this @@ -3486,8 +3642,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto out; free_swap_zswap: zswap_swapoff(si->type); -free_swap_address_space: - exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: @@ -3502,7 +3656,8 @@ bad_swap: spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); + if (cluster_info) + free_cluster_info(cluster_info, maxpages); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) @@ -3553,7 +3708,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) unsigned char has_cache; int err, i; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { pr_err("%s%08lx\n", Bad_file, entry.val); return -EINVAL; @@ -3562,7 +3717,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3617,7 +3772,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return err; } @@ -3668,11 +3823,6 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) swap_entries_put_cache(si, entry, nr); } -struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return swap_type_to_swap_info(swp_type(entry)); -} - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's @@ -3716,7 +3866,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); @@ -3776,7 +3926,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) out_unlock_cont: spin_unlock(&si->cont_lock); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); put_swap_device(si); outer: if (page) @@ -3950,6 +4100,16 @@ static int __init swapfile_init(void) swapfile_maximum_size = arch_max_swapfile_size(); + /* + * Once a cluster is freed, it's swap table content is read + * only, and all 
swap cache readers (swap_cache_*) verifies + * the content before use. So it's safe to use RCU slab here. + */ + if (!SWP_TABLE_USE_PAGE) + swap_table_cachep = kmem_cache_create("swap_table", + sizeof(struct swap_table), + 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); + #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 45e6290e2e8b..af61b95c89e4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1026,18 +1026,64 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); } -static int move_present_pte(struct mm_struct *mm, - struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, - unsigned long dst_addr, unsigned long src_addr, - pte_t *dst_pte, pte_t *src_pte, - pte_t orig_dst_pte, pte_t orig_src_pte, - pmd_t *dst_pmd, pmd_t dst_pmdval, - spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio *src_folio) +/* + * Checks if the two ptes and the corresponding folio are eligible for batched + * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL. + * + * NOTE: folio's reference is not required as the whole operation is within + * PTL's critical section. + */ +static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, + unsigned long src_addr, + pte_t *src_pte, pte_t *dst_pte, + struct anon_vma *src_anon_vma) +{ + pte_t orig_dst_pte, orig_src_pte; + struct folio *folio; + + orig_dst_pte = ptep_get(dst_pte); + if (!pte_none(orig_dst_pte)) + return NULL; + + orig_src_pte = ptep_get(src_pte); + if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte))) + return NULL; + + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); + if (!folio || !folio_trylock(folio)) + return NULL; + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) || + folio_anon_vma(folio) != src_anon_vma) { + folio_unlock(folio); + return NULL; + } + return folio; +} + +/* + * Moves src folios to dst in a batch as long as they share the same + * anon_vma as the first folio, are not large, and can successfully + * take the lock via folio_trylock(). + */ +static long move_present_ptes(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio **first_src_folio, unsigned long len, + struct anon_vma *src_anon_vma) { int err = 0; + struct folio *src_folio = *first_src_folio; + unsigned long src_start = src_addr; + unsigned long src_end; + len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; + src_end = pmd_addr_end(src_addr, src_addr + len); + flush_cache_range(src_vma, src_addr, src_end); double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1051,31 +1097,56 @@ static int move_present_pte(struct mm_struct *mm, err = -EBUSY; goto out; } + /* It's safe to drop the reference now as the page-table is holding one. */ + folio_put(*first_src_folio); + *first_src_folio = NULL; + arch_enter_lazy_mmu_mode(); + + while (true) { + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); + /* Folio got pinned from under us. Put it back and fail the move. 
*/ + if (folio_maybe_dma_pinned(src_folio)) { + set_pte_at(mm, src_addr, src_pte, orig_src_pte); + err = -EBUSY; + break; + } - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); - /* Folio got pinned from under us. Put it back and fail the move. */ - if (folio_maybe_dma_pinned(src_folio)) { - set_pte_at(mm, src_addr, src_pte, orig_src_pte); - err = -EBUSY; - goto out; - } - - folio_move_anon_rmap(src_folio, dst_vma); - src_folio->index = linear_page_index(dst_vma, dst_addr); + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); - orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); - /* Set soft dirty bit so userspace can notice the pte was moved */ + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); + /* Set soft dirty bit so userspace can notice the pte was moved */ #ifdef CONFIG_MEM_SOFT_DIRTY - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); #endif - if (pte_dirty(orig_src_pte)) - orig_dst_pte = pte_mkdirty(orig_dst_pte); - orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); + if (pte_dirty(orig_src_pte)) + orig_dst_pte = pte_mkdirty(orig_dst_pte); + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); + + src_addr += PAGE_SIZE; + if (src_addr == src_end) + break; + dst_addr += PAGE_SIZE; + dst_pte++; + src_pte++; - set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); + folio_unlock(src_folio); + src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, + dst_pte, src_anon_vma); + if (!src_folio) + break; + } + + arch_leave_lazy_mmu_mode(); + if (src_addr > src_start) + flush_tlb_range(src_vma, src_start, src_addr); + + if (src_folio) + folio_unlock(src_folio); out: double_pt_unlock(dst_ptl, src_ptl); - return err; + return src_addr > src_start ? src_addr - src_start : err; } static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1140,7 +1211,7 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); - return 0; + return PAGE_SIZE; } static int move_zeropage_pte(struct mm_struct *mm, @@ -1167,20 +1238,20 @@ static int move_zeropage_pte(struct mm_struct *mm, set_pte_at(mm, dst_addr, dst_pte, zero_pte); double_pt_unlock(dst_ptl, src_ptl); - return 0; + return PAGE_SIZE; } /* - * The mmap_lock for reading is held by the caller. Just move the page - * from src_pmd to dst_pmd if possible, and return true if succeeded - * in moving the page. + * The mmap_lock for reading is held by the caller. Just move the page(s) + * from src_pmd to dst_pmd if possible, and return number of bytes moved. + * On failure, an error code is returned. 
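The rewritten comment above documents the new convention: the PTE-move helpers return the number of bytes moved on success, or a negative error code on failure, so the caller can advance by a variable step. A minimal, self-contained sketch of a caller loop under that convention follows; the function and type names are illustrative and not part of the kernel, and a real caller would bound the retries.

#include <errno.h>

/* Hypothetical step: moves up to len bytes and returns how many bytes were
 * actually moved, or a negative errno value on failure. */
typedef long (*move_step_fn)(unsigned long src, unsigned long dst,
			     unsigned long len);

static long move_range(move_step_fn step, unsigned long src,
		       unsigned long dst, unsigned long len)
{
	unsigned long done = 0;

	while (done < len) {
		long ret = step(src + done, dst + done, len - done);

		if (ret == -EAGAIN)	/* transient failure: retry this chunk */
			continue;
		if (ret <= 0)		/* hard error or no progress: stop */
			return done ? (long)done : ret;
		done += (unsigned long)ret;	/* advance by the bytes moved */
	}
	return (long)done;
}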
*/ -static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, - struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, - unsigned long dst_addr, unsigned long src_addr, - __u64 mode) +static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + unsigned long len, __u64 mode) { swp_entry_t entry; struct swap_info_struct *si = NULL; @@ -1194,11 +1265,10 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; - int err = 0; + long ret = 0; - flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, - src_addr, src_addr + PAGE_SIZE); + src_addr, src_addr + len); mmu_notifier_invalidate_range_start(&range); retry: /* @@ -1212,7 +1282,7 @@ retry: /* Retry if a huge pmd materialized from under us */ if (unlikely(!dst_pte)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } @@ -1231,14 +1301,14 @@ retry: * transparent huge pages under us. */ if (unlikely(!src_pte)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* Sanity checks before the operation */ if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { - err = -EINVAL; + ret = -EINVAL; goto out; } @@ -1246,7 +1316,7 @@ retry: orig_dst_pte = ptep_get(dst_pte); spin_unlock(dst_ptl); if (!pte_none(orig_dst_pte)) { - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1255,21 +1325,21 @@ retry: spin_unlock(src_ptl); if (pte_none(orig_src_pte)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) - err = -ENOENT; + ret = -ENOENT; else /* nothing to do to move a hole */ - err = 0; + ret = PAGE_SIZE; goto out; } /* If PTE changed after we locked the folio them start over */ if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } if (pte_present(orig_src_pte)) { if (is_zero_pfn(pte_pfn(orig_src_pte))) { - err = move_zeropage_pte(mm, dst_vma, src_vma, + ret = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl); @@ -1292,14 +1362,14 @@ retry: spin_lock(src_ptl); if (!pte_same(orig_src_pte, ptep_get(src_pte))) { spin_unlock(src_ptl); - err = -EAGAIN; + ret = -EAGAIN; goto out; } folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); if (!folio || !PageAnonExclusive(&folio->page)) { spin_unlock(src_ptl); - err = -EBUSY; + ret = -EBUSY; goto out; } @@ -1313,7 +1383,7 @@ retry: */ if (!locked && folio_test_large(folio)) { spin_unlock(src_ptl); - err = -EAGAIN; + ret = -EAGAIN; goto out; } @@ -1332,7 +1402,7 @@ retry: } if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { - err = -EBUSY; + ret = -EBUSY; goto out; } } @@ -1343,8 +1413,8 @@ retry: pte_unmap(src_pte); pte_unmap(dst_pte); src_pte = dst_pte = NULL; - err = split_folio(src_folio); - if (err) + ret = split_folio(src_folio); + if (ret) goto out; /* have to reacquire the folio after it got split */ folio_unlock(src_folio); @@ -1362,7 +1432,7 @@ retry: src_anon_vma = folio_get_anon_vma(src_folio); if (!src_anon_vma) { /* page was unmapped from under us */ - err = -EAGAIN; + ret = -EAGAIN; goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { @@ -1375,10 +1445,11 @@ retry: } } - err = move_present_pte(mm, dst_vma, src_vma, - dst_addr, src_addr, dst_pte, 
src_pte, - orig_dst_pte, orig_src_pte, dst_pmd, - dst_pmdval, dst_ptl, src_ptl, src_folio); + ret = move_present_ptes(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl, &src_folio, + len, src_anon_vma); } else { struct folio *folio = NULL; @@ -1389,20 +1460,20 @@ retry: pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); - err = -EAGAIN; + ret = -EAGAIN; } else - err = -EFAULT; + ret = -EFAULT; goto out; } if (!pte_swp_exclusive(orig_src_pte)) { - err = -EBUSY; + ret = -EBUSY; goto out; } si = get_swap_device(entry); if (unlikely(!si)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* @@ -1418,11 +1489,10 @@ retry: * separately to allow proper handling. */ if (!src_folio) - folio = filemap_get_folio(swap_address_space(entry), - swap_cache_index(entry)); - if (!IS_ERR_OR_NULL(folio)) { + folio = swap_cache_get_folio(entry); + if (folio) { if (folio_test_large(folio)) { - err = -EBUSY; + ret = -EBUSY; folio_put(folio); goto out; } @@ -1439,7 +1509,7 @@ retry: goto retry; } } - err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, src_folio, si, entry); } @@ -1453,15 +1523,20 @@ out: folio_unlock(src_folio); folio_put(src_folio); } - if (dst_pte) - pte_unmap(dst_pte); + /* + * Unmap in reverse order (LIFO) to maintain proper kmap_local + * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte + * first, then src_pte, so we must unmap src_pte first, then dst_pte. + */ if (src_pte) pte_unmap(src_pte); + if (dst_pte) + pte_unmap(dst_pte); mmu_notifier_invalidate_range_end(&range); if (si) put_swap_device(si); - return err; + return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1732,7 +1807,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, { struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; - unsigned long src_addr, dst_addr; + unsigned long src_addr, dst_addr, src_end; pmd_t *src_pmd, *dst_pmd; long err = -EINVAL; ssize_t moved = 0; @@ -1775,8 +1850,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, if (err) goto out_unlock; - for (src_addr = src_start, dst_addr = dst_start; - src_addr < src_start + len;) { + for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len; + src_addr < src_end;) { spinlock_t *ptl; pmd_t dst_pmdval; unsigned long step_size; @@ -1844,6 +1919,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, dst_addr, src_addr); step_size = HPAGE_PMD_SIZE; } else { + long ret; + if (pmd_none(*src_pmd)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { err = -ENOENT; @@ -1860,10 +1937,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, break; } - err = move_pages_pte(mm, dst_pmd, src_pmd, - dst_vma, src_vma, - dst_addr, src_addr, mode); - step_size = PAGE_SIZE; + ret = move_pages_ptes(mm, dst_pmd, src_pmd, + dst_vma, src_vma, dst_addr, + src_addr, src_end - src_addr, mode); + if (ret < 0) + err = ret; + else + step_size = ret; } cond_resched(); diff --git a/mm/util.c b/mm/util.c index f814e6a59ab1..6c1d64ed0221 100644 --- a/mm/util.c +++ b/mm/util.c @@ -315,7 +315,7 @@ void *memdup_user_nul(const void __user *src, size_t len) EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ -int vma_is_stack_for_current(struct 
vm_area_struct *vma) +int vma_is_stack_for_current(const struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; @@ -410,7 +410,7 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -static int mmap_is_legacy(struct rlimit *rlim_stack) +static int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; @@ -431,7 +431,7 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) +static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* @@ -462,7 +462,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) #endif } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } #endif #ifdef CONFIG_MMU @@ -504,7 +504,7 @@ EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout); * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, - struct task_struct *task, bool bypass_rlim) + const struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; @@ -688,7 +688,7 @@ struct anon_vma *folio_anon_vma(const struct folio *folio) * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ -struct address_space *folio_mapping(struct folio *folio) +struct address_space *folio_mapping(const struct folio *folio) { struct address_space *mapping; @@ -926,7 +926,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; unsigned long bytes_failed; @@ -1134,16 +1134,50 @@ EXPORT_SYMBOL(flush_dcache_folio); #endif /** + * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * for details. This is the same operation, only with a specific file operations + * struct which may or may not be the same as vma->vm_file->f_op. + * @f_op: The file operations whose .mmap_prepare() hook is specified. + * @file: The file which backs or will back the mapping. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * Returns: 0 on success or error. 
+ */ +int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; + int err; + + err = f_op->mmap_prepare(&desc); + if (err) + return err; + set_vma_from_desc(vma, &desc); + + return 0; +} +EXPORT_SYMBOL(__compat_vma_mmap_prepare); + +/** * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA - * @file: The file which possesss an f_op->mmap_prepare() hook + * existing VMA. + * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * 'wrapper' file systems invoke a nested mmap hook of an underlying file. + * stacked filesystems invoke a nested mmap hook of an underlying file. * * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these 'wrapper' filesystems using the + * conservative and continue to invoke these stacked filesystems using the * deprecated .mmap() hook. * * However we have a problem if the underlying file system possesses an @@ -1161,15 +1195,7 @@ EXPORT_SYMBOL(flush_dcache_folio); */ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc; - int err; - - err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); - if (err) - return err; - set_vma_from_desc(vma, &desc); - - return 0; + return __compat_vma_mmap_prepare(file->f_op, file, vma); } EXPORT_SYMBOL(compat_vma_mmap_prepare); @@ -1281,3 +1307,39 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); } #endif /* CONFIG_MMU */ + +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/** + * page_range_contiguous - test whether the page range is contiguous + * @page: the start of the page range. + * @nr_pages: the number of pages in the range. + * + * Test whether the page range is contiguous, such that they can be iterated + * naively, corresponding to iterating a contiguous PFN range. + * + * This function should primarily only be used for debug checks, or when + * working with page ranges that are not naturally contiguous (e.g., pages + * within a folio are). + * + * Returns true if contiguous, otherwise false. + */ +bool page_range_contiguous(const struct page *page, unsigned long nr_pages) +{ + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + + /* + * The memmap is allocated per memory section, so no need to check + * within the first section. However, we need to check each other + * spanned memory section once, making sure the first page in a + * section could similarly be reached by just iterating pages. 
+ */ + for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); + pfn < end_pfn; pfn += PAGES_PER_SECTION) + if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn))) + return false; + return true; +} +EXPORT_SYMBOL(page_range_contiguous); +#endif @@ -2572,11 +2572,12 @@ static int call_mmap_prepare(struct mmap_state *map) int err; struct vm_area_desc desc = { .mm = map->mm, + .file = map->file, .start = map->addr, .end = map->end, .pgoff = map->pgoff, - .file = map->file, + .vm_file = map->file, .vm_flags = map->vm_flags, .page_prot = map->page_prot, }; @@ -2588,7 +2589,7 @@ static int call_mmap_prepare(struct mmap_state *map) /* Update fields permitted to be changed. */ map->pgoff = desc.pgoff; - map->file = desc.file; + map->file = desc.vm_file; map->vm_flags = desc.vm_flags; map->page_prot = desc.page_prot; /* User-defined fields. */ @@ -145,7 +145,7 @@ struct vma_merge_struct { */ bool __remove_middle :1; /* - * Internal flag used during the merge operationr to indicate we will + * Internal flag used during the merge operation to indicate we will * remove vmg->next. */ bool __remove_next :1; @@ -222,31 +222,11 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } - /* - * Temporary helper functions for file systems which wrap an invocation of + * Temporary helper function for stacked mmap handlers which specify * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). */ - -static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, - struct vm_area_desc *desc) -{ - desc->mm = vma->vm_mm; - desc->start = vma->vm_start; - desc->end = vma->vm_end; - - desc->pgoff = vma->vm_pgoff; - desc->file = vma->vm_file; - desc->vm_flags = vma->vm_flags; - desc->page_prot = vma->vm_page_prot; - - desc->vm_ops = NULL; - desc->private_data = NULL; - - return desc; -} - static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { @@ -258,9 +238,9 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma, /* Mutable fields. Populated with initial state. */ vma->vm_pgoff = desc->pgoff; - if (vma->vm_file != desc->file) - vma_set_file(vma, desc->file); - if (vma->vm_flags != desc->vm_flags) + if (desc->vm_file != vma->vm_file) + vma_set_file(vma, desc->vm_file); + if (desc->vm_flags != vma->vm_flags) vm_flags_set(vma, desc->vm_flags); vma->vm_page_prot = desc->page_prot; diff --git a/mm/vma_init.c b/mm/vma_init.c index 8e53c7943561..3c0b65950510 100644 --- a/mm/vma_init.c +++ b/mm/vma_init.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared + * Functions for initializing, allocating, freeing and duplicating VMAs. Shared * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. */ @@ -16,6 +16,7 @@ void __init vma_state_init(void) struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + .sheaf_capacity = 32, }; vm_area_cachep = kmem_cache_create("vm_area_struct", diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae1..798b2ed21e46 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2026,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); + /* Only reclaim behaviour flags are relevant. 
*/ + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; might_sleep(); /* @@ -2038,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, */ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -2057,6 +2057,12 @@ retry: addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); + + /* + * This is not a fast path. Check if yielding is needed. This + * is the only reschedule point in the vmalloc() path. + */ + cond_resched(); } trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); @@ -2089,7 +2095,7 @@ retry: BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); - ret = kasan_populate_vmalloc(addr, size); + ret = kasan_populate_vmalloc(addr, size, gfp_mask); if (ret) { free_vmap_area(va); return ERR_PTR(ret); @@ -3622,7 +3628,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, pages + nr_allocated); nr_allocated += nr; - cond_resched(); /* * If zero or pages were obtained partly, @@ -3664,7 +3669,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; - cond_resched(); nr_allocated += 1U << order; } @@ -4089,19 +4093,29 @@ void *vzalloc_node_noprof(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node_noprof); /** - * vrealloc - reallocate virtually contiguous memory; contents remain unchanged + * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents + * remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: requested alignment * @flags: the flags for the page level allocator + * @nid: node number of the target node + * + * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * - * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and - * @p is not a %NULL pointer, the object pointed to is freed. + * If the caller wants the new memory to be on specific node *only*, + * __GFP_THISNODE flag should be set, otherwise the function will try to avoid + * reallocation and possibly disregard the specified @nid. * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * + * Requesting an alignment that is bigger than the alignment of the existing + * allocation will fail. + * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. 
* @@ -4111,7 +4125,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); * Return: pointer to the allocated memory; %NULL if @size is zero or in case of * failure */ -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { struct vm_struct *vm = NULL; size_t alloced_size = 0; @@ -4135,6 +4150,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) if (WARN(alloced_size < old_size, "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) return NULL; + if (WARN(!IS_ALIGNED((unsigned long)p, align), + "will not reallocate with a bigger alignment (0x%lx)\n", align)) + return NULL; + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(vmalloc_to_page(p))) + goto need_realloc; } /* @@ -4165,8 +4186,10 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) return (void *)p; } +need_realloc: /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ - n = __vmalloc_noprof(size, flags); + n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0)); + if (!n) return NULL; @@ -4826,7 +4849,7 @@ retry: /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { - if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) + if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL)) goto err_free_shadow; } @@ -5177,7 +5200,7 @@ static void vmap_init_nodes(void) int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { - vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); + vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT); if (vn) { /* Node partition is 16 pages. */ vmap_zone_size = (1 << 4) * PAGE_SIZE; diff --git a/mm/vmscan.c b/mm/vmscan.c index a48aec8bfd92..b2fc8b626d3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -398,14 +398,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); - /* - * If there are no reclaimable file-backed or anonymous pages, - * ensure zones with sufficient free pages are not skipped. - * This prevents zones like DMA32 from being ignored in reclaim - * scenarios where they can still help alleviate memory pressure. - */ - if (nr == 0) - nr = zone_page_state_snapshot(zone, NR_FREE_PAGES); + return nr; } @@ -525,7 +518,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; /* @@ -737,13 +730,18 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, { int refcount; void *shadow = NULL; + struct swap_cluster_info *ci; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + ci = swap_cluster_get_and_lock_irq(folio); + } else { spin_lock(&mapping->host->i_lock); - xa_lock_irq(&mapping->i_pages); + xa_lock_irq(&mapping->i_pages); + } + /* * The non racy check for a busy folio. 
* @@ -783,9 +781,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __delete_from_swap_cache(folio, swap, shadow); + __swap_cache_del_folio(ci, folio, swap, shadow); memcg1_swapout(folio, swap); - xa_unlock_irq(&mapping->i_pages); + swap_cluster_unlock_irq(ci); put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -823,9 +821,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, return 1; cannot_free: - xa_unlock_irq(&mapping->i_pages); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + swap_cluster_unlock_irq(ci); + } else { + xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); + } return 0; } @@ -888,11 +889,11 @@ static bool lru_gen_set_refs(struct folio *folio) { /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return false; } - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); return true; } #else @@ -3257,13 +3258,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return -1; } @@ -3274,7 +3275,7 @@ static int folio_update_gen(struct folio *folio, int gen) new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -3285,7 +3286,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); @@ -3302,7 +3303,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); @@ -4507,7 +4508,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (!folio_test_lru(folio) || zone > sc->reclaim_idx) { + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4553,7 +4554,7 @@ static bool isolate_folio(struct lruvec 
*lruvec, struct folio *folio, struct sca /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); @@ -4766,7 +4767,7 @@ retry: /* don't add rejected folios to the oldest generation */ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); @@ -5100,7 +5101,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); } /****************************************************************************** @@ -5561,6 +5562,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (memcg_id != mem_cgroup_id(memcg)) goto done; + sc->target_mem_cgroup = memcg; lruvec = get_lruvec(memcg, nid); if (swappiness < MIN_SWAPPINESS) @@ -5597,6 +5599,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, .may_swap = true, .reclaim_idx = MAX_NR_ZONES - 1, .gfp_mask = GFP_KERNEL, + .proactive = true, }; buf = kvmalloc(len + 1, GFP_KERNEL); @@ -6177,7 +6180,7 @@ again: * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } @@ -6489,11 +6492,11 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) int i; bool wmark_ok; - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { - if (!zone_reclaimable_pages(zone)) + if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES)) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -6899,7 +6902,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { @@ -7167,7 +7170,7 @@ restart: } if (!sc.nr_reclaimed) - pgdat->kswapd_failures++; + atomic_inc(&pgdat->kswapd_failures); out: clear_reclaim_active(pgdat, highest_zoneidx); @@ -7426,7 +7429,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; /* Hopeless node, leave it to direct reclaim if possible */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 71cd1ceba191..bb09c032eecf 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1280,6 +1280,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING [I(PGPROMOTE_SUCCESS)] = "pgpromote_success", [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate", + [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl", #endif [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd", [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", @@ -1289,6 +1290,7 @@ const char * const vmstat_text[] = { [I(NR_HUGETLB)] = 
"nr_hugetlb", #endif [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", + [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages", #undef I /* system-wide enum vm_stat_item counters */ @@ -1846,7 +1848,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n node_unreclaimable: %u" "\n start_pfn: %lu", - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, + atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn); seq_putc(m, '\n'); } diff --git a/mm/workingset.c b/mm/workingset.c index 6e7f4cb1b9a7..68a76a91111f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else - set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } diff --git a/mm/zpdesc.h b/mm/zpdesc.h index 25bf5ea0beb8..b8258dc78548 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* zpdesc.h: zswap.zpool memory descriptor +/* zpdesc.h: zsmalloc pool memory descriptor * * Written by Alex Shi <alexs@kernel.org> * Hyeonggon Yoo <42.hyeyoo@gmail.com> @@ -11,14 +11,14 @@ #include <linux/pagemap.h> /* - * struct zpdesc - Memory descriptor for zpool memory. + * struct zpdesc - Memory descriptor for zsmalloc pool memory. * @flags: Page flags, mostly unused by zsmalloc. * @lru: Indirectly used by page migration. * @movable_ops: Used by page migration. - * @next: Next zpdesc in a zspage in zsmalloc zpool. - * @handle: For huge zspage in zsmalloc zpool. + * @next: Next zpdesc in a zspage in zsmalloc pool. + * @handle: For huge zspage in zsmalloc pool. * @zspage: Points to the zspage this zpdesc is a part of. - * @first_obj_offset: First object offset in zsmalloc zpool. + * @first_obj_offset: First object offset in zsmalloc pool. * @_refcount: The number of references to this zpdesc. * * This struct overlays struct page for now. Do not modify without a good @@ -79,8 +79,8 @@ static_assert(sizeof(struct zpdesc) <= sizeof(struct page)); * zpdesc_folio - The folio allocated for a zpdesc * @zp: The zpdesc. * - * Zpdescs are descriptors for zpool memory. The zpool memory itself is - * allocated as folios that contain the zpool objects, and zpdesc uses specific + * Zpdescs are descriptors for zsmalloc memory. The memory itself is allocated + * as folios that contain the zsmalloc objects, and zpdesc uses specific * fields in the first struct page of the folio - those fields are now accessed * by struct zpdesc. * diff --git a/mm/zpool.c b/mm/zpool.c deleted file mode 100644 index 0a71d03369f1..000000000000 --- a/mm/zpool.c +++ /dev/null @@ -1,328 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * zpool memory storage api - * - * Copyright (C) 2014 Dan Streetman - * - * This is a common frontend for memory storage pool implementations. - * Typically, this is used to store compressed memory. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/list.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/zpool.h> - -struct zpool { - struct zpool_driver *driver; - void *pool; -}; - -static LIST_HEAD(drivers_head); -static DEFINE_SPINLOCK(drivers_lock); - -/** - * zpool_register_driver() - register a zpool implementation. 
- * @driver: driver to register - */ -void zpool_register_driver(struct zpool_driver *driver) -{ - spin_lock(&drivers_lock); - atomic_set(&driver->refcount, 0); - list_add(&driver->list, &drivers_head); - spin_unlock(&drivers_lock); -} -EXPORT_SYMBOL(zpool_register_driver); - -/** - * zpool_unregister_driver() - unregister a zpool implementation. - * @driver: driver to unregister. - * - * Module usage counting is used to prevent using a driver - * while/after unloading, so if this is called from module - * exit function, this should never fail; if called from - * other than the module exit function, and this returns - * failure, the driver is in use and must remain available. - */ -int zpool_unregister_driver(struct zpool_driver *driver) -{ - int ret = 0, refcount; - - spin_lock(&drivers_lock); - refcount = atomic_read(&driver->refcount); - WARN_ON(refcount < 0); - if (refcount > 0) - ret = -EBUSY; - else - list_del(&driver->list); - spin_unlock(&drivers_lock); - - return ret; -} -EXPORT_SYMBOL(zpool_unregister_driver); - -/* this assumes @type is null-terminated. */ -static struct zpool_driver *zpool_get_driver(const char *type) -{ - struct zpool_driver *driver; - - spin_lock(&drivers_lock); - list_for_each_entry(driver, &drivers_head, list) { - if (!strcmp(driver->type, type)) { - bool got = try_module_get(driver->owner); - - if (got) - atomic_inc(&driver->refcount); - spin_unlock(&drivers_lock); - return got ? driver : NULL; - } - } - - spin_unlock(&drivers_lock); - return NULL; -} - -static void zpool_put_driver(struct zpool_driver *driver) -{ - atomic_dec(&driver->refcount); - module_put(driver->owner); -} - -/** - * zpool_has_pool() - Check if the pool driver is available - * @type: The type of the zpool to check (e.g. zsmalloc) - * - * This checks if the @type pool driver is available. This will try to load - * the requested module, if needed, but there is no guarantee the module will - * still be loaded and available immediately after calling. If this returns - * true, the caller should assume the pool is available, but must be prepared - * to handle the @zpool_create_pool() returning failure. However if this - * returns false, the caller should assume the requested pool type is not - * available; either the requested pool type module does not exist, or could - * not be loaded, and calling @zpool_create_pool() with the pool type will - * fail. - * - * The @type string must be null-terminated. - * - * Returns: true if @type pool is available, false if not - */ -bool zpool_has_pool(char *type) -{ - struct zpool_driver *driver = zpool_get_driver(type); - - if (!driver) { - request_module("zpool-%s", type); - driver = zpool_get_driver(type); - } - - if (!driver) - return false; - - zpool_put_driver(driver); - return true; -} -EXPORT_SYMBOL(zpool_has_pool); - -/** - * zpool_create_pool() - Create a new zpool - * @type: The type of the zpool to create (e.g. zsmalloc) - * @name: The name of the zpool (e.g. zram0, zswap) - * @gfp: The GFP flags to use when allocating the pool. - * - * This creates a new zpool of the specified type. The gfp flags will be - * used when allocating memory, if the implementation supports it. If the - * ops param is NULL, then the created zpool will not be evictable. - * - * Implementations must guarantee this to be thread-safe. - * - * The @type and @name strings must be null-terminated. - * - * Returns: New zpool on success, NULL on failure. 
- */ -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp) -{ - struct zpool_driver *driver; - struct zpool *zpool; - - pr_debug("creating pool type %s\n", type); - - driver = zpool_get_driver(type); - - if (!driver) { - request_module("zpool-%s", type); - driver = zpool_get_driver(type); - } - - if (!driver) { - pr_err("no driver for type %s\n", type); - return NULL; - } - - zpool = kmalloc(sizeof(*zpool), gfp); - if (!zpool) { - pr_err("couldn't create zpool - out of memory\n"); - zpool_put_driver(driver); - return NULL; - } - - zpool->driver = driver; - zpool->pool = driver->create(name, gfp); - - if (!zpool->pool) { - pr_err("couldn't create %s pool\n", type); - zpool_put_driver(driver); - kfree(zpool); - return NULL; - } - - pr_debug("created pool type %s\n", type); - - return zpool; -} - -/** - * zpool_destroy_pool() - Destroy a zpool - * @zpool: The zpool to destroy. - * - * Implementations must guarantee this to be thread-safe, - * however only when destroying different pools. The same - * pool should only be destroyed once, and should not be used - * after it is destroyed. - * - * This destroys an existing zpool. The zpool should not be in use. - */ -void zpool_destroy_pool(struct zpool *zpool) -{ - pr_debug("destroying pool type %s\n", zpool->driver->type); - - zpool->driver->destroy(zpool->pool); - zpool_put_driver(zpool->driver); - kfree(zpool); -} - -/** - * zpool_get_type() - Get the type of the zpool - * @zpool: The zpool to check - * - * This returns the type of the pool. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: The type of zpool. - */ -const char *zpool_get_type(struct zpool *zpool) -{ - return zpool->driver->type; -} - -/** - * zpool_malloc() - Allocate memory - * @zpool: The zpool to allocate from. - * @size: The amount of memory to allocate. - * @gfp: The GFP flags to use when allocating memory. - * @handle: Pointer to the handle to set - * @nid: The preferred node id. - * - * This allocates the requested amount of memory from the pool. - * The gfp flags will be used when allocating memory, if the - * implementation supports it. The provided @handle will be - * set to the allocated object handle. The allocation will - * prefer the NUMA node specified by @nid. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: 0 on success, negative value on error. - */ -int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid) -{ - return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); -} - -/** - * zpool_free() - Free previously allocated memory - * @zpool: The zpool that allocated the memory. - * @handle: The handle to the memory to free. - * - * This frees previously allocated memory. This does not guarantee - * that the pool will actually free memory, only that the memory - * in the pool will become available for use by the pool. - * - * Implementations must guarantee this to be thread-safe, - * however only when freeing different handles. The same - * handle should only be freed once, and should not be used - * after freeing. - */ -void zpool_free(struct zpool *zpool, unsigned long handle) -{ - zpool->driver->free(zpool->pool, handle); -} - -/** - * zpool_obj_read_begin() - Start reading from a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @local_copy: A local buffer to use if needed. - * - * This starts a read operation of a previously allocated handle. 
The passed - * @local_copy buffer may be used if needed by copying the memory into. - * zpool_obj_read_end() MUST be called after the read is completed to undo any - * actions taken (e.g. release locks). - * - * Returns: A pointer to the handle memory to be read, if @local_copy is used, - * the returned pointer is @local_copy. - */ -void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, - void *local_copy) -{ - return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy); -} - -/** - * zpool_obj_read_end() - Finish reading from a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @handle_mem: The pointer returned by zpool_obj_read_begin() - * - * Finishes a read operation previously started by zpool_obj_read_begin(). - */ -void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, - void *handle_mem) -{ - zpool->driver->obj_read_end(zpool->pool, handle, handle_mem); -} - -/** - * zpool_obj_write() - Write to a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @handle_mem: The memory to copy from into the handle. - * @mem_len: The length of memory to be written. - * - */ -void zpool_obj_write(struct zpool *zpool, unsigned long handle, - void *handle_mem, size_t mem_len) -{ - zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len); -} - -/** - * zpool_get_total_pages() - The total size of the pool - * @zpool: The zpool to check - * - * This returns the total size in pages of the pool. - * - * Returns: Total size of the zpool in pages. - */ -u64 zpool_get_total_pages(struct zpool *zpool) -{ - return zpool->driver->total_pages(zpool->pool); -} - -MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); -MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 805a10b41266..5bf832f9c05c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -36,7 +36,6 @@ #include <linux/types.h> #include <linux/debugfs.h> #include <linux/zsmalloc.h> -#include <linux/zpool.h> #include <linux/fs.h> #include <linux/workqueue.h> #include "zpdesc.h" @@ -433,78 +432,6 @@ static void record_obj(unsigned long handle, unsigned long obj) *(unsigned long *)handle = obj; } -/* zpool driver */ - -#ifdef CONFIG_ZPOOL - -static void *zs_zpool_create(const char *name, gfp_t gfp) -{ - /* - * Ignore global gfp flags: zs_malloc() may be invoked from - * different contexts and its caller must provide a valid - * gfp mask. 
- */ - return zs_create_pool(name); -} - -static void zs_zpool_destroy(void *pool) -{ - zs_destroy_pool(pool); -} - -static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid) -{ - *handle = zs_malloc(pool, size, gfp, nid); - - if (IS_ERR_VALUE(*handle)) - return PTR_ERR((void *)*handle); - return 0; -} -static void zs_zpool_free(void *pool, unsigned long handle) -{ - zs_free(pool, handle); -} - -static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle, - void *local_copy) -{ - return zs_obj_read_begin(pool, handle, local_copy); -} - -static void zs_zpool_obj_read_end(void *pool, unsigned long handle, - void *handle_mem) -{ - zs_obj_read_end(pool, handle, handle_mem); -} - -static void zs_zpool_obj_write(void *pool, unsigned long handle, - void *handle_mem, size_t mem_len) -{ - zs_obj_write(pool, handle, handle_mem, mem_len); -} - -static u64 zs_zpool_total_pages(void *pool) -{ - return zs_get_total_pages(pool); -} - -static struct zpool_driver zs_zpool_driver = { - .type = "zsmalloc", - .owner = THIS_MODULE, - .create = zs_zpool_create, - .destroy = zs_zpool_destroy, - .malloc = zs_zpool_malloc, - .free = zs_zpool_free, - .obj_read_begin = zs_zpool_obj_read_begin, - .obj_read_end = zs_zpool_obj_read_end, - .obj_write = zs_zpool_obj_write, - .total_pages = zs_zpool_total_pages, -}; - -MODULE_ALIAS("zpool-zsmalloc"); -#endif /* CONFIG_ZPOOL */ - static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc) { return PagePrivate(zpdesc_page(zpdesc)); @@ -1746,7 +1673,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * instead. */ if (!zpdesc->zspage) - return MIGRATEPAGE_SUCCESS; + return 0; /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); @@ -1813,7 +1740,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, reset_zpdesc(zpdesc); zpdesc_put(zpdesc); - return MIGRATEPAGE_SUCCESS; + return 0; } static void zs_page_putback(struct page *page) @@ -2248,9 +2175,6 @@ static int __init zs_init(void) { int rc __maybe_unused; -#ifdef CONFIG_ZPOOL - zpool_register_driver(&zs_zpool_driver); -#endif #ifdef CONFIG_COMPACTION rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc); if (rc) @@ -2262,9 +2186,6 @@ static int __init zs_init(void) static void __exit zs_exit(void) { -#ifdef CONFIG_ZPOOL - zpool_unregister_driver(&zs_zpool_driver); -#endif #ifdef CONFIG_COMPACTION set_movable_ops(NULL, PGTY_zsmalloc); #endif diff --git a/mm/zswap.c b/mm/zswap.c index 3c0fd8a13718..c1af782e54ec 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -25,7 +25,6 @@ #include <linux/scatterlist.h> #include <linux/mempolicy.h> #include <linux/mempool.h> -#include <linux/zpool.h> #include <crypto/acompress.h> #include <linux/zswap.h> #include <linux/mm_types.h> @@ -35,6 +34,7 @@ #include <linux/pagemap.h> #include <linux/workqueue.h> #include <linux/list_lru.h> +#include <linux/zsmalloc.h> #include "swap.h" #include "internal.h" @@ -42,8 +42,10 @@ /********************************* * statistics **********************************/ -/* The number of compressed pages currently stored in zswap */ +/* The number of pages currently stored in zswap */ atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); +/* The number of incompressible pages currently stored in zswap */ +static atomic_long_t zswap_stored_incompressible_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -105,16 +107,6 @@ static const struct kernel_param_ops 
zswap_compressor_param_ops = { module_param_cb(compressor, &zswap_compressor_param_ops, &zswap_compressor, 0644); -/* Compressed storage zpool to use */ -static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; -static int zswap_zpool_param_set(const char *, const struct kernel_param *); -static const struct kernel_param_ops zswap_zpool_param_ops = { - .set = zswap_zpool_param_set, - .get = param_get_charp, - .free = param_free_charp, -}; -module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); - /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); @@ -159,7 +151,7 @@ struct crypto_acomp_ctx { * needs to be verified that it's still valid in the tree. */ struct zswap_pool { - struct zpool *zpool; + struct zs_pool *zs_pool; struct crypto_acomp_ctx __percpu *acomp_ctx; struct percpu_ref ref; struct list_head list; @@ -191,7 +183,7 @@ static struct shrinker *zswap_shrinker; * logic if referenced is unset. See comments in the shrinker * section for context. * pool - the zswap_pool the entry's data is in - * handle - zpool allocation handle that stores the compressed page data + * handle - zsmalloc allocation handle that stores the compressed page data * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */ @@ -212,7 +204,7 @@ static unsigned int nr_zswap_trees[MAX_SWAPFILES]; static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); -/* pool counter to provide unique names to zpool */ +/* pool counter to provide unique names to zsmalloc */ static atomic_t zswap_pools_count = ATOMIC_INIT(0); enum zswap_init_type { @@ -233,38 +225,31 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +/* One swap address space for each 64M swap space */ +#define ZSWAP_ADDRESS_SPACE_SHIFT 14 +#define ZSWAP_ADDRESS_SPACE_PAGES (1 << ZSWAP_ADDRESS_SPACE_SHIFT) static inline struct xarray *swap_zswap_tree(swp_entry_t swp) { return &zswap_trees[swp_type(swp)][swp_offset(swp) - >> SWAP_ADDRESS_SPACE_SHIFT]; + >> ZSWAP_ADDRESS_SPACE_SHIFT]; } -#define zswap_pool_debug(msg, p) \ - pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ - zpool_get_type((p)->zpool)) +#define zswap_pool_debug(msg, p) \ + pr_debug("%s pool %s\n", msg, (p)->tfm_name) /********************************* * pool functions **********************************/ static void __zswap_pool_empty(struct percpu_ref *ref); -static struct zswap_pool *zswap_pool_create(char *type, char *compressor) +static struct zswap_pool *zswap_pool_create(char *compressor) { struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; int ret, cpu; - if (!zswap_has_pool) { - /* if either are unset, pool initialization failed, and we - * need both params to be set correctly before trying to - * create a pool. 
- */ - if (!strcmp(type, ZSWAP_PARAM_UNSET)) - return NULL; - if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) - return NULL; - } + if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) @@ -272,12 +257,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); - pool->zpool = zpool_create_pool(type, name, gfp); - if (!pool->zpool) { - pr_err("%s zpool not available\n", type); + pool->zs_pool = zs_create_pool(name); + if (!pool->zs_pool) goto error; - } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); @@ -313,52 +295,29 @@ ref_fail: error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); - if (pool->zpool) - zpool_destroy_pool(pool->zpool); + if (pool->zs_pool) + zs_destroy_pool(pool->zs_pool); kfree(pool); return NULL; } static struct zswap_pool *__zswap_pool_create_fallback(void) { - bool has_comp, has_zpool; - - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, - CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { + if (!crypto_has_acomp(zswap_compressor, 0, 0) && + strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - } - if (!has_comp) { - pr_err("default compressor %s not available\n", - zswap_compressor); - param_free_charp(&zswap_compressor); - zswap_compressor = ZSWAP_PARAM_UNSET; - } - - has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, - CONFIG_ZSWAP_ZPOOL_DEFAULT)) { - pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; - has_zpool = zpool_has_pool(zswap_zpool_type); - } - if (!has_zpool) { - pr_err("default zpool %s not available\n", - zswap_zpool_type); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_PARAM_UNSET; } - if (!has_comp || !has_zpool) + /* Default compressor should be available. Kconfig bug? 
*/ + if (WARN_ON_ONCE(!crypto_has_acomp(zswap_compressor, 0, 0))) { + zswap_compressor = ZSWAP_PARAM_UNSET; return NULL; + } - return zswap_pool_create(zswap_zpool_type, zswap_compressor); + return zswap_pool_create(zswap_compressor); } static void zswap_pool_destroy(struct zswap_pool *pool) @@ -368,7 +327,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - zpool_destroy_pool(pool->zpool); + zs_destroy_pool(pool->zs_pool); kfree(pool); } @@ -460,7 +419,7 @@ static struct zswap_pool *zswap_pool_current_get(void) } /* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +static struct zswap_pool *zswap_pool_find_get(char *compressor) { struct zswap_pool *pool; @@ -469,8 +428,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; - if (strcmp(zpool_get_type(pool->zpool), type)) - continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_tryget(pool)) continue; @@ -497,7 +454,7 @@ unsigned long zswap_total_pages(void) rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - total += zpool_get_total_pages(pool->zpool); + total += zs_get_total_pages(pool->zs_pool); rcu_read_unlock(); return total; @@ -522,33 +479,22 @@ static bool zswap_check_limits(void) * param callbacks **********************************/ -static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) -{ - /* no change required */ - if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) - return false; - return true; -} - -/* val must be a null-terminated string */ -static int __zswap_param_set(const char *val, const struct kernel_param *kp, - char *type, char *compressor) +static int zswap_compressor_param_set(const char *val, const struct kernel_param *kp) { struct zswap_pool *pool, *put_pool = NULL; char *s = strstrip((char *)val); + bool create_pool = false; int ret = 0; - bool new_pool = false; mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: - /* if this is load-time (pre-init) param setting, - * don't create a pool; that's done during init. 
- */ + /* Handled in zswap_setup() */ ret = param_set_charp(s, kp); break; case ZSWAP_INIT_SUCCEED: - new_pool = zswap_pool_changed(s, kp); + if (!zswap_has_pool || strcmp(s, *(char **)kp->arg)) + create_pool = true; break; case ZSWAP_INIT_FAILED: pr_err("can't set param, initialization failed\n"); @@ -556,30 +502,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, } mutex_unlock(&zswap_init_lock); - /* no need to create a new pool, return directly */ - if (!new_pool) + if (!create_pool) return ret; - if (!type) { - if (!zpool_has_pool(s)) { - pr_err("zpool %s not available\n", s); - return -ENOENT; - } - type = s; - } else if (!compressor) { - if (!crypto_has_acomp(s, 0, 0)) { - pr_err("compressor %s not available\n", s); - return -ENOENT; - } - compressor = s; - } else { - WARN_ON(1); - return -EINVAL; + if (!crypto_has_acomp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; } spin_lock_bh(&zswap_pools_lock); - pool = zswap_pool_find_get(type, compressor); + pool = zswap_pool_find_get(s); if (pool) { zswap_pool_debug("using existing", pool); WARN_ON(pool == zswap_pool_current()); @@ -589,7 +522,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, spin_unlock_bh(&zswap_pools_lock); if (!pool) - pool = zswap_pool_create(type, compressor); + pool = zswap_pool_create(s); else { /* * Restore the initial ref dropped by percpu_ref_kill() @@ -614,7 +547,8 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, list_add_rcu(&pool->list, &zswap_pools); zswap_has_pool = true; } else if (pool) { - /* add the possibly pre-existing pool to the end of the pools + /* + * Add the possibly pre-existing pool to the end of the pools * list; if it's new (and empty) then it'll be removed and * destroyed by the put after we drop the lock */ @@ -624,18 +558,8 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, spin_unlock_bh(&zswap_pools_lock); - if (!zswap_has_pool && !pool) { - /* if initial pool creation failed, and this pool creation also - * failed, maybe both compressor and zpool params were bad. - * Allow changing this param, so pool creation will succeed - * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_acomp() - * checks above. - */ - ret = param_set_charp(s, kp); - } - - /* drop the ref from either the old current pool, + /* + * Drop the ref from either the old current pool, * or the new pool we failed to add */ if (put_pool) @@ -644,18 +568,6 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, return ret; } -static int zswap_compressor_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, zswap_zpool_type, NULL); -} - -static int zswap_zpool_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, NULL, zswap_compressor); -} - static int zswap_enabled_param_set(const char *val, const struct kernel_param *kp) { @@ -799,18 +711,20 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zpool allocation, + * Carries out the common pattern of freeing an entry's zsmalloc allocation, * freeing the entry itself, and decrementing the number of stored pages. 
*/ static void zswap_entry_free(struct zswap_entry *entry) { zswap_lru_del(&zswap_list_lru, entry); - zpool_free(entry->pool->zpool, entry->handle); + zs_free(entry->pool->zs_pool, entry->handle); zswap_pool_put(entry->pool); if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); } + if (entry->length == PAGE_SIZE) + atomic_long_dec(&zswap_stored_incompressible_pages); zswap_entry_cache_free(entry); atomic_long_dec(&zswap_stored_pages); } @@ -827,7 +741,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) u8 *buffer = NULL; int ret; - buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); if (!buffer) { ret = -ENOMEM; goto fail; @@ -945,21 +859,16 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, int comp_ret = 0, alloc_ret = 0; unsigned int dlen = PAGE_SIZE; unsigned long handle; - struct zpool *zpool; gfp_t gfp; u8 *dst; + bool mapped = false; acomp_ctx = acomp_ctx_get_cpu_lock(pool); dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); + sg_init_one(&output, dst, PAGE_SIZE); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* @@ -976,20 +885,41 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, */ comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (comp_ret) - goto unlock; - zpool = pool->zpool; + /* + * If a page cannot be compressed into a size smaller than PAGE_SIZE, + * save the content as is without a compression, to keep the LRU order + * of writebacks. If writeback is disabled, reject the page since it + * only adds metadata overhead. swap_writeout() will put the page back + * to the active LRU list in the case. + */ + if (comp_ret || !dlen || dlen >= PAGE_SIZE) { + dlen = PAGE_SIZE; + if (!mem_cgroup_zswap_writeback_enabled( + folio_memcg(page_folio(page)))) { + comp_ret = comp_ret ? 
comp_ret : -EINVAL; + goto unlock; + } + comp_ret = 0; + dlen = PAGE_SIZE; + dst = kmap_local_page(page); + mapped = true; + } + gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; - alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page)); - if (alloc_ret) + handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page)); + if (IS_ERR_VALUE(handle)) { + alloc_ret = PTR_ERR((void *)handle); goto unlock; + } - zpool_obj_write(zpool, handle, dst, dlen); + zs_obj_write(pool->zs_pool, handle, dst, dlen); entry->handle = handle; entry->length = dlen; unlock: + if (mapped) + kunmap_local(dst); if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) zswap_reject_compress_poor++; else if (comp_ret) @@ -1003,17 +933,23 @@ unlock: static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) { - struct zpool *zpool = entry->pool->zpool; + struct zswap_pool *pool = entry->pool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - int decomp_ret, dlen; + int decomp_ret = 0, dlen = PAGE_SIZE; u8 *src, *obj; - acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); - obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer); + acomp_ctx = acomp_ctx_get_cpu_lock(pool); + obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer); + + /* zswap entries of length PAGE_SIZE are not compressed. */ + if (entry->length == PAGE_SIZE) { + memcpy_to_folio(folio, 0, obj, entry->length); + goto read_done; + } /* - * zpool_obj_read_begin() might return a kmap address of highmem when + * zs_obj_read_begin() might return a kmap address of highmem when * acomp_ctx->buffer is not used. However, sg_init_one() does not * handle highmem addresses, so copy the object to acomp_ctx->buffer. */ @@ -1032,7 +968,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - zpool_obj_read_end(zpool, entry->handle, obj); +read_done: + zs_obj_read_end(pool->zs_pool, entry->handle, obj); acomp_ctx_put_unlock(acomp_ctx); if (!decomp_ret && dlen == PAGE_SIZE) @@ -1135,7 +1072,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, out: if (ret && ret != -EEXIST) { - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_unlock(folio); } folio_put(folio); @@ -1524,6 +1461,8 @@ static bool zswap_store_page(struct page *page, obj_cgroup_charge_zswap(objcg, entry->length); } atomic_long_inc(&zswap_stored_pages); + if (entry->length == PAGE_SIZE) + atomic_long_inc(&zswap_stored_incompressible_pages); /* * We finish initializing the entry while it's already in xarray. 
@@ -1547,7 +1486,7 @@ static bool zswap_store_page(struct page *page, return true; store_failed: - zpool_free(pool->zpool, entry->handle); + zs_free(pool->zs_pool, entry->handle); compress_failed: zswap_entry_cache_free(entry); return false; @@ -1738,7 +1677,7 @@ int zswap_swapon(int type, unsigned long nr_pages) struct xarray *trees, *tree; unsigned int nr, i; - nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + nr = DIV_ROUND_UP(nr_pages, ZSWAP_ADDRESS_SPACE_PAGES); trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); @@ -1792,6 +1731,14 @@ static int debugfs_get_stored_pages(void *data, u64 *val) } DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n"); +static int debugfs_get_stored_incompressible_pages(void *data, u64 *val) +{ + *val = atomic_long_read(&zswap_stored_incompressible_pages); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(stored_incompressible_pages_fops, + debugfs_get_stored_incompressible_pages, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1819,6 +1766,9 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_file("stored_pages", 0444, zswap_debugfs_root, NULL, &stored_pages_fops); + debugfs_create_file("stored_incompressible_pages", 0444, + zswap_debugfs_root, NULL, + &stored_incompressible_pages_fops); return 0; } @@ -1866,8 +1816,7 @@ static int zswap_setup(void) pool = __zswap_pool_create_fallback(); if (pool) { - pr_info("loaded using pool %s/%s\n", pool->tfm_name, - zpool_get_type(pool->zpool)); + pr_info("loaded using pool %s\n", pool->tfm_name); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; static_branch_enable(&zswap_ever_enabled);
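
The vmscan.c and vmstat.c hunks above convert pgdat->kswapd_failures from a plain integer into an atomic_t, so kswapd, direct reclaim and the /proc/zoneinfo reader can increment, reset and read it without coordinating through a lock. The following is a minimal userspace sketch of that pattern, not kernel code: the struct, the helper names and the sample MAX_RECLAIM_RETRIES value are illustrative assumptions, and C11 <stdatomic.h> stands in for the kernel's atomic_t API. The "was:" comments mark where the plain int accesses sat in the old code.

/*
 * Userspace analogy (not kernel code) of the kswapd_failures change:
 * a failure counter touched by several contexts is switched from a
 * plain int to an atomic so increments, resets and reads do not race.
 */
#include <stdatomic.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16	/* illustrative; plays the kernel constant's role */

struct node_state {
	atomic_int kswapd_failures;	/* was: int kswapd_failures; */
};

/* A reclaim pass made no progress. */
static void reclaim_failed(struct node_state *node)
{
	atomic_fetch_add(&node->kswapd_failures, 1);	/* was: kswapd_failures++ */
}

/* Some reclaim pass made progress: revive the node. */
static void reclaim_succeeded(struct node_state *node)
{
	atomic_store(&node->kswapd_failures, 0);	/* was: kswapd_failures = 0 */
}

/* Direct reclaim treats the node as hopeless after too many failures. */
static int node_is_hopeless(struct node_state *node)
{
	return atomic_load(&node->kswapd_failures) >= MAX_RECLAIM_RETRIES;
}

int main(void)
{
	struct node_state node;

	atomic_init(&node.kswapd_failures, 0);
	for (int i = 0; i < MAX_RECLAIM_RETRIES; i++)
		reclaim_failed(&node);
	printf("hopeless: %d\n", node_is_hopeless(&node));	/* 1 */

	reclaim_succeeded(&node);
	printf("hopeless: %d\n", node_is_hopeless(&node));	/* 0 */
	return 0;
}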
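
The zswap_compress() hunk above adds handling for incompressible pages: if the compressor fails or cannot shrink the page below PAGE_SIZE, zswap now stores the raw page at full length (and counts it in zswap_stored_incompressible_pages), unless writeback is disabled for the memcg, in which case the store is rejected. Below is a small standalone sketch of just that decision; the decide() helper, MY_PAGE_SIZE and the result struct are invented for this example and are not zswap APIs.

/*
 * Standalone sketch of the incompressible-page decision added to
 * zswap_compress() above. Names here (decide, MY_PAGE_SIZE,
 * struct store_decision) are invented for illustration.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

#define MY_PAGE_SIZE 4096	/* stand-in for PAGE_SIZE */

struct store_decision {
	bool store;		/* keep the object in the pool? */
	size_t stored_len;	/* MY_PAGE_SIZE means "stored uncompressed" */
};

static struct store_decision decide(int comp_ret, size_t dlen,
				    bool writeback_enabled)
{
	/* Compression succeeded and actually saved space: store compressed. */
	if (!comp_ret && dlen && dlen < MY_PAGE_SIZE)
		return (struct store_decision){ true, dlen };

	/*
	 * Incompressible (error, empty or expanded output): keep the raw
	 * page only if it can later be written back, preserving the LRU
	 * order of writebacks; otherwise storing it only adds metadata.
	 */
	if (writeback_enabled)
		return (struct store_decision){ true, MY_PAGE_SIZE };

	return (struct store_decision){ false, 0 };
}

int main(void)
{
	assert(decide(0, 1200, true).stored_len == 1200);		/* compressed */
	assert(decide(0, MY_PAGE_SIZE, true).stored_len == MY_PAGE_SIZE); /* as-is */
	assert(!decide(-22, 0, false).store);				/* rejected */
	return 0;
}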
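
The new ZSWAP_ADDRESS_SPACE_SHIFT above gives zswap its own tree granularity instead of reusing SWAP_ADDRESS_SPACE_SHIFT: with a shift of 14, each xarray covers 2^14 swap slots, which is 64 MiB of swap with 4 KiB pages, and zswap_swapon() sizes the tree array with DIV_ROUND_UP while swap_zswap_tree() picks a tree by shifting the swap offset. A tiny arithmetic sketch in userspace C, illustrative only; the sample device size and slot number are made up.

/*
 * Arithmetic behind the zswap-private address-space split: one xarray
 * per 2^ZSWAP_ADDRESS_SPACE_SHIFT swap slots. The device size below is
 * made up for the example.
 */
#include <stdio.h>

#define ZSWAP_ADDRESS_SPACE_SHIFT 14
#define ZSWAP_ADDRESS_SPACE_PAGES (1UL << ZSWAP_ADDRESS_SPACE_SHIFT)
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long nr_pages = 2097152;	/* an 8 GiB swap device with 4 KiB pages */
	unsigned long offset = 100000;		/* some swap slot on that device */

	/* zswap_swapon() sizes the tree array this way. */
	printf("trees: %lu\n", DIV_ROUND_UP(nr_pages, ZSWAP_ADDRESS_SPACE_PAGES)); /* 128 */

	/* swap_zswap_tree() picks a tree by shifting the offset. */
	printf("tree index for slot %lu: %lu\n", offset,
	       offset >> ZSWAP_ADDRESS_SPACE_SHIFT);	/* 6 */

	/* Each tree therefore covers 16384 slots, i.e. 64 MiB of 4 KiB pages. */
	return 0;
}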